Bug Summary

File: build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning: line 11424, column 9
Value stored to 'VectorizedTree' is never read

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name SLPVectorizer.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize -I include -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-09-04-125545-48738-1 -x c++ /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// is found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/Optional.h"
23#include "llvm/ADT/PostOrderIterator.h"
24#include "llvm/ADT/PriorityQueue.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SetOperations.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/SmallBitVector.h"
29#include "llvm/ADT/SmallPtrSet.h"
30#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/SmallString.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/iterator.h"
34#include "llvm/ADT/iterator_range.h"
35#include "llvm/Analysis/AliasAnalysis.h"
36#include "llvm/Analysis/AssumptionCache.h"
37#include "llvm/Analysis/CodeMetrics.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/LoopAccessAnalysis.h"
42#include "llvm/Analysis/LoopInfo.h"
43#include "llvm/Analysis/MemoryLocation.h"
44#include "llvm/Analysis/OptimizationRemarkEmitter.h"
45#include "llvm/Analysis/ScalarEvolution.h"
46#include "llvm/Analysis/ScalarEvolutionExpressions.h"
47#include "llvm/Analysis/TargetLibraryInfo.h"
48#include "llvm/Analysis/TargetTransformInfo.h"
49#include "llvm/Analysis/ValueTracking.h"
50#include "llvm/Analysis/VectorUtils.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
67#include "llvm/IR/PatternMatch.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/DOTGraphTraits.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/ErrorHandling.h"
83#include "llvm/Support/GraphWriter.h"
84#include "llvm/Support/InstructionCost.h"
85#include "llvm/Support/KnownBits.h"
86#include "llvm/Support/MathExtras.h"
87#include "llvm/Support/raw_ostream.h"
88#include "llvm/Transforms/Utils/InjectTLIMappings.h"
89#include "llvm/Transforms/Utils/Local.h"
90#include "llvm/Transforms/Utils/LoopUtils.h"
91#include "llvm/Transforms/Vectorize.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <set>
98#include <string>
99#include <tuple>
100#include <utility>
101#include <vector>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
113 cl::desc("Run the SLP vectorization passes"));
114
115static cl::opt<int>
116 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
117 cl::desc("Only vectorize if you gain more than this "
118 "number "));
119
120static cl::opt<bool>
121ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
122 cl::desc("Attempt to vectorize horizontal reductions"));
123
124static cl::opt<bool> ShouldStartVectorizeHorAtStore(
125 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
126 cl::desc(
127 "Attempt to vectorize horizontal reductions feeding into a store"));
128
129static cl::opt<int>
130MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
131 cl::desc("Attempt to vectorize for this register size in bits"));
132
133static cl::opt<unsigned>
134MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
135 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
136
137static cl::opt<int>
138MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
139 cl::desc("Maximum depth of the lookup for consecutive stores."));
140
141/// Limits the size of scheduling regions in a block.
142/// It avoids long compile times for _very_ large blocks where vector
143/// instructions are spread over a wide range.
144/// This limit is way higher than needed by real-world functions.
145static cl::opt<int>
146ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
147 cl::desc("Limit the size of the SLP scheduling region per block"));
148
149static cl::opt<int> MinVectorRegSizeOption(
150 "slp-min-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
153static cl::opt<unsigned> RecursionMaxDepth(
154 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
155 cl::desc("Limit the recursion depth when building a vectorizable tree"));
156
157static cl::opt<unsigned> MinTreeSize(
158 "slp-min-tree-size", cl::init(3), cl::Hidden,
159 cl::desc("Only vectorize small trees if they are fully vectorizable"));
160
161// The maximum depth that the look-ahead score heuristic will explore.
162// The higher this value, the higher the compilation time overhead.
163static cl::opt<int> LookAheadMaxDepth(
164 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
165 cl::desc("The maximum look-ahead depth for operand reordering scores"));
166
167// The maximum depth that the look-ahead score heuristic will explore
168// when it is probing among candidates for vectorization tree roots.
169// The higher this value, the higher the compilation time overhead, but unlike
170// the similar limit for operand ordering this one is used less frequently, so
171// the impact of a higher value is less noticeable.
172static cl::opt<int> RootLookAheadMaxDepth(
173 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for searching best rooting option"));
175
176static cl::opt<bool>
177 ViewSLPTree("view-slp-tree", cl::Hidden,
178 cl::desc("Display the SLP trees with Graphviz"));
179
180// Limit the number of alias checks. The limit is chosen so that
181// it has no negative effect on the llvm benchmarks.
182static const unsigned AliasedCheckLimit = 10;
183
184// Another limit for the alias checks: The maximum distance between load/store
185// instructions where alias checks are done.
186// This limit is useful for very large basic blocks.
187static const unsigned MaxMemDepDistance = 160;
188
189/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
190/// regions to be handled.
191static const int MinScheduleRegionSize = 16;
192
193/// Predicate for the element types that the SLP vectorizer supports.
194///
195/// The most important thing to filter here are types which are invalid in LLVM
196/// vectors. We also filter target specific types which have absolutely no
197/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
198/// avoids spending time checking the cost model and realizing that they will
199/// be inevitably scalarized.
200static bool isValidElementType(Type *Ty) {
201 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
202 !Ty->isPPC_FP128Ty();
203}
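// Editor's illustrative sketch (not part of the original SLPVectorizer.cpp):
// how the predicate above behaves on a few element types; the wrapper function
// and the `Ctx` parameter are hypothetical.
static void exampleIsValidElementType(LLVMContext &Ctx) {
  bool A = isValidElementType(Type::getInt32Ty(Ctx));     // true: plain integer
  bool B = isValidElementType(Type::getFloatTy(Ctx));     // true: plain float
  bool C = isValidElementType(Type::getX86_FP80Ty(Ctx));  // false: x86_fp80 is filtered out
  bool D = isValidElementType(Type::getPPC_FP128Ty(Ctx)); // false: ppc_fp128 is filtered out
  (void)A; (void)B; (void)C; (void)D;
}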
204
205/// \returns True if the value is a constant (but not globals/constant
206/// expressions).
207static bool isConstant(Value *V) {
208 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
209}
210
211/// Checks if \p V is one of vector-like instructions, i.e. undef,
212/// insertelement/extractelement with constant indices for fixed vector type or
213/// extractvalue instruction.
214static bool isVectorLikeInstWithConstOps(Value *V) {
215 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
216 !isa<ExtractValueInst, UndefValue>(V))
217 return false;
218 auto *I = dyn_cast<Instruction>(V);
219 if (!I || isa<ExtractValueInst>(I))
220 return true;
221 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
222 return false;
223 if (isa<ExtractElementInst>(I))
224 return isConstant(I->getOperand(1));
225 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
226 return isConstant(I->getOperand(2));
227}
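// Editor's illustrative note (not part of the original file): values this
// predicate accepts or rejects, written as LLVM IR:
//   %a = extractelement <4 x i32> %v, i32 1          ; constant index  -> true
//   %b = insertelement <4 x i32> %v, i32 %x, i32 0   ; constant index  -> true
//   %c = extractelement <4 x i32> %v, i32 %idx       ; variable index  -> false
//   undef                                            ; UndefValue      -> true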
228
229/// \returns true if all of the instructions in \p VL are in the same block or
230/// false otherwise.
231static bool allSameBlock(ArrayRef<Value *> VL) {
232 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
233 if (!I0)
234 return false;
235 if (all_of(VL, isVectorLikeInstWithConstOps))
236 return true;
237
238 BasicBlock *BB = I0->getParent();
239 for (int I = 1, E = VL.size(); I < E; I++) {
240 auto *II = dyn_cast<Instruction>(VL[I]);
241 if (!II)
242 return false;
243
244 if (BB != II->getParent())
245 return false;
246 }
247 return true;
248}
249
250/// \returns True if all of the values in \p VL are constants (but not
251/// globals/constant expressions).
252static bool allConstant(ArrayRef<Value *> VL) {
253 // Constant expressions and globals can't be vectorized like normal integer/FP
254 // constants.
255 return all_of(VL, isConstant);
256}
257
258/// \returns True if all of the values in \p VL are identical or some of them
259/// are UndefValue.
260static bool isSplat(ArrayRef<Value *> VL) {
261 Value *FirstNonUndef = nullptr;
262 for (Value *V : VL) {
263 if (isa<UndefValue>(V))
264 continue;
265 if (!FirstNonUndef) {
266 FirstNonUndef = V;
267 continue;
268 }
269 if (V != FirstNonUndef)
270 return false;
271 }
272 return FirstNonUndef != nullptr;
273}
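// Editor's illustrative note (not part of the original file): isSplat() treats
// undefs as wildcards. For distinct values %a and %b:
//   { %a, undef, %a } -> true   (one defined value)
//   { %a, %b, %a }    -> false  (two distinct defined values)
//   { undef, undef }  -> false  (no defined value at all)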
274
275/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
276static bool isCommutative(Instruction *I) {
277 if (auto *Cmp = dyn_cast<CmpInst>(I))
278 return Cmp->isCommutative();
279 if (auto *BO = dyn_cast<BinaryOperator>(I))
280 return BO->isCommutative();
281 // TODO: This should check for generic Instruction::isCommutative(), but
282 // we need to confirm that the caller code correctly handles Intrinsics
283 // for example (does not have 2 operands).
284 return false;
285}
286
287/// Checks if the given value is actually an undefined constant vector.
288static bool isUndefVector(const Value *V) {
289 if (isa<UndefValue>(V))
290 return true;
291 auto *C = dyn_cast<Constant>(V);
292 if (!C)
293 return false;
294 if (!C->containsUndefOrPoisonElement())
295 return false;
296 auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
297 if (!VecTy)
298 return false;
299 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
300 if (Constant *Elem = C->getAggregateElement(I))
301 if (!isa<UndefValue>(Elem))
302 return false;
303 }
304 return true;
305}
306
307/// Checks if the vector of instructions can be represented as a shuffle, like:
308/// %x0 = extractelement <4 x i8> %x, i32 0
309/// %x3 = extractelement <4 x i8> %x, i32 3
310/// %y1 = extractelement <4 x i8> %y, i32 1
311/// %y2 = extractelement <4 x i8> %y, i32 2
312/// %x0x0 = mul i8 %x0, %x0
313/// %x3x3 = mul i8 %x3, %x3
314/// %y1y1 = mul i8 %y1, %y1
315/// %y2y2 = mul i8 %y2, %y2
316/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
317/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
318/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
319/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
320/// ret <4 x i8> %ins4
321/// can be transformed into:
322/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
323/// i32 6>
324/// %2 = mul <4 x i8> %1, %1
325/// ret <4 x i8> %2
326/// We convert this initially to something like:
327/// %x0 = extractelement <4 x i8> %x, i32 0
328/// %x3 = extractelement <4 x i8> %x, i32 3
329/// %y1 = extractelement <4 x i8> %y, i32 1
330/// %y2 = extractelement <4 x i8> %y, i32 2
331/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
332/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
333/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
334/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
335/// %5 = mul <4 x i8> %4, %4
336/// %6 = extractelement <4 x i8> %5, i32 0
337/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
338/// %7 = extractelement <4 x i8> %5, i32 1
339/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
340/// %8 = extractelement <4 x i8> %5, i32 2
341/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
342/// %9 = extractelement <4 x i8> %5, i32 3
343/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
344/// ret <4 x i8> %ins4
345/// InstCombiner transforms this into a shuffle and vector mul
346/// Mask will return the Shuffle Mask equivalent to the extracted elements.
347/// TODO: Can we split off and reuse the shuffle mask detection from
348/// ShuffleVectorInst/getShuffleCost?
349static Optional<TargetTransformInfo::ShuffleKind>
350isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
351 const auto *It =
352 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
353 if (It == VL.end())
354 return None;
355 auto *EI0 = cast<ExtractElementInst>(*It);
356 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
357 return None;
358 unsigned Size =
359 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
360 Value *Vec1 = nullptr;
361 Value *Vec2 = nullptr;
362 enum ShuffleMode { Unknown, Select, Permute };
363 ShuffleMode CommonShuffleMode = Unknown;
364 Mask.assign(VL.size(), UndefMaskElem);
365 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
366 // Undef can be represented as an undef element in a vector.
367 if (isa<UndefValue>(VL[I]))
368 continue;
369 auto *EI = cast<ExtractElementInst>(VL[I]);
370 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
371 return None;
372 auto *Vec = EI->getVectorOperand();
373 // We can extractelement from undef or poison vector.
374 if (isUndefVector(Vec))
375 continue;
376 // All vector operands must have the same number of vector elements.
377 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
378 return None;
379 if (isa<UndefValue>(EI->getIndexOperand()))
380 continue;
381 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
382 if (!Idx)
383 return None;
384 // Undefined behavior if Idx is negative or >= Size.
385 if (Idx->getValue().uge(Size))
386 continue;
387 unsigned IntIdx = Idx->getValue().getZExtValue();
388 Mask[I] = IntIdx;
389 // For correct shuffling we have to have at most 2 different vector operands
390 // in all extractelement instructions.
391 if (!Vec1 || Vec1 == Vec) {
392 Vec1 = Vec;
393 } else if (!Vec2 || Vec2 == Vec) {
394 Vec2 = Vec;
395 Mask[I] += Size;
396 } else {
397 return None;
398 }
399 if (CommonShuffleMode == Permute)
400 continue;
401 // If the extract index is not the same as the operation number, it is a
402 // permutation.
403 if (IntIdx != I) {
404 CommonShuffleMode = Permute;
405 continue;
406 }
407 CommonShuffleMode = Select;
408 }
409 // If we're not crossing lanes in different vectors, consider it as blending.
410 if (CommonShuffleMode == Select && Vec2)
411 return TargetTransformInfo::SK_Select;
412 // If Vec2 was never used, we have a permutation of a single vector, otherwise
413 // we have permutation of 2 vectors.
414 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
415 : TargetTransformInfo::SK_PermuteSingleSrc;
416}
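// Editor's illustrative sketch (not part of the original file): a hypothetical
// call for four extractelements { x[0], x[1], y[2], y[3] } where %x and %y are
// both <4 x i8>. Each lane extracts its own index, so the mode is Select, and
// lanes taken from the second source are offset by the vector width:
//   SmallVector<int> Mask;
//   Optional<TargetTransformInfo::ShuffleKind> Kind =
//       isFixedVectorShuffle(VL, Mask);
//   // Kind == TargetTransformInfo::SK_Select, Mask == {0, 1, 6, 7}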
417
418namespace {
419
420/// Main data required for vectorization of instructions.
421struct InstructionsState {
422 /// The very first instruction in the list with the main opcode.
423 Value *OpValue = nullptr;
424
425 /// The main/alternate instruction.
426 Instruction *MainOp = nullptr;
427 Instruction *AltOp = nullptr;
428
429 /// The main/alternate opcodes for the list of instructions.
430 unsigned getOpcode() const {
431 return MainOp ? MainOp->getOpcode() : 0;
432 }
433
434 unsigned getAltOpcode() const {
435 return AltOp ? AltOp->getOpcode() : 0;
436 }
437
438 /// Some of the instructions in the list have alternate opcodes.
439 bool isAltShuffle() const { return AltOp != MainOp; }
440
441 bool isOpcodeOrAlt(Instruction *I) const {
442 unsigned CheckedOpcode = I->getOpcode();
443 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
444 }
445
446 InstructionsState() = delete;
447 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
448 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
449};
450
451} // end anonymous namespace
452
453/// Chooses the correct key for scheduling data. If \p Op has the same (or
454/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
455/// OpValue.
456static Value *isOneOf(const InstructionsState &S, Value *Op) {
457 auto *I = dyn_cast<Instruction>(Op);
458 if (I && S.isOpcodeOrAlt(I))
459 return Op;
460 return S.OpValue;
461}
462
463/// \returns true if \p Opcode is allowed as part of the main/alternate
464/// instruction for SLP vectorization.
465///
466/// Example of unsupported opcode is SDIV that can potentially cause UB if the
467/// "shuffled out" lane would result in division by zero.
468static bool isValidForAlternation(unsigned Opcode) {
469 if (Instruction::isIntDivRem(Opcode))
470 return false;
471
472 return true;
473}
474
475static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
476 unsigned BaseIndex = 0);
477
478/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
479/// compatible instructions or constants, or just some other regular values.
480static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
481 Value *Op1) {
482 return (isConstant(BaseOp0) && isConstant(Op0)) ||
483 (isConstant(BaseOp1) && isConstant(Op1)) ||
484 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
485 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
486 getSameOpcode({BaseOp0, Op0}).getOpcode() ||
487 getSameOpcode({BaseOp1, Op1}).getOpcode();
488}
489
490/// \returns analysis of the Instructions in \p VL described in
491/// InstructionsState, the Opcode that we suppose the whole list
492/// could be vectorized even if its structure is diverse.
493static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
494 unsigned BaseIndex) {
495 // Make sure these are all Instructions.
496 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
497 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
498
499 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
500 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
501 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
502 CmpInst::Predicate BasePred =
503 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
504 : CmpInst::BAD_ICMP_PREDICATE;
505 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
506 unsigned AltOpcode = Opcode;
507 unsigned AltIndex = BaseIndex;
508
509 // Check for one alternate opcode from another BinaryOperator.
510 // TODO - generalize to support all operators (types, calls etc.).
511 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
512 unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
513 if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
514 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
515 continue;
516 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
517 isValidForAlternation(Opcode)) {
518 AltOpcode = InstOpcode;
519 AltIndex = Cnt;
520 continue;
521 }
522 } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
523 Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
524 Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
525 if (Ty0 == Ty1) {
526 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
527 continue;
528 if (Opcode == AltOpcode) {
529 assert(isValidForAlternation(Opcode) &&
530 isValidForAlternation(InstOpcode) &&
531 "Cast isn't safe for alternation, logic needs to be updated!");
532 AltOpcode = InstOpcode;
533 AltIndex = Cnt;
534 continue;
535 }
536 }
537 } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
538 auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
539 auto *Inst = cast<Instruction>(VL[Cnt]);
540 Type *Ty0 = BaseInst->getOperand(0)->getType();
541 Type *Ty1 = Inst->getOperand(0)->getType();
542 if (Ty0 == Ty1) {
543 Value *BaseOp0 = BaseInst->getOperand(0);
544 Value *BaseOp1 = BaseInst->getOperand(1);
545 Value *Op0 = Inst->getOperand(0);
546 Value *Op1 = Inst->getOperand(1);
547 CmpInst::Predicate CurrentPred =
548 cast<CmpInst>(VL[Cnt])->getPredicate();
549 CmpInst::Predicate SwappedCurrentPred =
550 CmpInst::getSwappedPredicate(CurrentPred);
551 // Check for compatible operands. If the corresponding operands are not
552 // compatible - need to perform alternate vectorization.
553 if (InstOpcode == Opcode) {
554 if (BasePred == CurrentPred &&
555 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
556 continue;
557 if (BasePred == SwappedCurrentPred &&
558 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
559 continue;
560 if (E == 2 &&
561 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
562 continue;
563 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
564 CmpInst::Predicate AltPred = AltInst->getPredicate();
565 Value *AltOp0 = AltInst->getOperand(0);
566 Value *AltOp1 = AltInst->getOperand(1);
567 // Check if operands are compatible with alternate operands.
568 if (AltPred == CurrentPred &&
569 areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
570 continue;
571 if (AltPred == SwappedCurrentPred &&
572 areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
573 continue;
574 }
575 if (BaseIndex == AltIndex && BasePred != CurrentPred) {
576 assert(isValidForAlternation(Opcode) &&
577 isValidForAlternation(InstOpcode) &&
578 "Cast isn't safe for alternation, logic needs to be updated!");
579 AltIndex = Cnt;
580 continue;
581 }
582 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
583 CmpInst::Predicate AltPred = AltInst->getPredicate();
584 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
585 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
586 continue;
587 }
588 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
589 continue;
590 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
591 }
592
593 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
594 cast<Instruction>(VL[AltIndex]));
595}
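// Editor's illustrative note (not part of the original file): for a bundle
// VL = { add, sub, add, sub } the returned state has MainOp == the first add
// and AltOp == the first sub, so getOpcode() == Instruction::Add,
// getAltOpcode() == Instruction::Sub and isAltShuffle() == true. A bundle that
// mixes add and sdiv instead is rejected (null MainOp), because
// isValidForAlternation() disallows integer division/remainder.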
596
597/// \returns true if all of the values in \p VL have the same type or false
598/// otherwise.
599static bool allSameType(ArrayRef<Value *> VL) {
600 Type *Ty = VL[0]->getType();
601 for (int i = 1, e = VL.size(); i < e; i++)
602 if (VL[i]->getType() != Ty)
603 return false;
604
605 return true;
606}
607
608/// \returns True if Extract{Value,Element} instruction extracts element Idx.
609static Optional<unsigned> getExtractIndex(Instruction *E) {
610 unsigned Opcode = E->getOpcode();
611 assert((Opcode == Instruction::ExtractElement ||
612 Opcode == Instruction::ExtractValue) &&
613 "Expected extractelement or extractvalue instruction.");
614 if (Opcode == Instruction::ExtractElement) {
615 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
616 if (!CI)
617 return None;
618 return CI->getZExtValue();
619 }
620 ExtractValueInst *EI = cast<ExtractValueInst>(E);
621 if (EI->getNumIndices() != 1)
622 return None;
623 return *EI->idx_begin();
624}
625
626/// \returns True if in-tree use also needs extract. This refers to
627/// possible scalar operand in vectorized instruction.
628static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
629 TargetLibraryInfo *TLI) {
630 unsigned Opcode = UserInst->getOpcode();
631 switch (Opcode) {
632 case Instruction::Load: {
633 LoadInst *LI = cast<LoadInst>(UserInst);
634 return (LI->getPointerOperand() == Scalar);
635 }
636 case Instruction::Store: {
637 StoreInst *SI = cast<StoreInst>(UserInst);
638 return (SI->getPointerOperand() == Scalar);
639 }
640 case Instruction::Call: {
641 CallInst *CI = cast<CallInst>(UserInst);
642 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
643 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
644 if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
645 return (CI->getArgOperand(i) == Scalar);
646 }
647 [[fallthrough]];
648 }
649 default:
650 return false;
651 }
652}
653
654/// \returns the AA location that is being accessed by the instruction.
655static MemoryLocation getLocation(Instruction *I) {
656 if (StoreInst *SI = dyn_cast<StoreInst>(I))
657 return MemoryLocation::get(SI);
658 if (LoadInst *LI = dyn_cast<LoadInst>(I))
659 return MemoryLocation::get(LI);
660 return MemoryLocation();
661}
662
663/// \returns True if the instruction is not a volatile or atomic load/store.
664static bool isSimple(Instruction *I) {
665 if (LoadInst *LI = dyn_cast<LoadInst>(I))
666 return LI->isSimple();
667 if (StoreInst *SI = dyn_cast<StoreInst>(I))
668 return SI->isSimple();
669 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
670 return !MI->isVolatile();
671 return true;
672}
673
674/// Shuffles \p Mask in accordance with the given \p SubMask.
675static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
676 if (SubMask.empty())
677 return;
678 if (Mask.empty()) {
679 Mask.append(SubMask.begin(), SubMask.end());
680 return;
681 }
682 SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
683 int TermValue = std::min(Mask.size(), SubMask.size());
684 for (int I = 0, E = SubMask.size(); I < E; ++I) {
685 if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
686 Mask[SubMask[I]] >= TermValue)
687 continue;
688 NewMask[I] = Mask[SubMask[I]];
689 }
690 Mask.swap(NewMask);
691}
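// Editor's worked example (not part of the original file), with both masks of
// size 4:
//   Mask    = {1, 0, 3, 2}
//   SubMask = {2, 2, 0, UndefMaskElem}
//   addMask(Mask, SubMask);
//   // Each new lane I takes Mask[SubMask[I]], so Mask becomes
//   // {3, 3, 1, UndefMaskElem}.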
692
693/// Order may have elements assigned the special value (size), which is out of
694/// bounds. Such indices only appear at positions that correspond to undef values
695/// (see canReuseExtract for details) and are used to keep undef values from
696/// affecting the operand ordering.
697/// The first loop below simply finds all unused indices and then the next loop
698/// nest assigns these indices to the undef value positions.
699/// As an example below Order has two undef positions and they have assigned
700/// values 3 and 7 respectively:
701/// before: 6 9 5 4 9 2 1 0
702/// after: 6 3 5 4 7 2 1 0
703static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
704 const unsigned Sz = Order.size();
705 SmallBitVector UnusedIndices(Sz, /*t=*/true);
706 SmallBitVector MaskedIndices(Sz);
707 for (unsigned I = 0; I < Sz; ++I) {
708 if (Order[I] < Sz)
709 UnusedIndices.reset(Order[I]);
710 else
711 MaskedIndices.set(I);
712 }
713 if (MaskedIndices.none())
714 return;
715 assert(UnusedIndices.count() == MaskedIndices.count() &&
716 "Non-synced masked/available indices.");
717 int Idx = UnusedIndices.find_first();
718 int MIdx = MaskedIndices.find_first();
719 while (MIdx >= 0) {
720 assert(Idx >= 0 && "Indices must be synced.");
721 Order[MIdx] = Idx;
722 Idx = UnusedIndices.find_next(Idx);
723 MIdx = MaskedIndices.find_next(MIdx);
724 }
725}
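// Editor's illustrative sketch (not part of the original SLPVectorizer.cpp),
// mirroring the example in the comment above; the wrapper function is
// hypothetical.
static void exampleFixupOrderingIndices() {
  // Out-of-bounds entries (here 9, with size 8) mark the undef positions.
  SmallVector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0};
  fixupOrderingIndices(Order);
  // Order is now {6, 3, 5, 4, 7, 2, 1, 0}: the two masked positions received
  // the previously unused indices 3 and 7.
}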
726
727namespace llvm {
728
729static void inversePermutation(ArrayRef<unsigned> Indices,
730 SmallVectorImpl<int> &Mask) {
731 Mask.clear();
732 const unsigned E = Indices.size();
733 Mask.resize(E, UndefMaskElem);
734 for (unsigned I = 0; I < E; ++I)
735 Mask[Indices[I]] = I;
736}
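// Editor's worked example (not part of the original file): for
// Indices = {2, 0, 1} the resulting Mask satisfies Mask[Indices[I]] == I,
// i.e. Mask == {1, 2, 0}.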
737
738/// \returns inserting index of InsertElement or InsertValue instruction,
739/// using Offset as base offset for index.
740static Optional<unsigned> getInsertIndex(const Value *InsertInst,
741 unsigned Offset = 0) {
742 int Index = Offset;
743 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
744 if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
745 auto *VT = cast<FixedVectorType>(IE->getType());
746 if (CI->getValue().uge(VT->getNumElements()))
747 return None;
748 Index *= VT->getNumElements();
749 Index += CI->getZExtValue();
750 return Index;
751 }
752 return None;
753 }
754
755 const auto *IV = cast<InsertValueInst>(InsertInst);
756 Type *CurrentType = IV->getType();
757 for (unsigned I : IV->indices()) {
758 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
759 Index *= ST->getNumElements();
760 CurrentType = ST->getElementType(I);
761 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
762 Index *= AT->getNumElements();
763 CurrentType = AT->getElementType();
764 } else {
765 return None;
766 }
767 Index += I;
768 }
769 return Index;
770}
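// Editor's worked example (not part of the original file): for an insertvalue
// into an aggregate of type {[2 x i32], [2 x i32]} at indices {1, 0}, the
// flattened index is ((0 * 2) + 1) * 2 + 0 == 2. An insertelement with a
// non-constant or out-of-range index yields None instead.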
771
772/// Reorders the list of scalars in accordance with the given \p Mask.
773static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
774 ArrayRef<int> Mask) {
775 assert(!Mask.empty() && "Expected non-empty mask.");
776 SmallVector<Value *> Prev(Scalars.size(),
777 UndefValue::get(Scalars.front()->getType()));
778 Prev.swap(Scalars);
779 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
780 if (Mask[I] != UndefMaskElem)
781 Scalars[Mask[I]] = Prev[I];
782}
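// Editor's worked example (not part of the original file): with
// Scalars = {a, b, c} and Mask = {2, 0, 1}, element I moves to position
// Mask[I], giving Scalars == {b, c, a}.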
783
784/// Checks if the provided value does not require scheduling. It does not
785/// require scheduling if this is not an instruction or it is an instruction
786/// that does not read/write memory and all operands are either not instructions
787/// or phi nodes or instructions from different blocks.
788static bool areAllOperandsNonInsts(Value *V) {
789 auto *I = dyn_cast<Instruction>(V);
790 if (!I)
791 return true;
792 return !mayHaveNonDefUseDependency(*I) &&
793 all_of(I->operands(), [I](Value *V) {
794 auto *IO = dyn_cast<Instruction>(V);
795 if (!IO)
796 return true;
797 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
798 });
799}
800
801/// Checks if the provided value does not require scheduling. It does not
802/// require scheduling if this is not an instruction or it is an instruction
803/// that does not read/write memory and all users are phi nodes or instructions
804/// from the different blocks.
805static bool isUsedOutsideBlock(Value *V) {
806 auto *I = dyn_cast<Instruction>(V);
807 if (!I)
808 return true;
809 // Limits the number of uses to save compile time.
810 constexpr int UsesLimit = 8;
811 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
812 all_of(I->users(), [I](User *U) {
813 auto *IU = dyn_cast<Instruction>(U);
814 if (!IU)
815 return true;
816 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
817 });
818}
819
820/// Checks if the specified value does not require scheduling. It does not
821/// require scheduling if all operands and all users do not need to be scheduled
822/// in the current basic block.
823static bool doesNotNeedToBeScheduled(Value *V) {
824 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
825}
826
827/// Checks if the specified array of instructions does not require scheduling.
828 /// It is so if all instructions either have operands that do not require
829/// scheduling or their users do not require scheduling since they are phis or
830/// in other basic blocks.
831static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
832 return !VL.empty() &&
833 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
834}
835
836namespace slpvectorizer {
837
838/// Bottom Up SLP Vectorizer.
839class BoUpSLP {
840 struct TreeEntry;
841 struct ScheduleData;
842
843public:
844 using ValueList = SmallVector<Value *, 8>;
845 using InstrList = SmallVector<Instruction *, 16>;
846 using ValueSet = SmallPtrSet<Value *, 16>;
847 using StoreList = SmallVector<StoreInst *, 8>;
848 using ExtraValueToDebugLocsMap =
849 MapVector<Value *, SmallVector<Instruction *, 2>>;
850 using OrdersType = SmallVector<unsigned, 4>;
851
852 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
853 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
854 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
855 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
856 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
857 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
858 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
859 // Use the vector register size specified by the target unless overridden
860 // by a command-line option.
861 // TODO: It would be better to limit the vectorization factor based on
862 // data type rather than just register size. For example, x86 AVX has
863 // 256-bit registers, but it does not support integer operations
864 // at that width (that requires AVX2).
865 if (MaxVectorRegSizeOption.getNumOccurrences())
866 MaxVecRegSize = MaxVectorRegSizeOption;
867 else
868 MaxVecRegSize =
869 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
870 .getFixedSize();
871
872 if (MinVectorRegSizeOption.getNumOccurrences())
873 MinVecRegSize = MinVectorRegSizeOption;
874 else
875 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
876 }
877
878 /// Vectorize the tree that starts with the elements in \p VL.
879 /// Returns the vectorized root.
880 Value *vectorizeTree();
881
882 /// Vectorize the tree but with the list of externally used values \p
883 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
884 /// generated extractvalue instructions.
885 Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
886
887 /// \returns the cost incurred by unwanted spills and fills, caused by
888 /// holding live values over call sites.
889 InstructionCost getSpillCost() const;
890
891 /// \returns the vectorization cost of the subtree that starts at \p VL.
892 /// A negative number means that this is profitable.
893 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
894
895 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
896 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
897 void buildTree(ArrayRef<Value *> Roots,
898 const SmallDenseSet<Value *> &UserIgnoreLst);
899
900 /// Construct a vectorizable tree that starts at \p Roots.
901 void buildTree(ArrayRef<Value *> Roots);
902
903 /// Builds external uses of the vectorized scalars, i.e. the list of
904 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
905 /// ExternallyUsedValues contains additional list of external uses to handle
906 /// vectorization of reductions.
907 void
908 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
909
910 /// Clear the internal data structures that are created by 'buildTree'.
911 void deleteTree() {
912 VectorizableTree.clear();
913 ScalarToTreeEntry.clear();
914 MustGather.clear();
915 ExternalUses.clear();
916 for (auto &Iter : BlocksSchedules) {
917 BlockScheduling *BS = Iter.second.get();
918 BS->clear();
919 }
920 MinBWs.clear();
921 InstrElementSize.clear();
922 UserIgnoreList = nullptr;
923 }
924
925 unsigned getTreeSize() const { return VectorizableTree.size(); }
926
927 /// Perform LICM and CSE on the newly generated gather sequences.
928 void optimizeGatherSequence();
929
930 /// Checks if the specified gather tree entry \p TE can be represented as a
931 /// shuffled vector entry + (possibly) permutation with other gathers. It
932 /// implements the checks only for possibly ordered scalars (Loads,
933 /// ExtractElement, ExtractValue), which can be part of the graph.
934 Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
935
936 /// Sort loads into increasing pointers offsets to allow greater clustering.
937 Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
938
939 /// Gets reordering data for the given tree entry. If the entry is vectorized
940 /// - just return ReorderIndices, otherwise check if the scalars can be
941 /// reordered and return the most optimal order.
942 /// \param TopToBottom If true, include the order of vectorized stores and
943 /// insertelement nodes, otherwise skip them.
944 Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
945
946 /// Reorders the current graph to the most profitable order starting from the
947 /// root node to the leaf nodes. The best order is chosen only from the nodes
948 /// of the same size (vectorization factor). Smaller nodes are considered
949 /// parts of subgraph with smaller VF and they are reordered independently. We
950 /// can make it because we still need to extend smaller nodes to the wider VF
951 /// and we can merge reordering shuffles with the widening shuffles.
952 void reorderTopToBottom();
953
954 /// Reorders the current graph to the most profitable order starting from
955 /// leaves to the root. It allows us to rotate small subgraphs and reduce the
956 /// number of reshuffles if the leaf nodes use the same order. In this case we
957 /// can merge the orders and just shuffle the user node instead of shuffling its
958 /// operands. Plus, even if the leaf nodes have different orders, it allows us to
959 /// sink the reordering in the graph closer to the root node and merge it later
960 /// during analysis.
961 void reorderBottomToTop(bool IgnoreReorder = false);
962
963 /// \return The vector element size in bits to use when vectorizing the
964 /// expression tree ending at \p V. If V is a store, the size is the width of
965 /// the stored value. Otherwise, the size is the width of the largest loaded
966 /// value reaching V. This method is used by the vectorizer to calculate
967 /// vectorization factors.
968 unsigned getVectorElementSize(Value *V);
969
970 /// Compute the minimum type sizes required to represent the entries in a
971 /// vectorizable tree.
972 void computeMinimumValueSizes();
973
974 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
975 unsigned getMaxVecRegSize() const {
976 return MaxVecRegSize;
977 }
978
979 // \returns minimum vector register size as set by cl::opt.
980 unsigned getMinVecRegSize() const {
981 return MinVecRegSize;
982 }
983
984 unsigned getMinVF(unsigned Sz) const {
985 return std::max(2U, getMinVecRegSize() / Sz);
986 }
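  // Editor's note (not part of the original file): with the default
  // slp-min-reg-size of 128 bits and 32-bit elements,
  // getMinVF(32) == std::max(2U, 128 / 32) == 4.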
987
988 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
989 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
990 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
991 return MaxVF ? MaxVF : UINT_MAX;
992 }
993
994 /// Check if homogeneous aggregate is isomorphic to some VectorType.
995 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
996 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
997 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
998 ///
999 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1000 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
1001
1002 /// \returns True if the VectorizableTree is both tiny and not fully
1003 /// vectorizable. We do not vectorize such trees.
1004 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1005
1006 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1007 /// can be load combined in the backend. Load combining may not be allowed in
1008 /// the IR optimizer, so we do not want to alter the pattern. For example,
1009 /// partially transforming a scalar bswap() pattern into vector code is
1010 /// effectively impossible for the backend to undo.
1011 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1012 /// may not be necessary.
1013 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1014
1015 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1016 /// can be load combined in the backend. Load combining may not be allowed in
1017 /// the IR optimizer, so we do not want to alter the pattern. For example,
1018 /// partially transforming a scalar bswap() pattern into vector code is
1019 /// effectively impossible for the backend to undo.
1020 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1021 /// may not be necessary.
1022 bool isLoadCombineCandidate() const;
1023
1024 OptimizationRemarkEmitter *getORE() { return ORE; }
1025
1026 /// This structure holds any data we need about the edges being traversed
1027 /// during buildTree_rec(). We keep track of:
1028 /// (i) the user TreeEntry index, and
1029 /// (ii) the index of the edge.
1030 struct EdgeInfo {
1031 EdgeInfo() = default;
1032 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1033 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1034 /// The user TreeEntry.
1035 TreeEntry *UserTE = nullptr;
1036 /// The operand index of the use.
1037 unsigned EdgeIdx = UINT_MAX;
1038#ifndef NDEBUG
1039 friend inline raw_ostream &operator<<(raw_ostream &OS,
1040 const BoUpSLP::EdgeInfo &EI) {
1041 EI.dump(OS);
1042 return OS;
1043 }
1044 /// Debug print.
1045 void dump(raw_ostream &OS) const {
1046 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1047 << " EdgeIdx:" << EdgeIdx << "}";
1048 }
1049 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1050#endif
1051 };
1052
1053 /// A helper class used for scoring candidates for two consecutive lanes.
1054 class LookAheadHeuristics {
1055 const DataLayout &DL;
1056 ScalarEvolution &SE;
1057 const BoUpSLP &R;
1058 int NumLanes; // Total number of lanes (aka vectorization factor).
1059 int MaxLevel; // The maximum recursion depth for accumulating score.
1060
1061 public:
1062 LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,
1063 const BoUpSLP &R, int NumLanes, int MaxLevel)
1064 : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
1065
1066 // The hard-coded scores listed here are not very important, though they
1067 // should be higher for better matches to improve the resulting cost. When
1068 // computing the scores of matching one sub-tree with another, we are
1069 // basically counting the number of values that are matching. So even if all
1070 // scores are set to 1, we would still get a decent matching result.
1071 // However, sometimes we have to break ties. For example we may have to
1072 // choose between matching loads vs matching opcodes. This is what these
1073 // scores are helping us with: they provide the order of preference. Also,
1074 // this is important if the scalar is externally used or used in another
1075 // tree entry node in the different lane.
1076
1077 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1078 static const int ScoreConsecutiveLoads = 4;
1079 /// The same load multiple times. This should have a better score than
1080 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1081 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1082 /// a vector load and 1.0 for a broadcast.
1083 static const int ScoreSplatLoads = 3;
1084 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1085 static const int ScoreReversedLoads = 3;
1086 /// ExtractElementInst from same vector and consecutive indexes.
1087 static const int ScoreConsecutiveExtracts = 4;
1088 /// ExtractElementInst from same vector and reversed indices.
1089 static const int ScoreReversedExtracts = 3;
1090 /// Constants.
1091 static const int ScoreConstants = 2;
1092 /// Instructions with the same opcode.
1093 static const int ScoreSameOpcode = 2;
1094 /// Instructions with alt opcodes (e.g, add + sub).
1095 static const int ScoreAltOpcodes = 1;
1096 /// Identical instructions (a.k.a. splat or broadcast).
1097 static const int ScoreSplat = 1;
1098 /// Matching with an undef is preferable to failing.
1099 static const int ScoreUndef = 1;
1100 /// Score for failing to find a decent match.
1101 static const int ScoreFail = 0;
1102 /// Score if all users are vectorized.
1103 static const int ScoreAllUserVectorized = 1;
1104
1105 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1106 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1107 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1108 /// MainAltOps.
1109 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1110 ArrayRef<Value *> MainAltOps) const {
1111 if (V1 == V2) {
1112 if (isa<LoadInst>(V1)) {
1113 // Returns true if the users of V1 and V2 won't need to be extracted.
1114 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1115 // Bail out if we have too many uses to save compilation time.
1116 static constexpr unsigned Limit = 8;
1117 if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
1118 return false;
1119
1120 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1121 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1122 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1123 });
1124 };
1125 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1126 };
1127 // A broadcast of a load can be cheaper on some targets.
1128 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1129 ElementCount::getFixed(NumLanes)) &&
1130 ((int)V1->getNumUses() == NumLanes ||
1131 AllUsersAreInternal(V1, V2)))
1132 return LookAheadHeuristics::ScoreSplatLoads;
1133 }
1134 return LookAheadHeuristics::ScoreSplat;
1135 }
1136
1137 auto *LI1 = dyn_cast<LoadInst>(V1);
1138 auto *LI2 = dyn_cast<LoadInst>(V2);
1139 if (LI1 && LI2) {
1140 if (LI1->getParent() != LI2->getParent())
1141 return LookAheadHeuristics::ScoreFail;
1142
1143 Optional<int> Dist = getPointersDiff(
1144 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1145 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1146 if (!Dist || *Dist == 0)
1147 return LookAheadHeuristics::ScoreFail;
1148 // The distance is too large - still may be profitable to use masked
1149 // loads/gathers.
1150 if (std::abs(*Dist) > NumLanes / 2)
1151 return LookAheadHeuristics::ScoreAltOpcodes;
1152 // This still will detect consecutive loads, but we might have "holes"
1153 // in some cases. It is ok for non-power-2 vectorization and may produce
1154 // better results. It should not affect current vectorization.
1155 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1156 : LookAheadHeuristics::ScoreReversedLoads;
1157 }
1158
1159 auto *C1 = dyn_cast<Constant>(V1);
1160 auto *C2 = dyn_cast<Constant>(V2);
1161 if (C1 && C2)
1162 return LookAheadHeuristics::ScoreConstants;
1163
1164 // Extracts from consecutive indexes of the same vector better score as
1165 // the extracts could be optimized away.
1166 Value *EV1;
1167 ConstantInt *Ex1Idx;
1168 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1169 // Undefs are always profitable for extractelements.
1170 if (isa<UndefValue>(V2))
1171 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1172 Value *EV2 = nullptr;
1173 ConstantInt *Ex2Idx = nullptr;
1174 if (match(V2,
1175 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1176 m_Undef())))) {
1177 // Undefs are always profitable for extractelements.
1178 if (!Ex2Idx)
1179 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1180 if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
1181 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1182 if (EV2 == EV1) {
1183 int Idx1 = Ex1Idx->getZExtValue();
1184 int Idx2 = Ex2Idx->getZExtValue();
1185 int Dist = Idx2 - Idx1;
1186 // The distance is too large - still may be profitable to use
1187 // shuffles.
1188 if (std::abs(Dist) == 0)
1189 return LookAheadHeuristics::ScoreSplat;
1190 if (std::abs(Dist) > NumLanes / 2)
1191 return LookAheadHeuristics::ScoreSameOpcode;
1192 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1193 : LookAheadHeuristics::ScoreReversedExtracts;
1194 }
1195 return LookAheadHeuristics::ScoreAltOpcodes;
1196 }
1197 return LookAheadHeuristics::ScoreFail;
1198 }
1199
1200 auto *I1 = dyn_cast<Instruction>(V1);
1201 auto *I2 = dyn_cast<Instruction>(V2);
1202 if (I1 && I2) {
1203 if (I1->getParent() != I2->getParent())
1204 return LookAheadHeuristics::ScoreFail;
1205 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1206 Ops.push_back(I1);
1207 Ops.push_back(I2);
1208 InstructionsState S = getSameOpcode(Ops);
1209 // Note: Only consider instructions with <= 2 operands to avoid
1210 // complexity explosion.
1211 if (S.getOpcode() &&
1212 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1213 !S.isAltShuffle()) &&
1214 all_of(Ops, [&S](Value *V) {
1215 return cast<Instruction>(V)->getNumOperands() ==
1216 S.MainOp->getNumOperands();
1217 }))
1218 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1219 : LookAheadHeuristics::ScoreSameOpcode;
1220 }
1221
1222 if (isa<UndefValue>(V2))
1223 return LookAheadHeuristics::ScoreUndef;
1224
1225 return LookAheadHeuristics::ScoreFail;
1226 }
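  // Illustrative example (names and types assumed): for two loads in the same
  // basic block, LI1 = load i32 from %p and LI2 = load i32 from %p + 4 bytes,
  // getPointersDiff() returns 1, so the pair scores ScoreConsecutiveLoads;
  // with the operands swapped the distance is -1 and the pair scores
  // ScoreReversedLoads. A distance of 0 or an unknown distance scores
  // ScoreFail, and a distance larger than NumLanes / 2 scores ScoreAltOpcodes.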
1227
1228 /// Go through the operands of \p LHS and \p RHS recursively until
1229 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1230 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1231 /// of \p U1 and \p U2), except at the beginning of the recursion where
1232 /// these are set to nullptr.
1233 ///
1234 /// For example:
1235 /// \verbatim
1236 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1237 /// \ / \ / \ / \ /
1238 /// + + + +
1239 /// G1 G2 G3 G4
1240 /// \endverbatim
1241 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1242 /// each level recursively, accumulating the score. It starts from matching
1243 /// the additions at level 0, then moves on to the loads (level 1). The
1244 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1245 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1246 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1247 /// Please note that the order of the operands does not matter, as we
1248 /// evaluate the score of all profitable combinations of operands. In
1249 /// other words the score of G1 and G4 is the same as G1 and G2. This
1250 /// heuristic is based on ideas described in:
1251 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1252 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1253 /// Luís F. W. Góes
1254 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1255 Instruction *U2, int CurrLevel,
1256 ArrayRef<Value *> MainAltOps) const {
1257
1258 // Get the shallow score of V1 and V2.
1259 int ShallowScoreAtThisLevel =
1260 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1261
1262 // If reached MaxLevel,
1263 // or if V1 and V2 are not instructions,
1264 // or if they are SPLAT,
1265 // or if they are not consecutive,
1266 // or if it is profitable to vectorize loads or extractelements, early return
1267 // the current cost.
1268 auto *I1 = dyn_cast<Instruction>(LHS);
1269 auto *I2 = dyn_cast<Instruction>(RHS);
1270 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1271 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1272 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1273 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1274 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1275 ShallowScoreAtThisLevel))
1276 return ShallowScoreAtThisLevel;
1277     assert(I1 && I2 && "Should have early exited.");
1278
1279 // Contains the I2 operand indexes that got matched with I1 operands.
1280 SmallSet<unsigned, 4> Op2Used;
1281
1282 // Recursion towards the operands of I1 and I2. We are trying all possible
1283 // operand pairs, and keeping track of the best score.
1284 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1285 OpIdx1 != NumOperands1; ++OpIdx1) {
1286 // Try to pair op1I with the best operand of I2.
1287 int MaxTmpScore = 0;
1288 unsigned MaxOpIdx2 = 0;
1289 bool FoundBest = false;
1290 // If I2 is commutative try all combinations.
1291 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1292 unsigned ToIdx = isCommutative(I2)
1293 ? I2->getNumOperands()
1294 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1295       assert(FromIdx <= ToIdx && "Bad index");
1296 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1297 // Skip operands already paired with OpIdx1.
1298 if (Op2Used.count(OpIdx2))
1299 continue;
1300 // Recursively calculate the cost at each level
1301 int TmpScore =
1302 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1303 I1, I2, CurrLevel + 1, None);
1304 // Look for the best score.
1305 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1306 TmpScore > MaxTmpScore) {
1307 MaxTmpScore = TmpScore;
1308 MaxOpIdx2 = OpIdx2;
1309 FoundBest = true;
1310 }
1311 }
1312 if (FoundBest) {
1313 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1314 Op2Used.insert(MaxOpIdx2);
1315 ShallowScoreAtThisLevel += MaxTmpScore;
1316 }
1317 }
1318 return ShallowScoreAtThisLevel;
1319 }
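  // Illustrative example: when matching I1 = (A[0] + B[0]) against
  // I2 = (B[1] + A[1]) (names assumed), '+' is commutative, so operand 0 of I1
  // (A[0]) is scored against both operands of I2 and pairs with A[1]; operand 1
  // (B[0]) then pairs with the remaining B[1]. The accumulated score is thus
  // the same as for matching against (A[1] + B[1]), as described above.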
1320 };
1321 /// A helper data structure to hold the operands of a vector of instructions.
1322 /// This supports a fixed vector length for all operand vectors.
1323 class VLOperands {
1324 /// For each operand we need (i) the value, and (ii) the opcode that it
1325 /// would be attached to if the expression was in a left-linearized form.
1326 /// This is required to avoid illegal operand reordering.
1327 /// For example:
1328 /// \verbatim
1329 /// 0 Op1
1330 /// |/
1331 /// Op1 Op2 Linearized + Op2
1332 /// \ / ----------> |/
1333 /// - -
1334 ///
1335 /// Op1 - Op2 (0 + Op1) - Op2
1336 /// \endverbatim
1337 ///
1338 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1339 ///
1340 /// Another way to think of this is to track all the operations across the
1341 /// path from the operand all the way to the root of the tree and to
1342 /// calculate the operation that corresponds to this path. For example, the
1343 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1344 /// corresponding operation is a '-' (which matches the one in the
1345 /// linearized tree, as shown above).
1346 ///
1347 /// For lack of a better term, we refer to this operation as Accumulated
1348 /// Path Operation (APO).
1349 struct OperandData {
1350 OperandData() = default;
1351 OperandData(Value *V, bool APO, bool IsUsed)
1352 : V(V), APO(APO), IsUsed(IsUsed) {}
1353 /// The operand value.
1354 Value *V = nullptr;
1355 /// TreeEntries only allow a single opcode, or an alternate sequence of
1356 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1357 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1358 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1359 /// (e.g., Add/Mul)
1360 bool APO = false;
1361 /// Helper data for the reordering function.
1362 bool IsUsed = false;
1363 };
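    // Illustrative example (values assumed): for the bundle {a - b, c - d},
    // operand 0 ({a, c}) gets APO == false, while operand 1 ({b, d}) gets
    // APO == true, because subtraction is not commutative and its RHS is
    // attached to an inverse operation in the left-linearized form.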
1364
1365 /// During operand reordering, we are trying to select the operand at lane
1366 /// that matches best with the operand at the neighboring lane. Our
1367 /// selection is based on the type of value we are looking for. For example,
1368 /// if the neighboring lane has a load, we need to look for a load that is
1369 /// accessing a consecutive address. These strategies are summarized in the
1370 /// 'ReorderingMode' enumerator.
1371 enum class ReorderingMode {
1372 Load, ///< Matching loads to consecutive memory addresses
1373 Opcode, ///< Matching instructions based on opcode (same or alternate)
1374 Constant, ///< Matching constants
1375 Splat, ///< Matching the same instruction multiple times (broadcast)
1376 Failed, ///< We failed to create a vectorizable group
1377 };
1378
1379 using OperandDataVec = SmallVector<OperandData, 2>;
1380
1381 /// A vector of operand vectors.
1382 SmallVector<OperandDataVec, 4> OpsVec;
1383
1384 const DataLayout &DL;
1385 ScalarEvolution &SE;
1386 const BoUpSLP &R;
1387
1388 /// \returns the operand data at \p OpIdx and \p Lane.
1389 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1390 return OpsVec[OpIdx][Lane];
1391 }
1392
1393 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1394 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1395 return OpsVec[OpIdx][Lane];
1396 }
1397
1398 /// Clears the used flag for all entries.
1399 void clearUsed() {
1400 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1401 OpIdx != NumOperands; ++OpIdx)
1402 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1403 ++Lane)
1404 OpsVec[OpIdx][Lane].IsUsed = false;
1405 }
1406
1407 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1408 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1409 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1410 }
1411
1412 /// \param Lane lane of the operands under analysis.
1413 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1414 /// the best candidate.
1415 /// \param Idx operand index of the current candidate value.
1416 /// \returns The additional score due to possible broadcasting of the
1417 /// elements in the lane. It is more profitable to have power-of-2 unique
1418 /// elements in the lane, as it will be vectorized with higher probability
1419 /// after removing duplicates. Currently the SLP vectorizer supports only
1420 /// vectorization of a power-of-2 number of unique scalars.
1421 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1422 Value *IdxLaneV = getData(Idx, Lane).V;
1423 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1424 return 0;
1425 SmallPtrSet<Value *, 4> Uniques;
1426 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1427 if (Ln == Lane)
1428 continue;
1429 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1430 if (!isa<Instruction>(OpIdxLnV))
1431 return 0;
1432 Uniques.insert(OpIdxLnV);
1433 }
1434 int UniquesCount = Uniques.size();
1435 int UniquesCntWithIdxLaneV =
1436 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1437 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1438 int UniquesCntWithOpIdxLaneV =
1439 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1440 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1441 return 0;
1442 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1443 UniquesCntWithOpIdxLaneV) -
1444 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1445 }
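    // Worked example (values assumed, all of them instructions): if the other
    // lanes at OpIdx hold the unique values {A, B} (UniquesCount == 2), a
    // candidate IdxLaneV == A keeps the count at 2 (already a power of two),
    // while the current OpIdxLaneV == C would raise it to 3 (one short of 4).
    // The result is (4 - 3) - (2 - 2) == 1, slightly favoring the
    // splat-friendly candidate.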
1446
1447 /// \param Lane lane of the operands under analysis.
1448 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1449 /// the best candidate.
1450 /// \param Idx operand index of the current candidate value.
1451 /// \returns The additional score for the scalar whose users are all
1452 /// vectorized.
1453 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1454 Value *IdxLaneV = getData(Idx, Lane).V;
1455 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1456 // Do not care about number of uses for vector-like instructions
1457 // (extractelement/extractvalue with constant indices), as they are extracts
1458 // themselves and already externally used. Vectorization of such
1459 // instructions does not add an extra extractelement instruction; it may just
1460 // remove it.
1461 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1462 isVectorLikeInstWithConstOps(OpIdxLaneV))
1463 return LookAheadHeuristics::ScoreAllUserVectorized;
1464 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1465 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1466 return 0;
1467 return R.areAllUsersVectorized(IdxLaneI, None)
1468 ? LookAheadHeuristics::ScoreAllUserVectorized
1469 : 0;
1470 }
1471
1472 /// Score scaling factor for fully compatible instructions but with
1473 /// different number of external uses. Allows better selection of the
1474 /// instructions with fewer external uses.
1475 static const int ScoreScaleFactor = 10;
1476
1477 /// \Returns the look-ahead score, which tells us how much the sub-trees
1478 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1479 /// score. This helps break ties in an informed way when we cannot decide on
1480 /// the order of the operands by just considering the immediate
1481 /// predecessors.
1482 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1483 int Lane, unsigned OpIdx, unsigned Idx,
1484 bool &IsUsed) {
1485 LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),
1486 LookAheadMaxDepth);
1487 // Keep track of the instruction stack as we recurse into the operands
1488 // during the look-ahead score exploration.
1489 int Score =
1490 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1491 /*CurrLevel=*/1, MainAltOps);
1492 if (Score) {
1493 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1494 if (Score <= -SplatScore) {
1495 // Set the minimum score for splat-like sequence to avoid setting
1496 // failed state.
1497 Score = 1;
1498 } else {
1499 Score += SplatScore;
1500 // Scale score to see the difference between different operands
1501 // and similar operands but all vectorized/not all vectorized
1502 // uses. It does not affect actual selection of the best
1503 // compatible operand in general, just allows selecting the
1504 // operand with all vectorized uses.
1505 Score *= ScoreScaleFactor;
1506 Score += getExternalUseScore(Lane, OpIdx, Idx);
1507 IsUsed = true;
1508 }
1509 }
1510 return Score;
1511 }
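    // Illustrative arithmetic (numbers assumed): with a base Score of 4 and a
    // SplatScore of 1, the returned value is (4 + 1) * ScoreScaleFactor plus
    // the external-use bonus (0 or ScoreAllUserVectorized). With a base Score
    // of 2 and a SplatScore of -3, Score <= -SplatScore holds and the result
    // is clamped to 1 so the lane is not treated as failed.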
1512
1513 /// Best defined scores per lanes between the passes. Used to choose the
1514 /// best operand (with the highest score) between the passes.
1515 /// The key - {Operand Index, Lane}.
1516 /// The value - the best score between the passes for the lane and the
1517 /// operand.
1518 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1519 BestScoresPerLanes;
1520
1521 // Search all operands in Ops[*][Lane] for the one that best matches
1522 // Ops[OpIdx][LastLane] and return its operand index.
1523 // If no good match can be found, return None.
1524 Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1525 ArrayRef<ReorderingMode> ReorderingModes,
1526 ArrayRef<Value *> MainAltOps) {
1527 unsigned NumOperands = getNumOperands();
1528
1529 // The operand of the previous lane at OpIdx.
1530 Value *OpLastLane = getData(OpIdx, LastLane).V;
1531
1532 // Our strategy mode for OpIdx.
1533 ReorderingMode RMode = ReorderingModes[OpIdx];
1534 if (RMode == ReorderingMode::Failed)
1535 return None;
1536
1537 // The linearized opcode of the operand at OpIdx, Lane.
1538 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1539
1540 // The best operand index and its score.
1541 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1542 // are using the score to differentiate between the two.
1543 struct BestOpData {
1544 Optional<unsigned> Idx = None;
1545 unsigned Score = 0;
1546 } BestOp;
1547 BestOp.Score =
1548 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1549 .first->second;
1550
1551 // Track if the operand must be marked as used. If the operand is set to
1552 // Score 1 explicitly (because of non-power-of-2 unique scalars, we may
1553 // want to reestimate the operands again on the following iterations).
1554 bool IsUsed =
1555 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1556 // Iterate through all unused operands and look for the best.
1557 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1558 // Get the operand at Idx and Lane.
1559 OperandData &OpData = getData(Idx, Lane);
1560 Value *Op = OpData.V;
1561 bool OpAPO = OpData.APO;
1562
1563 // Skip already selected operands.
1564 if (OpData.IsUsed)
1565 continue;
1566
1567 // Skip if we are trying to move the operand to a position with a
1568 // different opcode in the linearized tree form. This would break the
1569 // semantics.
1570 if (OpAPO != OpIdxAPO)
1571 continue;
1572
1573 // Look for an operand that matches the current mode.
1574 switch (RMode) {
1575 case ReorderingMode::Load:
1576 case ReorderingMode::Constant:
1577 case ReorderingMode::Opcode: {
1578 bool LeftToRight = Lane > LastLane;
1579 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1580 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1581 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1582 OpIdx, Idx, IsUsed);
1583 if (Score > static_cast<int>(BestOp.Score)) {
1584 BestOp.Idx = Idx;
1585 BestOp.Score = Score;
1586 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1587 }
1588 break;
1589 }
1590 case ReorderingMode::Splat:
1591 if (Op == OpLastLane)
1592 BestOp.Idx = Idx;
1593 break;
1594 case ReorderingMode::Failed:
1595         llvm_unreachable("Not expected Failed reordering mode.");
1596 }
1597 }
1598
1599 if (BestOp.Idx) {
1600 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1601 return BestOp.Idx;
1602 }
1603 // If we could not find a good match return None.
1604 return None;
1605 }
1606
1607 /// Helper for reorderOperandVecs.
1608 /// \returns the lane that we should start reordering from. This is the lane
1609 /// with the fewest operands that can freely move around, or the least
1610 /// profitable lane because it already has the most optimal set of operands.
1611 unsigned getBestLaneToStartReordering() const {
1612     unsigned Min = UINT_MAX;
1613 unsigned SameOpNumber = 0;
1614 // std::pair<unsigned, unsigned> is used to implement a simple voting
1615 // algorithm and choose the lane with the fewest operands that can freely
1616 // move around, or the least profitable lane because it already has the
1617 // most optimal set of operands. The first unsigned is a counter for
1618 // voting, the second unsigned is the counter of lanes with instructions
1619 // with same/alternate opcodes and same parent basic block.
1620 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
1621 // Try to be closer to the original results, if we have multiple lanes
1622 // with same cost. If 2 lanes have the same cost, use the one with the
1623 // lowest index.
1624 for (int I = getNumLanes(); I > 0; --I) {
1625 unsigned Lane = I - 1;
1626 OperandsOrderData NumFreeOpsHash =
1627 getMaxNumOperandsThatCanBeReordered(Lane);
1628 // Compare the number of operands that can move and choose the one with
1629 // the least number.
1630 if (NumFreeOpsHash.NumOfAPOs < Min) {
1631 Min = NumFreeOpsHash.NumOfAPOs;
1632 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1633 HashMap.clear();
1634 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1635 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1636 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1637 // Select the most optimal lane in terms of number of operands that
1638 // should be moved around.
1639 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1640 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1641 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1642 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1643 auto It = HashMap.find(NumFreeOpsHash.Hash);
1644 if (It == HashMap.end())
1645 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1646 else
1647 ++It->second.first;
1648 }
1649 }
1650 // Select the lane with the minimum counter.
1651 unsigned BestLane = 0;
1652     unsigned CntMin = UINT_MAX;
1653 for (const auto &Data : reverse(HashMap)) {
1654 if (Data.second.first < CntMin) {
1655 CntMin = Data.second.first;
1656 BestLane = Data.second.second;
1657 }
1658 }
1659 return BestLane;
1660 }
1661
1662 /// Data structure that helps to reorder operands.
1663 struct OperandsOrderData {
1664 /// The best number of operands with the same APOs, which can be
1665 /// reordered.
1666     unsigned NumOfAPOs = UINT_MAX;
1667 /// Number of operands with the same/alternate instruction opcode and
1668 /// parent.
1669 unsigned NumOpsWithSameOpcodeParent = 0;
1670 /// Hash for the actual operands ordering.
1671 /// Used to count operands, actually their position id and opcode
1672 /// value. It is used in the voting mechanism to find the lane with the
1673 /// least number of operands that can freely move about or less profitable
1674 /// because it already has the most optimal set of operands. Can be
1675 /// replaced with SmallVector<unsigned> instead but hash code is faster
1676 /// and requires less memory.
1677 unsigned Hash = 0;
1678 };
1679 /// \returns the maximum number of operands that are allowed to be reordered
1680 /// for \p Lane and the number of compatible instructions (with the same
1681 /// parent/opcode). This is used as a heuristic for selecting the first lane
1682 /// to start operand reordering.
1683 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1684 unsigned CntTrue = 0;
1685 unsigned NumOperands = getNumOperands();
1686 // Operands with the same APO can be reordered. We therefore need to count
1687 // how many of them we have for each APO, like this: Cnt[APO] = x.
1688 // Since we only have two APOs, namely true and false, we can avoid using
1689 // a map. Instead we can simply count the number of operands that
1690 // correspond to one of them (in this case the 'true' APO), and calculate
1691 // the other by subtracting it from the total number of operands.
1692 // Operands with the same instruction opcode and parent are more
1693 // profitable since we don't need to move them in many cases, with a high
1694 // probability such lane already can be vectorized effectively.
1695 bool AllUndefs = true;
1696 unsigned NumOpsWithSameOpcodeParent = 0;
1697 Instruction *OpcodeI = nullptr;
1698 BasicBlock *Parent = nullptr;
1699 unsigned Hash = 0;
1700 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1701 const OperandData &OpData = getData(OpIdx, Lane);
1702 if (OpData.APO)
1703 ++CntTrue;
1704 // Use Boyer-Moore majority voting for finding the majority opcode and
1705 // the number of times it occurs.
1706 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1707 if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
1708 I->getParent() != Parent) {
1709 if (NumOpsWithSameOpcodeParent == 0) {
1710 NumOpsWithSameOpcodeParent = 1;
1711 OpcodeI = I;
1712 Parent = I->getParent();
1713 } else {
1714 --NumOpsWithSameOpcodeParent;
1715 }
1716 } else {
1717 ++NumOpsWithSameOpcodeParent;
1718 }
1719 }
1720 Hash = hash_combine(
1721 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1722 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1723 }
1724 if (AllUndefs)
1725 return {};
1726 OperandsOrderData Data;
1727 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1728 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1729 Data.Hash = Hash;
1730 return Data;
1731 }
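    // Illustrative example (values assumed): a lane whose two operands are a
    // load %a (APO == false) and the constant 5 (APO == false) gives
    // CntTrue == 0, so NumOfAPOs == max(0, 2 - 0) == 2, and
    // NumOpsWithSameOpcodeParent == 1 because only the load is an instruction.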
1732
1733 /// Go through the instructions in VL and append their operands.
1734 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1735     assert(!VL.empty() && "Bad VL");
1736     assert((empty() || VL.size() == getNumLanes()) &&
1737            "Expected same number of lanes");
1738     assert(isa<Instruction>(VL[0]) && "Expected instruction");
1739 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1740 OpsVec.resize(NumOperands);
1741 unsigned NumLanes = VL.size();
1742 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1743 OpsVec[OpIdx].resize(NumLanes);
1744 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1745         assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1746 // Our tree has just 3 nodes: the root and two operands.
1747 // It is therefore trivial to get the APO. We only need to check the
1748 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1749 // RHS operand. The LHS operand of both add and sub is never attached
1750 // to an inverse operation in the linearized form, therefore its APO
1751 // is false. The RHS is true only if VL[Lane] is an inverse operation.
1752
1753 // Since operand reordering is performed on groups of commutative
1754 // operations or alternating sequences (e.g., +, -), we can safely
1755 // tell the inverse operations by checking commutativity.
1756 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1757 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1758 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1759 APO, false};
1760 }
1761 }
1762 }
1763
1764 /// \returns the number of operands.
1765 unsigned getNumOperands() const { return OpsVec.size(); }
1766
1767 /// \returns the number of lanes.
1768 unsigned getNumLanes() const { return OpsVec[0].size(); }
1769
1770 /// \returns the operand value at \p OpIdx and \p Lane.
1771 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1772 return getData(OpIdx, Lane).V;
1773 }
1774
1775 /// \returns true if the data structure is empty.
1776 bool empty() const { return OpsVec.empty(); }
1777
1778 /// Clears the data.
1779 void clear() { OpsVec.clear(); }
1780
1781 /// \Returns true if there are enough operands identical to \p Op to fill
1782 /// the whole vector.
1783 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
1784 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1785 bool OpAPO = getData(OpIdx, Lane).APO;
1786 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1787 if (Ln == Lane)
1788 continue;
1789 // This is set to true if we found a candidate for broadcast at Lane.
1790 bool FoundCandidate = false;
1791 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1792 OperandData &Data = getData(OpI, Ln);
1793 if (Data.APO != OpAPO || Data.IsUsed)
1794 continue;
1795 if (Data.V == Op) {
1796 FoundCandidate = true;
1797 Data.IsUsed = true;
1798 break;
1799 }
1800 }
1801 if (!FoundCandidate)
1802 return false;
1803 }
1804 return true;
1805 }
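    // Illustrative example (values assumed): if Op == %x and every other lane
    // contains %x among its not-yet-used operands with the same APO, the
    // function returns true and reorder() below switches this operand index to
    // ReorderingMode::Splat.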
1806
1807 public:
1808 /// Initialize with all the operands of the instruction vector \p RootVL.
1809 VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
1810 ScalarEvolution &SE, const BoUpSLP &R)
1811 : DL(DL), SE(SE), R(R) {
1812 // Append all the operands of RootVL.
1813 appendOperandsOfVL(RootVL);
1814 }
1815
1816 /// \Returns a value vector with the operands across all lanes for the
1817 /// operand at \p OpIdx.
1818 ValueList getVL(unsigned OpIdx) const {
1819 ValueList OpVL(OpsVec[OpIdx].size());
1820     assert(OpsVec[OpIdx].size() == getNumLanes() &&
1821            "Expected same num of lanes across all operands");
1822 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1823 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
1824 return OpVL;
1825 }
1826
1827 // Performs operand reordering for 2 or more operands.
1828 // The original operands are in OrigOps[OpIdx][Lane].
1829 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
1830 void reorder() {
1831 unsigned NumOperands = getNumOperands();
1832 unsigned NumLanes = getNumLanes();
1833 // Each operand has its own mode. We are using this mode to help us select
1834 // the instructions for each lane, so that they match best with the ones
1835 // we have selected so far.
1836 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
1837
1838 // This is a greedy single-pass algorithm. We are going over each lane
1839 // once and deciding on the best order right away with no back-tracking.
1840 // However, in order to increase its effectiveness, we start with the lane
1841 // that has operands that can move the least. For example, given the
1842 // following lanes:
1843 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
1844 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
1845 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
1846 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
1847 // we will start at Lane 1, since the operands of the subtraction cannot
1848 // be reordered. Then we will visit the rest of the lanes in a circular
1849 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
1850
1851 // Find the first lane that we will start our search from.
1852 unsigned FirstLane = getBestLaneToStartReordering();
1853
1854 // Initialize the modes.
1855 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1856 Value *OpLane0 = getValue(OpIdx, FirstLane);
1857 // Keep track if we have instructions with all the same opcode on one
1858 // side.
1859 if (isa<LoadInst>(OpLane0))
1860 ReorderingModes[OpIdx] = ReorderingMode::Load;
1861 else if (isa<Instruction>(OpLane0)) {
1862 // Check if OpLane0 should be broadcast.
1863 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
1864 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1865 else
1866 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
1867 }
1868 else if (isa<Constant>(OpLane0))
1869 ReorderingModes[OpIdx] = ReorderingMode::Constant;
1870 else if (isa<Argument>(OpLane0))
1871 // Our best hope is a Splat. It may save some cost in some cases.
1872 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1873 else
1874 // NOTE: This should be unreachable.
1875 ReorderingModes[OpIdx] = ReorderingMode::Failed;
1876 }
1877
1878 // Check that we don't have the same operands. There is no need to reorder if
1879 // the operands are just a perfect or shuffled diamond match. Do not skip
1880 // reordering for possible broadcasts or a non-power-of-2 number of scalars
1881 // (just for now).
1882 auto &&SkipReordering = [this]() {
1883 SmallPtrSet<Value *, 4> UniqueValues;
1884 ArrayRef<OperandData> Op0 = OpsVec.front();
1885 for (const OperandData &Data : Op0)
1886 UniqueValues.insert(Data.V);
1887 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
1888 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
1889 return !UniqueValues.contains(Data.V);
1890 }))
1891 return false;
1892 }
1893 // TODO: Check if we can remove a check for non-power-2 number of
1894 // scalars after full support of non-power-2 vectorization.
1895 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
1896 };
1897
1898 // If the initial strategy fails for any of the operand indexes, then we
1899 // perform reordering again in a second pass. This helps avoid assigning
1900 // high priority to the failed strategy, and should improve reordering for
1901 // the non-failed operand indexes.
1902 for (int Pass = 0; Pass != 2; ++Pass) {
1903 // Check if there is no need to reorder the operands, since they are a
1904 // perfect or shuffled diamond match.
1905 // Need to do it to avoid extra external use cost counting for
1906 // shuffled matches, which may cause regressions.
1907 if (SkipReordering())
1908 break;
1909 // Skip the second pass if the first pass did not fail.
1910 bool StrategyFailed = false;
1911 // Mark all operand data as free to use.
1912 clearUsed();
1913 // We keep the original operand order for the FirstLane, so reorder the
1914 // rest of the lanes. We are visiting the nodes in a circular fashion,
1915 // using FirstLane as the center point and increasing the radius
1916 // distance.
1917 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
1918 for (unsigned I = 0; I < NumOperands; ++I)
1919 MainAltOps[I].push_back(getData(I, FirstLane).V);
1920
1921 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
1922 // Visit the lane on the right and then the lane on the left.
1923 for (int Direction : {+1, -1}) {
1924 int Lane = FirstLane + Direction * Distance;
1925 if (Lane < 0 || Lane >= (int)NumLanes)
1926 continue;
1927 int LastLane = Lane - Direction;
1928           assert(LastLane >= 0 && LastLane < (int)NumLanes &&
1929                  "Out of bounds");
1930 // Look for a good match for each operand.
1931 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1932 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
1933 Optional<unsigned> BestIdx = getBestOperand(
1934 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
1935 // By not selecting a value, we allow the operands that follow to
1936 // select a better matching value. We will get a non-null value in
1937 // the next run of getBestOperand().
1938 if (BestIdx) {
1939 // Swap the current operand with the one returned by
1940 // getBestOperand().
1941 swap(OpIdx, *BestIdx, Lane);
1942 } else {
1943 // We failed to find a best operand, set mode to 'Failed'.
1944 ReorderingModes[OpIdx] = ReorderingMode::Failed;
1945 // Enable the second pass.
1946 StrategyFailed = true;
1947 }
1948 // Try to get the alternate opcode and follow it during analysis.
1949 if (MainAltOps[OpIdx].size() != 2) {
1950 OperandData &AltOp = getData(OpIdx, Lane);
1951 InstructionsState OpS =
1952 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V});
1953 if (OpS.getOpcode() && OpS.isAltShuffle())
1954 MainAltOps[OpIdx].push_back(AltOp.V);
1955 }
1956 }
1957 }
1958 }
1959 // Skip second pass if the strategy did not fail.
1960 if (!StrategyFailed)
1961 break;
1962 }
1963 }
1964
1965#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1966   LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
1967 switch (RMode) {
1968 case ReorderingMode::Load:
1969 return "Load";
1970 case ReorderingMode::Opcode:
1971 return "Opcode";
1972 case ReorderingMode::Constant:
1973 return "Constant";
1974 case ReorderingMode::Splat:
1975 return "Splat";
1976 case ReorderingMode::Failed:
1977 return "Failed";
1978 }
1979     llvm_unreachable("Unimplemented Reordering Type");
1980 }
1981
1982   LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
1983 raw_ostream &OS) {
1984 return OS << getModeStr(RMode);
1985 }
1986
1987 /// Debug print.
1988   LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
1989 printMode(RMode, dbgs());
1990 }
1991
1992 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
1993 return printMode(RMode, OS);
1994 }
1995
1996   LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
1997 const unsigned Indent = 2;
1998 unsigned Cnt = 0;
1999 for (const OperandDataVec &OpDataVec : OpsVec) {
2000 OS << "Operand " << Cnt++ << "\n";
2001 for (const OperandData &OpData : OpDataVec) {
2002 OS.indent(Indent) << "{";
2003 if (Value *V = OpData.V)
2004 OS << *V;
2005 else
2006 OS << "null";
2007 OS << ", APO:" << OpData.APO << "}\n";
2008 }
2009 OS << "\n";
2010 }
2011 return OS;
2012 }
2013
2014 /// Debug print.
2015   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2016#endif
2017 };
2018
2019 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2020 /// for the pair with the highest score, deemed to have the best chance to
2021 /// form the root of a profitable tree to vectorize. Return None if no candidate scored
2022 /// above the LookAheadHeuristics::ScoreFail.
2023 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2024 Optional<int>
2025 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2026 int Limit = LookAheadHeuristics::ScoreFail) {
2027 LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
2028 RootLookAheadMaxDepth);
2029 int BestScore = Limit;
2030 Optional<int> Index;
2031 for (int I : seq<int>(0, Candidates.size())) {
2032 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2033 Candidates[I].second,
2034 /*U1=*/nullptr, /*U2=*/nullptr,
2035 /*Level=*/1, None);
2036 if (Score > BestScore) {
2037 BestScore = Score;
2038 Index = I;
2039 }
2040 }
2041 return Index;
2042 }
2043
2044 /// Checks if the instruction is marked for deletion.
2045 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2046
2047 /// Removes an instruction from its block and eventually deletes it.
2048 /// It's like Instruction::eraseFromParent() except that the actual deletion
2049 /// is delayed until BoUpSLP is destructed.
2050 void eraseInstruction(Instruction *I) {
2051 DeletedInstructions.insert(I);
2052 }
2053
2054 /// Checks if the instruction was already analyzed for being possible
2055 /// reduction root.
2056 bool isAnalyzedReductionRoot(Instruction *I) const {
2057 return AnalyzedReductionsRoots.count(I);
2058 }
2059 /// Register given instruction as already analyzed for being possible
2060 /// reduction root.
2061 void analyzedReductionRoot(Instruction *I) {
2062 AnalyzedReductionsRoots.insert(I);
2063 }
2064 /// Checks if the provided list of reduced values was checked already for
2065 /// vectorization.
2066 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {
2067 return AnalyzedReductionVals.contains(hash_value(VL));
2068 }
2069 /// Adds the list of reduced values to list of already checked values for the
2070 /// vectorization.
2071 void analyzedReductionVals(ArrayRef<Value *> VL) {
2072 AnalyzedReductionVals.insert(hash_value(VL));
2073 }
2074 /// Clear the list of the analyzed reduction root instructions.
2075 void clearReductionData() {
2076 AnalyzedReductionsRoots.clear();
2077 AnalyzedReductionVals.clear();
2078 }
2079 /// Checks if the given value is gathered in one of the nodes.
2080 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2081 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2082 }
2083
2084 ~BoUpSLP();
2085
2086private:
2087 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2088 /// reordering (i.e. the operands can be reordered because they have only one
2089 /// user and are reorderable).
2090 /// \param ReorderableGathers List of all gather nodes that require reordering
2091 /// (e.g., gather of extractelements or partially vectorizable loads).
2092 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2093 /// reordering, subset of \p NonVectorized.
2094 bool
2095 canReorderOperands(TreeEntry *UserTE,
2096 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2097 ArrayRef<TreeEntry *> ReorderableGathers,
2098 SmallVectorImpl<TreeEntry *> &GatherOps);
2099
2100 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2101 /// if any. If it is not vectorized (gather node), returns nullptr.
2102 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2103 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2104 TreeEntry *TE = nullptr;
2105 const auto *It = find_if(VL, [this, &TE](Value *V) {
2106 TE = getTreeEntry(V);
2107 return TE;
2108 });
2109 if (It != VL.end() && TE->isSame(VL))
2110 return TE;
2111 return nullptr;
2112 }
2113
2114 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2115 /// if any. If it is not vectorized (gather node), returns nullptr.
2116 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2117 unsigned OpIdx) const {
2118 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2119 const_cast<TreeEntry *>(UserTE), OpIdx);
2120 }
2121
2122 /// Checks if all users of \p I are the part of the vectorization tree.
2123 bool areAllUsersVectorized(Instruction *I,
2124 ArrayRef<Value *> VectorizedVals) const;
2125
2126 /// Return information about the vector formed for the specified index
2127 /// of a vector of (the same) instruction.
2128 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
2129 unsigned OpIdx);
2130
2131 /// \returns the cost of the vectorizable entry.
2132 InstructionCost getEntryCost(const TreeEntry *E,
2133 ArrayRef<Value *> VectorizedVals);
2134
2135 /// This is the recursive part of buildTree.
2136 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2137 const EdgeInfo &EI);
2138
2139 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2140 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2141 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2142 /// returns false, setting \p CurrentOrder to either an empty vector or a
2143 /// non-identity permutation that allows to reuse extract instructions.
2144 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2145 SmallVectorImpl<unsigned> &CurrentOrder) const;
2146
2147 /// Vectorize a single entry in the tree.
2148 Value *vectorizeTree(TreeEntry *E);
2149
2150 /// Vectorize a single entry in the tree, starting in \p VL.
2151 Value *vectorizeTree(ArrayRef<Value *> VL);
2152
2153 /// Create a new vector from a list of scalar values. Produces a sequence
2154 /// which exploits values reused across lanes, and arranges the inserts
2155 /// for ease of later optimization.
2156 Value *createBuildVector(ArrayRef<Value *> VL);
2157
2158 /// \returns the scalarization cost for this type. Scalarization in this
2159 /// context means the creation of vectors from a group of scalars. If \p
2160 /// NeedToShuffle is true, need to add a cost of reshuffling some of the
2161 /// vector elements.
2162 InstructionCost getGatherCost(FixedVectorType *Ty,
2163 const APInt &ShuffledIndices,
2164 bool NeedToShuffle) const;
2165
2166 /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
2167 /// tree entries.
2168 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2169 /// previous tree entries. \p Mask is filled with the shuffle mask.
2170 Optional<TargetTransformInfo::ShuffleKind>
2171 isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
2172 SmallVectorImpl<const TreeEntry *> &Entries);
2173
2174 /// \returns the scalarization cost for this list of values. Assuming that
2175 /// this subtree gets vectorized, we may need to extract the values from the
2176 /// roots. This method calculates the cost of extracting the values.
2177 InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
2178
2179 /// Set the Builder insert point to one after the last instruction in
2180 /// the bundle
2181 void setInsertPointAfterBundle(const TreeEntry *E);
2182
2183 /// \returns a vector from a collection of scalars in \p VL.
2184 Value *gather(ArrayRef<Value *> VL);
2185
2186 /// \returns whether the VectorizableTree is fully vectorizable and will
2187 /// be beneficial even if the tree height is tiny.
2188 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2189
2190 /// Reorder commutative or alt operands to get better probability of
2191 /// generating vectorized code.
2192 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2193 SmallVectorImpl<Value *> &Left,
2194 SmallVectorImpl<Value *> &Right,
2195 const DataLayout &DL,
2196 ScalarEvolution &SE,
2197 const BoUpSLP &R);
2198
2199 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2200 /// users of \p TE and collects the stores. It returns the map from the store
2201 /// pointers to the collected stores.
2202 DenseMap<Value *, SmallVector<StoreInst *, 4>>
2203 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2204
2205 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2206 /// stores in \p StoresVec can form a vector instruction. If so it returns true
2207 /// and populates \p ReorderIndices with the shuffle indices of the stores
2208 /// when compared to the sorted vector.
2209 bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
2210 OrdersType &ReorderIndices) const;
2211
2212 /// Iterates through the users of \p TE, looking for scalar stores that can be
2213 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2214 /// their order and builds an order index vector for each store bundle. It
2215 /// returns all these order vectors found.
2216 /// We run this after the tree has formed, otherwise we may come across user
2217 /// instructions that are not yet in the tree.
2218 SmallVector<OrdersType, 1>
2219 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2220
2221 struct TreeEntry {
2222 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2223 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2224
2225 /// \returns true if the scalars in VL are equal to this entry.
2226 bool isSame(ArrayRef<Value *> VL) const {
2227 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2228 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2229 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2230 return VL.size() == Mask.size() &&
2231 std::equal(VL.begin(), VL.end(), Mask.begin(),
2232 [Scalars](Value *V, int Idx) {
2233 return (isa<UndefValue>(V) &&
2234 Idx == UndefMaskElem) ||
2235 (Idx != UndefMaskElem && V == Scalars[Idx]);
2236 });
2237 };
2238 if (!ReorderIndices.empty()) {
2239 // TODO: implement matching if the nodes are just reordered, still can
2240 // treat the vector as the same if the list of scalars matches VL
2241 // directly, without reordering.
2242 SmallVector<int> Mask;
2243 inversePermutation(ReorderIndices, Mask);
2244 if (VL.size() == Scalars.size())
2245 return IsSame(Scalars, Mask);
2246 if (VL.size() == ReuseShuffleIndices.size()) {
2247 ::addMask(Mask, ReuseShuffleIndices);
2248 return IsSame(Scalars, Mask);
2249 }
2250 return false;
2251 }
2252 return IsSame(Scalars, ReuseShuffleIndices);
2253 }
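    // Illustrative example (values assumed): with Scalars == {a, b}, empty
    // ReorderIndices and ReuseShuffleIndices == {0, 0, 1, 1}, isSame() returns
    // true for VL == {a, a, b, b}, since each VL element equals
    // Scalars[Mask[I]] for the corresponding reuse index.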
2254
2255 /// \returns true if current entry has same operands as \p TE.
2256 bool hasEqualOperands(const TreeEntry &TE) const {
2257 if (TE.getNumOperands() != getNumOperands())
2258 return false;
2259 SmallBitVector Used(getNumOperands());
2260 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2261 unsigned PrevCount = Used.count();
2262 for (unsigned K = 0; K < E; ++K) {
2263 if (Used.test(K))
2264 continue;
2265 if (getOperand(K) == TE.getOperand(I)) {
2266 Used.set(K);
2267 break;
2268 }
2269 }
2270 // Check if we actually found the matching operand.
2271 if (PrevCount == Used.count())
2272 return false;
2273 }
2274 return true;
2275 }
2276
2277 /// \return Final vectorization factor for the node. Defined by the total
2278 /// number of vectorized scalars, including those used several times in the
2279 /// entry and counted in the \a ReuseShuffleIndices, if any.
2280 unsigned getVectorFactor() const {
2281 if (!ReuseShuffleIndices.empty())
2282 return ReuseShuffleIndices.size();
2283 return Scalars.size();
2284 };
2285
2286 /// A vector of scalars.
2287 ValueList Scalars;
2288
2289 /// The Scalars are vectorized into this value. It is initialized to Null.
2290 Value *VectorizedValue = nullptr;
2291
2292 /// Do we need to gather this sequence or vectorize it
2293 /// (either with vector instruction or with scatter/gather
2294 /// intrinsics for store/load)?
2295 enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
2296 EntryState State;
2297
2298 /// Does this sequence require some shuffling?
2299 SmallVector<int, 4> ReuseShuffleIndices;
2300
2301 /// Does this entry require reordering?
2302 SmallVector<unsigned, 4> ReorderIndices;
2303
2304 /// Points back to the VectorizableTree.
2305 ///
2306 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2307 /// to be a pointer and needs to be able to initialize the child iterator.
2308 /// Thus we need a reference back to the container to translate the indices
2309 /// to entries.
2310 VecTreeTy &Container;
2311
2312 /// The TreeEntry index containing the user of this entry. We can actually
2313 /// have multiple users so the data structure is not truly a tree.
2314 SmallVector<EdgeInfo, 1> UserTreeIndices;
2315
2316 /// The index of this treeEntry in VectorizableTree.
2317 int Idx = -1;
2318
2319 private:
2320 /// The operands of each instruction in each lane Operands[op_index][lane].
2321 /// Note: This helps avoid the replication of the code that performs the
2322 /// reordering of operands during buildTree_rec() and vectorizeTree().
2323 SmallVector<ValueList, 2> Operands;
2324
2325 /// The main/alternate instruction.
2326 Instruction *MainOp = nullptr;
2327 Instruction *AltOp = nullptr;
2328
2329 public:
2330 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2331 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2332 if (Operands.size() < OpIdx + 1)
2333 Operands.resize(OpIdx + 1);
2334     assert(Operands[OpIdx].empty() && "Already resized?");
2335     assert(OpVL.size() <= Scalars.size() &&
2336            "Number of operands is greater than the number of scalars.");
2337 Operands[OpIdx].resize(OpVL.size());
2338 copy(OpVL, Operands[OpIdx].begin());
2339 }
2340
2341 /// Set the operands of this bundle in their original order.
2342 void setOperandsInOrder() {
2343     assert(Operands.empty() && "Already initialized?");
2344 auto *I0 = cast<Instruction>(Scalars[0]);
2345 Operands.resize(I0->getNumOperands());
2346 unsigned NumLanes = Scalars.size();
2347 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2348 OpIdx != NumOperands; ++OpIdx) {
2349 Operands[OpIdx].resize(NumLanes);
2350 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2351 auto *I = cast<Instruction>(Scalars[Lane]);
2352         assert(I->getNumOperands() == NumOperands &&
2353                "Expected same number of operands");
2354 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2355 }
2356 }
2357 }
2358
2359 /// Reorders operands of the node to the given mask \p Mask.
2360 void reorderOperands(ArrayRef<int> Mask) {
2361 for (ValueList &Operand : Operands)
2362 reorderScalars(Operand, Mask);
2363 }
2364
2365 /// \returns the \p OpIdx operand of this TreeEntry.
2366 ValueList &getOperand(unsigned OpIdx) {
2367     assert(OpIdx < Operands.size() && "Off bounds");
2368 return Operands[OpIdx];
2369 }
2370
2371 /// \returns the \p OpIdx operand of this TreeEntry.
2372 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2373     assert(OpIdx < Operands.size() && "Off bounds");
2374 return Operands[OpIdx];
2375 }
2376
2377 /// \returns the number of operands.
2378 unsigned getNumOperands() const { return Operands.size(); }
2379
2380 /// \return the single \p OpIdx operand.
2381 Value *getSingleOperand(unsigned OpIdx) const {
2382     assert(OpIdx < Operands.size() && "Off bounds");
2383     assert(!Operands[OpIdx].empty() && "No operand available");
2384 return Operands[OpIdx][0];
2385 }
2386
2387 /// Some of the instructions in the list have alternate opcodes.
2388 bool isAltShuffle() const { return MainOp != AltOp; }
2389
2390 bool isOpcodeOrAlt(Instruction *I) const {
2391 unsigned CheckedOpcode = I->getOpcode();
2392 return (getOpcode() == CheckedOpcode ||
2393 getAltOpcode() == CheckedOpcode);
2394 }
2395
2396 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2397 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2398 /// \p OpValue.
2399 Value *isOneOf(Value *Op) const {
2400 auto *I = dyn_cast<Instruction>(Op);
2401 if (I && isOpcodeOrAlt(I))
2402 return Op;
2403 return MainOp;
2404 }
2405
2406 void setOperations(const InstructionsState &S) {
2407 MainOp = S.MainOp;
2408 AltOp = S.AltOp;
2409 }
2410
2411 Instruction *getMainOp() const {
2412 return MainOp;
2413 }
2414
2415 Instruction *getAltOp() const {
2416 return AltOp;
2417 }
2418
2419 /// The main/alternate opcodes for the list of instructions.
2420 unsigned getOpcode() const {
2421 return MainOp ? MainOp->getOpcode() : 0;
2422 }
2423
2424 unsigned getAltOpcode() const {
2425 return AltOp ? AltOp->getOpcode() : 0;
2426 }
2427
2428 /// When the reorder/reuse shuffle indices are empty it just returns the
2429 /// position of \p V within the vector of Scalars. Otherwise, tries to remap \p V via those indices.
2430 int findLaneForValue(Value *V) const {
2431 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2432 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2433 if (!ReorderIndices.empty())
2434 FoundLane = ReorderIndices[FoundLane];
2435 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2436 if (!ReuseShuffleIndices.empty()) {
2437 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2438 find(ReuseShuffleIndices, FoundLane));
2439 }
2440 return FoundLane;
2441 }
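    // Worked example for findLaneForValue above (hypothetical values): with
    // Scalars == {%a, %b, %c, %d} and ReorderIndices == {2, 0, 3, 1},
    // findLaneForValue(%b) first finds position 1 in Scalars and remaps it to
    // ReorderIndices[1] == 0; if ReuseShuffleIndices were {0, 1, 0, 1}, the
    // final result would be the first position holding 0, i.e. lane 0.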
2442
2443#ifndef NDEBUG
2444 /// Debug printer.
2445 LLVM_DUMP_METHOD void dump() const {
2446 dbgs() << Idx << ".\n";
2447 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2448 dbgs() << "Operand " << OpI << ":\n";
2449 for (const Value *V : Operands[OpI])
2450 dbgs().indent(2) << *V << "\n";
2451 }
2452 dbgs() << "Scalars: \n";
2453 for (Value *V : Scalars)
2454 dbgs().indent(2) << *V << "\n";
2455 dbgs() << "State: ";
2456 switch (State) {
2457 case Vectorize:
2458 dbgs() << "Vectorize\n";
2459 break;
2460 case ScatterVectorize:
2461 dbgs() << "ScatterVectorize\n";
2462 break;
2463 case NeedToGather:
2464 dbgs() << "NeedToGather\n";
2465 break;
2466 }
2467 dbgs() << "MainOp: ";
2468 if (MainOp)
2469 dbgs() << *MainOp << "\n";
2470 else
2471 dbgs() << "NULL\n";
2472 dbgs() << "AltOp: ";
2473 if (AltOp)
2474 dbgs() << *AltOp << "\n";
2475 else
2476 dbgs() << "NULL\n";
2477 dbgs() << "VectorizedValue: ";
2478 if (VectorizedValue)
2479 dbgs() << *VectorizedValue << "\n";
2480 else
2481 dbgs() << "NULL\n";
2482 dbgs() << "ReuseShuffleIndices: ";
2483 if (ReuseShuffleIndices.empty())
2484 dbgs() << "Empty";
2485 else
2486 for (int ReuseIdx : ReuseShuffleIndices)
2487 dbgs() << ReuseIdx << ", ";
2488 dbgs() << "\n";
2489 dbgs() << "ReorderIndices: ";
2490 for (unsigned ReorderIdx : ReorderIndices)
2491 dbgs() << ReorderIdx << ", ";
2492 dbgs() << "\n";
2493 dbgs() << "UserTreeIndices: ";
2494 for (const auto &EInfo : UserTreeIndices)
2495 dbgs() << EInfo << ", ";
2496 dbgs() << "\n";
2497 }
2498#endif
2499 };
2500
2501#ifndef NDEBUG
2502 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2503 InstructionCost VecCost,
2504 InstructionCost ScalarCost) const {
2505 dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
2506 dbgs() << "SLP: Costs:\n";
2507 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2508 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2509 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2510 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
2511 ReuseShuffleCost + VecCost - ScalarCost << "\n";
2512 }
2513#endif
2514
2515 /// Create a new VectorizableTree entry.
2516 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
2517 const InstructionsState &S,
2518 const EdgeInfo &UserTreeIdx,
2519 ArrayRef<int> ReuseShuffleIndices = None,
2520 ArrayRef<unsigned> ReorderIndices = None) {
2521 TreeEntry::EntryState EntryState =
2522 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2523 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2524 ReuseShuffleIndices, ReorderIndices);
2525 }
2526
2527 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2528 TreeEntry::EntryState EntryState,
2529 Optional<ScheduleData *> Bundle,
2530 const InstructionsState &S,
2531 const EdgeInfo &UserTreeIdx,
2532 ArrayRef<int> ReuseShuffleIndices = None,
2533 ArrayRef<unsigned> ReorderIndices = None) {
2534 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2535 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2536 "Need to vectorize gather entry?");
2537 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2538 TreeEntry *Last = VectorizableTree.back().get();
2539 Last->Idx = VectorizableTree.size() - 1;
2540 Last->State = EntryState;
2541 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2542 ReuseShuffleIndices.end());
2543 if (ReorderIndices.empty()) {
2544 Last->Scalars.assign(VL.begin(), VL.end());
2545 Last->setOperations(S);
2546 } else {
2547 // Reorder scalars and build final mask.
2548 Last->Scalars.assign(VL.size(), nullptr);
2549 transform(ReorderIndices, Last->Scalars.begin(),
2550 [VL](unsigned Idx) -> Value * {
2551 if (Idx >= VL.size())
2552 return UndefValue::get(VL.front()->getType());
2553 return VL[Idx];
2554 });
2555 InstructionsState S = getSameOpcode(Last->Scalars);
2556 Last->setOperations(S);
2557 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2558 }
2559 if (Last->State != TreeEntry::NeedToGather) {
2560 for (Value *V : VL) {
2561 assert(!getTreeEntry(V) && "Scalar already in tree!");
2562 ScalarToTreeEntry[V] = Last;
2563 }
2564 // Update the scheduler bundle to point to this TreeEntry.
2565 ScheduleData *BundleMember = *Bundle;
2566 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2567 isVectorLikeInstWithConstOps(S.MainOp) ||
2568 doesNotNeedToSchedule(VL)) &&
2569 "Bundle and VL out of sync");
2570 if (BundleMember) {
2571 for (Value *V : VL) {
2572 if (doesNotNeedToBeScheduled(V))
2573 continue;
2574 assert(BundleMember && "Unexpected end of bundle.");
2575 BundleMember->TE = Last;
2576 BundleMember = BundleMember->NextInBundle;
2577 }
2578 }
2579 assert(!BundleMember && "Bundle and VL out of sync");
2580 } else {
2581 MustGather.insert(VL.begin(), VL.end());
2582 }
2583
2584 if (UserTreeIdx.UserTE)
2585 Last->UserTreeIndices.push_back(UserTreeIdx);
2586
2587 return Last;
2588 }
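  // Worked example for the ReorderIndices path above (hypothetical values):
  // with VL == {%a, %b, %c, %d} and ReorderIndices == {1, 0, 3, 2},
  // Last->Scalars becomes {%b, %a, %d, %c}; any reorder index that is >=
  // VL.size() would instead produce an UndefValue lane.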
2589
2590 /// -- Vectorization State --
2591 /// Holds all of the tree entries.
2592 TreeEntry::VecTreeTy VectorizableTree;
2593
2594#ifndef NDEBUG
2595 /// Debug printer.
2596 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2597 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2598 VectorizableTree[Id]->dump();
2599 dbgs() << "\n";
2600 }
2601 }
2602#endif
2603
2604 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2605
2606 const TreeEntry *getTreeEntry(Value *V) const {
2607 return ScalarToTreeEntry.lookup(V);
2608 }
2609
2610 /// Maps a specific scalar to its tree entry.
2611 SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
2612
2613 /// Maps a value to the proposed vectorizable size.
2614 SmallDenseMap<Value *, unsigned> InstrElementSize;
2615
2616 /// A list of scalars that we found that we need to keep as scalars.
2617 ValueSet MustGather;
2618
2619 /// This POD struct describes one external user in the vectorized tree.
2620 struct ExternalUser {
2621 ExternalUser(Value *S, llvm::User *U, int L)
2622 : Scalar(S), User(U), Lane(L) {}
2623
2624 // Which scalar in our function.
2625 Value *Scalar;
2626
2627 // Which user that uses the scalar.
2628 llvm::User *User;
2629
2630 // Which lane does the scalar belong to.
2631 int Lane;
2632 };
2633 using UserList = SmallVector<ExternalUser, 16>;
2634
2635 /// Checks if two instructions may access the same memory.
2636 ///
2637 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2638 /// is invariant in the calling loop.
2639 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2640 Instruction *Inst2) {
2641 // First check if the result is already in the cache.
2642 AliasCacheKey key = std::make_pair(Inst1, Inst2);
2643 Optional<bool> &result = AliasCache[key];
2644 if (result) {
2645 return result.value();
2646 }
2647 bool aliased = true;
2648 if (Loc1.Ptr && isSimple(Inst1))
2649 aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
2650 // Store the result in the cache.
2651 result = aliased;
2652 return aliased;
2653 }
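  // Note on the cache above (illustrative): the first query for a given
  // (Inst1, Inst2) pair consults BatchAA and records the answer in AliasCache;
  // any later query with the same pair returns the cached value without
  // re-querying alias analysis.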
2654
2655 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2656
2657 /// Cache for alias results.
2658 /// TODO: consider moving this to the AliasAnalysis itself.
2659 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
2660
2661 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
2662 // globally through SLP because we don't perform any action which
2663 // invalidates capture results.
2664 BatchAAResults BatchAA;
2665
2666 /// Temporary store for deleted instructions. Instructions will be deleted
2667 /// eventually when the BoUpSLP is destructed. The deferral is required to
2668 /// ensure that there are no incorrect collisions in the AliasCache, which
2669 /// can happen if a new instruction is allocated at the same address as a
2670 /// previously deleted instruction.
2671 DenseSet<Instruction *> DeletedInstructions;
2672
2673 /// Set of the instructions already being analyzed for reductions.
2674 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
2675
2676 /// Set of hashes for the list of reduction values already being analyzed.
2677 DenseSet<size_t> AnalyzedReductionVals;
2678
2679 /// A list of values that need to be extracted out of the tree.
2680 /// This list holds pairs of (Internal Scalar : External User). External User
2681 /// can be nullptr, it means that this Internal Scalar will be used later,
2682 /// after vectorization.
2683 UserList ExternalUses;
2684
2685 /// Values used only by @llvm.assume calls.
2686 SmallPtrSet<const Value *, 32> EphValues;
2687
2688 /// Holds all of the instructions that we gathered.
2689 SetVector<Instruction *> GatherShuffleSeq;
2690
2691 /// A list of blocks that we are going to CSE.
2692 SetVector<BasicBlock *> CSEBlocks;
2693
2694 /// Contains all scheduling relevant data for an instruction.
2695 /// A ScheduleData either represents a single instruction or a member of an
2696 /// instruction bundle (= a group of instructions which is combined into a
2697 /// vector instruction).
2698 struct ScheduleData {
2699 // The initial value for the dependency counters. It means that the
2700 // dependencies are not calculated yet.
2701 enum { InvalidDeps = -1 };
2702
2703 ScheduleData() = default;
2704
2705 void init(int BlockSchedulingRegionID, Value *OpVal) {
2706 FirstInBundle = this;
2707 NextInBundle = nullptr;
2708 NextLoadStore = nullptr;
2709 IsScheduled = false;
2710 SchedulingRegionID = BlockSchedulingRegionID;
2711 clearDependencies();
2712 OpValue = OpVal;
2713 TE = nullptr;
2714 }
2715
2716 /// Verify basic self consistency properties
2717 void verify() {
2718 if (hasValidDependencies()) {
2719 assert(UnscheduledDeps <= Dependencies && "invariant");
2720 } else {
2721 assert(UnscheduledDeps == Dependencies && "invariant");
2722 }
2723
2724 if (IsScheduled) {
2725 assert(isSchedulingEntity() &&
2726 "unexpected scheduled state");
2727 for (const ScheduleData *BundleMember = this; BundleMember;
2728 BundleMember = BundleMember->NextInBundle) {
2729 assert(BundleMember->hasValidDependencies() &&
2730 BundleMember->UnscheduledDeps == 0 &&
2731 "unexpected scheduled state");
2732 assert((BundleMember == this || !BundleMember->IsScheduled) &&
2733 "only bundle is marked scheduled");
2734 }
2735 }
2736
2737 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
2738 "all bundle members must be in same basic block");
2739 }
2740
2741 /// Returns true if the dependency information has been calculated.
2742 /// Note that dependency validity can vary between instructions within
2743 /// a single bundle.
2744 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
2745
2746 /// Returns true for single instructions and for bundle representatives
2747 /// (= the head of a bundle).
2748 bool isSchedulingEntity() const { return FirstInBundle == this; }
2749
2750 /// Returns true if it represents an instruction bundle and not only a
2751 /// single instruction.
2752 bool isPartOfBundle() const {
2753 return NextInBundle != nullptr || FirstInBundle != this || TE;
2754 }
2755
2756 /// Returns true if it is ready for scheduling, i.e. it has no more
2757 /// unscheduled depending instructions/bundles.
2758 bool isReady() const {
2759 assert(isSchedulingEntity() &&
2760 "can't consider non-scheduling entity for ready list");
2761 return unscheduledDepsInBundle() == 0 && !IsScheduled;
2762 }
2763
2764 /// Modifies the number of unscheduled dependencies for this instruction,
2765 /// and returns the number of remaining dependencies for the containing
2766 /// bundle.
2767 int incrementUnscheduledDeps(int Incr) {
2768 assert(hasValidDependencies() &&
2769 "increment of unscheduled deps would be meaningless");
2770 UnscheduledDeps += Incr;
2771 return FirstInBundle->unscheduledDepsInBundle();
2772 }
2773
2774 /// Sets the number of unscheduled dependencies to the number of
2775 /// dependencies.
2776 void resetUnscheduledDeps() {
2777 UnscheduledDeps = Dependencies;
2778 }
2779
2780 /// Clears all dependency information.
2781 void clearDependencies() {
2782 Dependencies = InvalidDeps;
2783 resetUnscheduledDeps();
2784 MemoryDependencies.clear();
2785 ControlDependencies.clear();
2786 }
2787
2788 int unscheduledDepsInBundle() const {
2789 assert(isSchedulingEntity() && "only meaningful on the bundle");
2790 int Sum = 0;
2791 for (const ScheduleData *BundleMember = this; BundleMember;
2792 BundleMember = BundleMember->NextInBundle) {
2793 if (BundleMember->UnscheduledDeps == InvalidDeps)
2794 return InvalidDeps;
2795 Sum += BundleMember->UnscheduledDeps;
2796 }
2797 return Sum;
2798 }
2799
2800 void dump(raw_ostream &os) const {
2801 if (!isSchedulingEntity()) {
2802 os << "/ " << *Inst;
2803 } else if (NextInBundle) {
2804 os << '[' << *Inst;
2805 ScheduleData *SD = NextInBundle;
2806 while (SD) {
2807 os << ';' << *SD->Inst;
2808 SD = SD->NextInBundle;
2809 }
2810 os << ']';
2811 } else {
2812 os << *Inst;
2813 }
2814 }
2815
2816 Instruction *Inst = nullptr;
2817
2818 /// Opcode of the current instruction in the schedule data.
2819 Value *OpValue = nullptr;
2820
2821 /// The TreeEntry that this instruction corresponds to.
2822 TreeEntry *TE = nullptr;
2823
2824 /// Points to the head in an instruction bundle (and always to this for
2825 /// single instructions).
2826 ScheduleData *FirstInBundle = nullptr;
2827
2828 /// Singly linked list of all instructions in a bundle. Null if it is a
2829 /// single instruction.
2830 ScheduleData *NextInBundle = nullptr;
2831
2832 /// Singly linked list of all memory instructions (e.g. load, store, call)
2833 /// in the block - until the end of the scheduling region.
2834 ScheduleData *NextLoadStore = nullptr;
2835
2836 /// The dependent memory instructions.
2837 /// This list is derived on demand in calculateDependencies().
2838 SmallVector<ScheduleData *, 4> MemoryDependencies;
2839
2840 /// List of instructions which this instruction could be control dependent
2841 /// on. Allowing such nodes to be scheduled below this one could introduce
2842 /// a runtime fault which didn't exist in the original program.
2843 /// ex: this is a load or udiv following a readonly call which inf loops
2844 SmallVector<ScheduleData *, 4> ControlDependencies;
2845
2846 /// This ScheduleData is in the current scheduling region if this matches
2847 /// the current SchedulingRegionID of BlockScheduling.
2848 int SchedulingRegionID = 0;
2849
2850 /// Used for getting a "good" final ordering of instructions.
2851 int SchedulingPriority = 0;
2852
2853 /// The number of dependencies. It consists of the number of users of the
2854 /// instruction plus the number of dependent memory instructions (if any).
2855 /// This value is calculated on demand.
2856 /// If InvalidDeps, the number of dependencies is not calculated yet.
2857 int Dependencies = InvalidDeps;
2858
2859 /// The number of dependencies minus the number of dependencies of scheduled
2860 /// instructions. As soon as this is zero, the instruction/bundle gets ready
2861 /// for scheduling.
2862 /// Note that this is negative as long as Dependencies is not calculated.
2863 int UnscheduledDeps = InvalidDeps;
2864
2865 /// True if this instruction is scheduled (or considered as scheduled in the
2866 /// dry-run).
2867 bool IsScheduled = false;
2868 };
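  // Worked example for the dependency counters above (hypothetical values):
  // an instruction with two in-region users and one dependent memory
  // instruction gets Dependencies == 3; UnscheduledDeps starts at 3, and for a
  // single-instruction bundle with valid dependencies isReady() returns true
  // only after incrementUnscheduledDeps(-1) has been applied three times.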
2869
2870#ifndef NDEBUG
2871 friend inline raw_ostream &operator<<(raw_ostream &os,
2872 const BoUpSLP::ScheduleData &SD) {
2873 SD.dump(os);
2874 return os;
2875 }
2876#endif
2877
2878 friend struct GraphTraits<BoUpSLP *>;
2879 friend struct DOTGraphTraits<BoUpSLP *>;
2880
2881 /// Contains all scheduling data for a basic block.
2882 /// It does not schedule instructions that are not memory reads/writes and
2883 /// whose operands are either constants, or arguments, or phis, or
2884 /// instructions from other blocks, or whose users are phis or live in
2885 /// other blocks. The resulting vector instructions can be placed at the
2886 /// beginning of the basic block without scheduling (if the operands do not
2887 /// need to be scheduled) or at the end of the block (if the users are
2888 /// outside of the block). This saves some compile time and memory used by
2889 /// the compiler.
2890 /// ScheduleData is assigned to each instruction between the boundaries of
2891 /// the tree entry, even to those that are not part of the graph. It is
2892 /// required to correctly track the dependencies between the instructions
2893 /// and to schedule them correctly. ScheduleData is not allocated for
2894 /// instructions that do not require scheduling, such as phis, nodes with
2895 /// only extractelements/insertelements, or nodes whose instructions have
2896 /// uses/operands outside of the block.
2897 struct BlockScheduling {
2898 BlockScheduling(BasicBlock *BB)
2899 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
2900
2901 void clear() {
2902 ReadyInsts.clear();
2903 ScheduleStart = nullptr;
2904 ScheduleEnd = nullptr;
2905 FirstLoadStoreInRegion = nullptr;
2906 LastLoadStoreInRegion = nullptr;
2907 RegionHasStackSave = false;
2908
2909 // Reduce the maximum schedule region size by the size of the
2910 // previous scheduling run.
2911 ScheduleRegionSizeLimit -= ScheduleRegionSize;
2912 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
2913 ScheduleRegionSizeLimit = MinScheduleRegionSize;
2914 ScheduleRegionSize = 0;
2915
2916 // Make a new scheduling region, i.e. all existing ScheduleData is not
2917 // in the new region yet.
2918 ++SchedulingRegionID;
2919 }
2920
2921 ScheduleData *getScheduleData(Instruction *I) {
2922 if (BB != I->getParent())
2923 // Avoid lookup if can't possibly be in map.
2924 return nullptr;
2925 ScheduleData *SD = ScheduleDataMap.lookup(I);
2926 if (SD && isInSchedulingRegion(SD))
2927 return SD;
2928 return nullptr;
2929 }
2930
2931 ScheduleData *getScheduleData(Value *V) {
2932 if (auto *I = dyn_cast<Instruction>(V))
2933 return getScheduleData(I);
2934 return nullptr;
2935 }
2936
2937 ScheduleData *getScheduleData(Value *V, Value *Key) {
2938 if (V == Key)
2939 return getScheduleData(V);
2940 auto I = ExtraScheduleDataMap.find(V);
2941 if (I != ExtraScheduleDataMap.end()) {
2942 ScheduleData *SD = I->second.lookup(Key);
2943 if (SD && isInSchedulingRegion(SD))
2944 return SD;
2945 }
2946 return nullptr;
2947 }
2948
2949 bool isInSchedulingRegion(ScheduleData *SD) const {
2950 return SD->SchedulingRegionID == SchedulingRegionID;
2951 }
2952
2953 /// Marks an instruction as scheduled and puts all dependent ready
2954 /// instructions into the ready-list.
2955 template <typename ReadyListType>
2956 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
2957 SD->IsScheduled = true;
2958 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
2959
2960 for (ScheduleData *BundleMember = SD; BundleMember;
2961 BundleMember = BundleMember->NextInBundle) {
2962 if (BundleMember->Inst != BundleMember->OpValue)
2963 continue;
2964
2965 // Handle the def-use chain dependencies.
2966
2967 // Decrement the unscheduled counter and insert to ready list if ready.
2968 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
2969 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
2970 if (OpDef && OpDef->hasValidDependencies() &&
2971 OpDef->incrementUnscheduledDeps(-1) == 0) {
2972 // There are no more unscheduled dependencies after
2973 // decrementing, so we can put the dependent instruction
2974 // into the ready list.
2975 ScheduleData *DepBundle = OpDef->FirstInBundle;
2976 assert(!DepBundle->IsScheduled &&
2977 "already scheduled bundle gets ready");
2978 ReadyList.insert(DepBundle);
2979 LLVM_DEBUG(dbgs()
2980 << "SLP: gets ready (def): " << *DepBundle << "\n");
2981 }
2982 });
2983 };
2984
2985 // If BundleMember is a vector bundle, its operands may have been
2986 // reordered during buildTree(). We therefore need to get its operands
2987 // through the TreeEntry.
2988 if (TreeEntry *TE = BundleMember->TE) {
2989 // Need to search for the lane since the tree entry can be reordered.
2990 int Lane = std::distance(TE->Scalars.begin(),
2991 find(TE->Scalars, BundleMember->Inst));
2992 assert(Lane >= 0 && "Lane not set");
2993
2994 // Since vectorization tree is being built recursively this assertion
2995 // ensures that the tree entry has all operands set before reaching
2996 // this code. A couple of known exceptions at the moment are extracts,
2997 // where the second (immediate) operand is not added. Since
2998 // immediates do not affect scheduler behavior this is considered
2999 // okay.
3000 auto *In = BundleMember->Inst;
3001 assert(In &&
3002 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3003 In->getNumOperands() == TE->getNumOperands()) &&
3004 "Missed TreeEntry operands?");
3005 (void)In; // fake use to avoid build failure when assertions disabled
3006
3007 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3008 OpIdx != NumOperands; ++OpIdx)
3009 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3010 DecrUnsched(I);
3011 } else {
3012 // If BundleMember is a stand-alone instruction, no operand reordering
3013 // has taken place, so we directly access its operands.
3014 for (Use &U : BundleMember->Inst->operands())
3015 if (auto *I = dyn_cast<Instruction>(U.get()))
3016 DecrUnsched(I);
3017 }
3018 // Handle the memory dependencies.
3019 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3020 if (MemoryDepSD->hasValidDependencies() &&
3021 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3022 // There are no more unscheduled dependencies after decrementing,
3023 // so we can put the dependent instruction into the ready list.
3024 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3025 assert(!DepBundle->IsScheduled &&
3026 "already scheduled bundle gets ready");
3027 ReadyList.insert(DepBundle);
3028 LLVM_DEBUG(dbgs()
3029 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3030 }
3031 }
3032 // Handle the control dependencies.
3033 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3034 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3035 // There are no more unscheduled dependencies after decrementing,
3036 // so we can put the dependent instruction into the ready list.
3037 ScheduleData *DepBundle = DepSD->FirstInBundle;
3038 assert(!DepBundle->IsScheduled &&
3039 "already scheduled bundle gets ready");
3040 ReadyList.insert(DepBundle);
3041 LLVM_DEBUG(dbgs()
3042 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3043 }
3044 }
3045
3046 }
3047 }
3048
3049 /// Verify basic self consistency properties of the data structure.
3050 void verify() {
3051 if (!ScheduleStart)
3052 return;
3053
3054 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3055 ScheduleStart->comesBefore(ScheduleEnd) &&
3056 "Not a valid scheduling region?");
3057
3058 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3059 auto *SD = getScheduleData(I);
3060 if (!SD)
3061 continue;
3062 assert(isInSchedulingRegion(SD) &&
3063 "primary schedule data not in window?");
3064 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3065 "entire bundle in window!");
3066 (void)SD;
3067 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3068 }
3069
3070 for (auto *SD : ReadyInsts) {
3071 assert(SD->isSchedulingEntity() && SD->isReady() &&
3072 "item in ready list not ready?");
3073 (void)SD;
3074 }
3075 }
3076
3077 void doForAllOpcodes(Value *V,
3078 function_ref<void(ScheduleData *SD)> Action) {
3079 if (ScheduleData *SD = getScheduleData(V))
3080 Action(SD);
3081 auto I = ExtraScheduleDataMap.find(V);
3082 if (I != ExtraScheduleDataMap.end())
3083 for (auto &P : I->second)
3084 if (isInSchedulingRegion(P.second))
3085 Action(P.second);
3086 }
3087
3088 /// Put all instructions into the ReadyList which are ready for scheduling.
3089 template <typename ReadyListType>
3090 void initialFillReadyList(ReadyListType &ReadyList) {
3091 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3092 doForAllOpcodes(I, [&](ScheduleData *SD) {
3093 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3094 SD->isReady()) {
3095 ReadyList.insert(SD);
3096 LLVM_DEBUG(dbgs()
3097 << "SLP: initially in ready list: " << *SD << "\n");
3098 }
3099 });
3100 }
3101 }
3102
3103 /// Build a bundle from the ScheduleData nodes corresponding to the
3104 /// scalar instruction for each lane.
3105 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3106
3107 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3108 /// cyclic dependencies. This is only a dry-run, no instructions are
3109 /// actually moved at this stage.
3110 /// \returns the scheduling bundle. The returned Optional value is non-None
3111 /// if \p VL is allowed to be scheduled.
3112 Optional<ScheduleData *>
3113 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3114 const InstructionsState &S);
3115
3116 /// Un-bundles a group of instructions.
3117 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3118
3119 /// Allocates schedule data chunk.
3120 ScheduleData *allocateScheduleDataChunks();
3121
3122 /// Extends the scheduling region so that V is inside the region.
3123 /// \returns true if the region size is within the limit.
3124 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3125
3126 /// Initialize the ScheduleData structures for new instructions in the
3127 /// scheduling region.
3128 void initScheduleData(Instruction *FromI, Instruction *ToI,
3129 ScheduleData *PrevLoadStore,
3130 ScheduleData *NextLoadStore);
3131
3132 /// Updates the dependency information of a bundle and of all instructions/
3133 /// bundles which depend on the original bundle.
3134 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3135 BoUpSLP *SLP);
3136
3137 /// Sets all instructions in the scheduling region to un-scheduled.
3138 void resetSchedule();
3139
3140 BasicBlock *BB;
3141
3142 /// Simple memory allocation for ScheduleData.
3143 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3144
3145 /// The size of a ScheduleData array in ScheduleDataChunks.
3146 int ChunkSize;
3147
3148 /// The allocator position in the current chunk, which is the last entry
3149 /// of ScheduleDataChunks.
3150 int ChunkPos;
3151
3152 /// Attaches ScheduleData to Instruction.
3153 /// Note that the mapping survives during all vectorization iterations, i.e.
3154 /// ScheduleData structures are recycled.
3155 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3156
3157 /// Attaches ScheduleData to Instruction with the leading key.
3158 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3159 ExtraScheduleDataMap;
3160
3161 /// The ready-list for scheduling (only used for the dry-run).
3162 SetVector<ScheduleData *> ReadyInsts;
3163
3164 /// The first instruction of the scheduling region.
3165 Instruction *ScheduleStart = nullptr;
3166
3167 /// The first instruction _after_ the scheduling region.
3168 Instruction *ScheduleEnd = nullptr;
3169
3170 /// The first memory accessing instruction in the scheduling region
3171 /// (can be null).
3172 ScheduleData *FirstLoadStoreInRegion = nullptr;
3173
3174 /// The last memory accessing instruction in the scheduling region
3175 /// (can be null).
3176 ScheduleData *LastLoadStoreInRegion = nullptr;
3177
3178 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3179 /// region? Used to optimize the dependence calculation for the
3180 /// common case where there isn't.
3181 bool RegionHasStackSave = false;
3182
3183 /// The current size of the scheduling region.
3184 int ScheduleRegionSize = 0;
3185
3186 /// The maximum size allowed for the scheduling region.
3187 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3188
3189 /// The ID of the scheduling region. For a new vectorization iteration this
3190 /// is incremented which "removes" all ScheduleData from the region.
3191 /// Make sure that the initial SchedulingRegionID is greater than the
3192 /// initial SchedulingRegionID in ScheduleData (which is 0).
3193 int SchedulingRegionID = 1;
3194 };
3195
3196 /// Attaches the BlockScheduling structures to basic blocks.
3197 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3198
3199 /// Performs the "real" scheduling. Done before vectorization is actually
3200 /// performed in a basic block.
3201 void scheduleBlock(BlockScheduling *BS);
3202
3203 /// List of users to ignore during scheduling and that don't need extracting.
3204 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3205
3206 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3207 /// sorted SmallVectors of unsigned.
3208 struct OrdersTypeDenseMapInfo {
3209 static OrdersType getEmptyKey() {
3210 OrdersType V;
3211 V.push_back(~1U);
3212 return V;
3213 }
3214
3215 static OrdersType getTombstoneKey() {
3216 OrdersType V;
3217 V.push_back(~2U);
3218 return V;
3219 }
3220
3221 static unsigned getHashValue(const OrdersType &V) {
3222 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3223 }
3224
3225 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3226 return LHS == RHS;
3227 }
3228 };
3229
3230 // Analysis and block reference.
3231 Function *F;
3232 ScalarEvolution *SE;
3233 TargetTransformInfo *TTI;
3234 TargetLibraryInfo *TLI;
3235 LoopInfo *LI;
3236 DominatorTree *DT;
3237 AssumptionCache *AC;
3238 DemandedBits *DB;
3239 const DataLayout *DL;
3240 OptimizationRemarkEmitter *ORE;
3241
3242 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3243 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3244
3245 /// Instruction builder to construct the vectorized tree.
3246 IRBuilder<> Builder;
3247
3248 /// A map of scalar integer values to the smallest bit width with which they
3249 /// can legally be represented. The values map to (width, signed) pairs,
3250 /// where "width" indicates the minimum bit width and "signed" is True if the
3251 /// value must be signed-extended, rather than zero-extended, back to its
3252 /// original width.
3253 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
3254};
3255
3256} // end namespace slpvectorizer
3257
3258template <> struct GraphTraits<BoUpSLP *> {
3259 using TreeEntry = BoUpSLP::TreeEntry;
3260
3261 /// NodeRef has to be a pointer per the GraphWriter.
3262 using NodeRef = TreeEntry *;
3263
3264 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3265
3266 /// Add the VectorizableTree to the index iterator to be able to return
3267 /// TreeEntry pointers.
3268 struct ChildIteratorType
3269 : public iterator_adaptor_base<
3270 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3271 ContainerTy &VectorizableTree;
3272
3273 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3274 ContainerTy &VT)
3275 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3276
3277 NodeRef operator*() { return I->UserTE; }
3278 };
3279
3280 static NodeRef getEntryNode(BoUpSLP &R) {
3281 return R.VectorizableTree[0].get();
3282 }
3283
3284 static ChildIteratorType child_begin(NodeRef N) {
3285 return {N->UserTreeIndices.begin(), N->Container};
3286 }
3287
3288 static ChildIteratorType child_end(NodeRef N) {
3289 return {N->UserTreeIndices.end(), N->Container};
3290 }
3291
3292 /// For the node iterator we just need to turn the TreeEntry iterator into a
3293 /// TreeEntry* iterator so that it dereferences to NodeRef.
3294 class nodes_iterator {
3295 using ItTy = ContainerTy::iterator;
3296 ItTy It;
3297
3298 public:
3299 nodes_iterator(const ItTy &It2) : It(It2) {}
3300 NodeRef operator*() { return It->get(); }
3301 nodes_iterator operator++() {
3302 ++It;
3303 return *this;
3304 }
3305 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3306 };
3307
3308 static nodes_iterator nodes_begin(BoUpSLP *R) {
3309 return nodes_iterator(R->VectorizableTree.begin());
3310 }
3311
3312 static nodes_iterator nodes_end(BoUpSLP *R) {
3313 return nodes_iterator(R->VectorizableTree.end());
3314 }
3315
3316 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3317};
3318
3319template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3320 using TreeEntry = BoUpSLP::TreeEntry;
3321
3322 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
3323
3324 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3325 std::string Str;
3326 raw_string_ostream OS(Str);
3327 if (isSplat(Entry->Scalars))
3328 OS << "<splat> ";
3329 for (auto *V : Entry->Scalars) {
3330 OS << *V;
3331 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3332 return EU.Scalar == V;
3333 }))
3334 OS << " <extract>";
3335 OS << "\n";
3336 }
3337 return Str;
3338 }
3339
3340 static std::string getNodeAttributes(const TreeEntry *Entry,
3341 const BoUpSLP *) {
3342 if (Entry->State == TreeEntry::NeedToGather)
3343 return "color=red";
3344 return "";
3345 }
3346};
3347
3348} // end namespace llvm
3349
3350BoUpSLP::~BoUpSLP() {
3351 SmallVector<WeakTrackingVH> DeadInsts;
3352 for (auto *I : DeletedInstructions) {
3353 for (Use &U : I->operands()) {
3354 auto *Op = dyn_cast<Instruction>(U.get());
3355 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3356 wouldInstructionBeTriviallyDead(Op, TLI))
3357 DeadInsts.emplace_back(Op);
3358 }
3359 I->dropAllReferences();
3360 }
3361 for (auto *I : DeletedInstructions) {
3362 assert(I->use_empty() &&
3363 "trying to erase instruction with users.");
3364 I->eraseFromParent();
3365 }
3366
3367 // Cleanup any dead scalar code feeding the vectorized instructions
3368 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3369
3370#ifdef EXPENSIVE_CHECKS
3371 // If we could guarantee that this call is not extremely slow, we could
3372 // remove the ifdef limitation (see PR47712).
3373 assert(!verifyFunction(*F, &dbgs()));
3374#endif
3375}
3376
3377/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3378/// contains the original mask for the scalars reused in the node. The procedure
3379/// transforms this mask in accordance with the given \p Mask.
3380static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3381 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3382 "Expected non-empty mask.");
3383 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3384 Prev.swap(Reuses);
3385 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3386 if (Mask[I] != UndefMaskElem)
3387 Reuses[Mask[I]] = Prev[I];
3388}
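// Worked example for reorderReuses above (hypothetical values): with
// Reuses == {0, 1, 2, 3} and Mask == {2, 0, 3, 1}, each Prev[I] is written to
// Reuses[Mask[I]], giving Reuses == {1, 3, 0, 2}.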
3389
3390/// Reorders the given \p Order according to the given \p Mask. \p Order is
3391/// the original order of the scalars. The procedure transforms the provided order
3392/// in accordance with the given \p Mask. If the resulting \p Order is just an
3393/// identity order, \p Order is cleared.
3394static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
3395 assert(!Mask.empty() && "Expected non-empty mask.");
3396 SmallVector<int> MaskOrder;
3397 if (Order.empty()) {
3398 MaskOrder.resize(Mask.size());
3399 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3400 } else {
3401 inversePermutation(Order, MaskOrder);
3402 }
3403 reorderReuses(MaskOrder, Mask);
3404 if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
3405 Order.clear();
3406 return;
3407 }
3408 Order.assign(Mask.size(), Mask.size());
3409 for (unsigned I = 0, E = Mask.size(); I < E; ++I)
3410 if (MaskOrder[I] != UndefMaskElem)
3411 Order[MaskOrder[I]] = I;
3412 fixupOrderingIndices(Order);
3413}
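// Worked example for reorderOrder above (hypothetical values): with an empty
// Order and Mask == {1, 0, 3, 2}, MaskOrder starts as the identity, becomes
// {1, 0, 3, 2} after reorderReuses, is not an identity mask, and the
// resulting Order is {1, 0, 3, 2}.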
3414
3415Optional<BoUpSLP::OrdersType>
3416BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3417 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3418 unsigned NumScalars = TE.Scalars.size();
3419 OrdersType CurrentOrder(NumScalars, NumScalars);
3420 SmallVector<int> Positions;
3421 SmallBitVector UsedPositions(NumScalars);
3422 const TreeEntry *STE = nullptr;
3423 // Try to find all gathered scalars that get vectorized in another
3424 // vectorized node. Here we can have only a single tree vector node to
3425 // correctly identify the order of the gathered scalars.
3426 for (unsigned I = 0; I < NumScalars; ++I) {
3427 Value *V = TE.Scalars[I];
3428 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3429 continue;
3430 if (const auto *LocalSTE = getTreeEntry(V)) {
3431 if (!STE)
3432 STE = LocalSTE;
3433 else if (STE != LocalSTE)
3434 // Take the order only from the single vector node.
3435 return None;
3436 unsigned Lane =
3437 std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
3438 if (Lane >= NumScalars)
3439 return None;
3440 if (CurrentOrder[Lane] != NumScalars) {
3441 if (Lane != I)
3442 continue;
3443 UsedPositions.reset(CurrentOrder[Lane]);
3444 }
3445 // The partial identity (where only some elements of the gather node are
3446 // in the identity order) is good.
3447 CurrentOrder[Lane] = I;
3448 UsedPositions.set(I);
3449 }
3450 }
3451 // Need to keep the order if we have a vector entry and at least 2 scalars or
3452 // the vectorized entry has just 2 scalars.
3453 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
3454 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
3455 for (unsigned I = 0; I < NumScalars; ++I)
3456 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
3457 return false;
3458 return true;
3459 };
3460 if (IsIdentityOrder(CurrentOrder)) {
3461 CurrentOrder.clear();
3462 return CurrentOrder;
3463 }
3464 auto *It = CurrentOrder.begin();
3465 for (unsigned I = 0; I < NumScalars;) {
3466 if (UsedPositions.test(I)) {
3467 ++I;
3468 continue;
3469 }
3470 if (*It == NumScalars) {
3471 *It = I;
3472 ++I;
3473 }
3474 ++It;
3475 }
3476 return CurrentOrder;
3477 }
3478 return None;
3479}
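// Illustrative example of findReusedOrderedScalars above (hypothetical
// values): if the gather node holds loads {a, b, c, d} and a single vectorized
// node STE holds them as {b, a, d, c}, the loop records
// CurrentOrder = {1, 0, 3, 2}, which is returned as the reused-scalar order.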
3480
3481namespace {
3482/// Tracks the state we can represent the loads in the given sequence.
3483enum class LoadsState { Gather, Vectorize, ScatterVectorize };
3484} // anonymous namespace
3485
3486/// Checks if the given array of loads can be represented as a vectorized
3487/// load, a scatter-vectorized load, or just a simple gather.
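/// Illustrative sketch (hypothetical IR, not from this file): four simple
/// loads whose pointers sort into a consecutive sequence yield
/// LoadsState::Vectorize; non-consecutive loads through two-operand GEPs may
/// still yield LoadsState::ScatterVectorize when the target legally supports
/// masked gathers; atomic/volatile loads or sub-byte element types fall back
/// to LoadsState::Gather.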
3488static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3489 const TargetTransformInfo &TTI,
3490 const DataLayout &DL, ScalarEvolution &SE,
3491 LoopInfo &LI,
3492 SmallVectorImpl<unsigned> &Order,
3493 SmallVectorImpl<Value *> &PointerOps) {
3494 // Check that a vectorized load would load the same memory as a scalar
3495 // load. For example, we don't want to vectorize loads that are smaller
3496  // than 8-bit. Even if we have a packed struct {<i2, i2, i2, i2>}, LLVM
3497  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3498 // from such a struct, we read/write packed bits disagreeing with the
3499 // unvectorized version.
3500 Type *ScalarTy = VL0->getType();
3501
3502 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3503 return LoadsState::Gather;
3504
3505 // Make sure all loads in the bundle are simple - we can't vectorize
3506 // atomic or volatile loads.
3507 PointerOps.clear();
3508 PointerOps.resize(VL.size());
3509 auto *POIter = PointerOps.begin();
3510 for (Value *V : VL) {
3511 auto *L = cast<LoadInst>(V);
3512 if (!L->isSimple())
3513 return LoadsState::Gather;
3514 *POIter = L->getPointerOperand();
3515 ++POIter;
3516 }
3517
3518 Order.clear();
3519 // Check the order of pointer operands or that all pointers are the same.
3520 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
3521 if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {
3522 if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front()))
3523 return false;
3524 auto *GEP = dyn_cast<GetElementPtrInst>(P);
3525 if (!GEP)
3526 return false;
3527 auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());
3528 return GEP->getNumOperands() == 2 &&
3529 ((isConstant(GEP->getOperand(1)) &&
3530 isConstant(GEP0->getOperand(1))) ||
3531 getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)})
3532 .getOpcode());
3533 })) {
3534 if (IsSorted) {
3535 Value *Ptr0;
3536 Value *PtrN;
3537 if (Order.empty()) {
3538 Ptr0 = PointerOps.front();
3539 PtrN = PointerOps.back();
3540 } else {
3541 Ptr0 = PointerOps[Order.front()];
3542 PtrN = PointerOps[Order.back()];
3543 }
3544 Optional<int> Diff =
3545 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3546 // Check that the sorted loads are consecutive.
3547 if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3548 return LoadsState::Vectorize;
3549 }
3550 // TODO: need to improve analysis of the pointers, if not all of them are
3551 // GEPs or have > 2 operands, we end up with a gather node, which just
3552 // increases the cost.
3553 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
3554 bool ProfitableGatherPointers =
3555 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
3556 return L && L->isLoopInvariant(V);
3557 })) <= VL.size() / 2 && VL.size() > 2;
3558 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
3559 auto *GEP = dyn_cast<GetElementPtrInst>(P);
3560 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
3561 (GEP && GEP->getNumOperands() == 2);
3562 })) {
3563 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3564 for (Value *V : VL)
3565 CommonAlignment =
3566 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
3567 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3568 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
3569 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
3570 return LoadsState::ScatterVectorize;
3571 }
3572 }
3573
3574 return LoadsState::Gather;
3575}
3576
3577bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
3578 const DataLayout &DL, ScalarEvolution &SE,
3579 SmallVectorImpl<unsigned> &SortedIndices) {
3580  assert(llvm::all_of(
3581             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
3582         "Expected list of pointer operands.");
3583 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
3584 // Ptr into, sort and return the sorted indices with values next to one
3585 // another.
3586 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
3587 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
3588
3589 unsigned Cnt = 1;
3590 for (Value *Ptr : VL.drop_front()) {
3591 bool Found = any_of(Bases, [&](auto &Base) {
3592 Optional<int> Diff =
3593 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
3594 /*StrictCheck=*/true);
3595 if (!Diff)
3596 return false;
3597
3598 Base.second.emplace_back(Ptr, *Diff, Cnt++);
3599 return true;
3600 });
3601
3602 if (!Found) {
3603 // If we haven't found enough to usefully cluster, return early.
3604 if (Bases.size() > VL.size() / 2 - 1)
3605 return false;
3606
3607 // Not found already - add a new Base
3608 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
3609 }
3610 }
3611
3612  // For each of the bases sort the pointers by Offset and check if any of the
3613  // bases become consecutively allocated.
3614 bool AnyConsecutive = false;
3615 for (auto &Base : Bases) {
3616 auto &Vec = Base.second;
3617 if (Vec.size() > 1) {
3618 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
3619 const std::tuple<Value *, int, unsigned> &Y) {
3620 return std::get<1>(X) < std::get<1>(Y);
3621 });
3622 int InitialOffset = std::get<1>(Vec[0]);
3623 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
3624 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
3625 });
3626 }
3627 }
3628
3629 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
3630 SortedIndices.clear();
3631 if (!AnyConsecutive)
3632 return false;
3633
3634 for (auto &Base : Bases) {
3635 for (auto &T : Base.second)
3636 SortedIndices.push_back(std::get<2>(T));
3637 }
3638
3639  assert(SortedIndices.size() == VL.size() &&
3640         "Expected SortedIndices to be the size of VL");
3641 return true;
3642}
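// Illustrative example of clusterSortPtrAccesses above (hypothetical pointers,
// unit-stride element type): for VL = {A, A+1, B, B+1} with unrelated bases A
// and B, the pointers cluster into Bases[A] = {(A,0,0), (A+1,1,1)} and
// Bases[B] = {(B,0,2), (B+1,1,3)}; both groups are consecutive, so
// SortedIndices becomes {0, 1, 2, 3} and the function returns true.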
3643
3644Optional<BoUpSLP::OrdersType>
3645BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
3646  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3647 Type *ScalarTy = TE.Scalars[0]->getType();
3648
3649 SmallVector<Value *> Ptrs;
3650 Ptrs.reserve(TE.Scalars.size());
3651 for (Value *V : TE.Scalars) {
3652 auto *L = dyn_cast<LoadInst>(V);
3653 if (!L || !L->isSimple())
3654 return None;
3655 Ptrs.push_back(L->getPointerOperand());
3656 }
3657
3658 BoUpSLP::OrdersType Order;
3659 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
3660 return Order;
3661 return None;
3662}
3663
3664Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
3665 bool TopToBottom) {
3666  // No need to reorder if the reuses already require a shuffle; the node
3667  // still has to be shuffled either way.
3668 if (!TE.ReuseShuffleIndices.empty())
3669 return None;
3670 if (TE.State == TreeEntry::Vectorize &&
3671 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
3672 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
3673 !TE.isAltShuffle())
3674 return TE.ReorderIndices;
3675 if (TE.State == TreeEntry::NeedToGather) {
3676 // TODO: add analysis of other gather nodes with extractelement
3677 // instructions and other values/instructions, not only undefs.
3678 if (((TE.getOpcode() == Instruction::ExtractElement &&
3679 !TE.isAltShuffle()) ||
3680 (all_of(TE.Scalars,
3681 [](Value *V) {
3682 return isa<UndefValue, ExtractElementInst>(V);
3683 }) &&
3684 any_of(TE.Scalars,
3685 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
3686 all_of(TE.Scalars,
3687 [](Value *V) {
3688 auto *EE = dyn_cast<ExtractElementInst>(V);
3689 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
3690 }) &&
3691 allSameType(TE.Scalars)) {
3692 // Check that gather of extractelements can be represented as
3693 // just a shuffle of a single vector.
3694 OrdersType CurrentOrder;
3695 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
3696 if (Reuse || !CurrentOrder.empty()) {
3697 if (!CurrentOrder.empty())
3698 fixupOrderingIndices(CurrentOrder);
3699 return CurrentOrder;
3700 }
3701 }
3702 if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
3703 return CurrentOrder;
3704 if (TE.Scalars.size() >= 4)
3705 if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
3706 return Order;
3707 }
3708 return None;
3709}
3710
3711void BoUpSLP::reorderTopToBottom() {
3712 // Maps VF to the graph nodes.
3713 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
3714 // ExtractElement gather nodes which can be vectorized and need to handle
3715 // their ordering.
3716 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3717
3718 // AltShuffles can also have a preferred ordering that leads to fewer
3719 // instructions, e.g., the addsub instruction in x86.
3720 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
3721
3722 // Maps a TreeEntry to the reorder indices of external users.
3723 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
3724 ExternalUserReorderMap;
3725 // FIXME: Workaround for syntax error reported by MSVC buildbots.
3726 TargetTransformInfo &TTIRef = *TTI;
3727 // Find all reorderable nodes with the given VF.
3728  // Currently these are vectorized stores, loads, extracts + some gathering of
3729  // extracts.
3730 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
3731 &GathersToOrders, &ExternalUserReorderMap,
3732 &AltShufflesToOrders](
3733 const std::unique_ptr<TreeEntry> &TE) {
3734 // Look for external users that will probably be vectorized.
3735 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
3736 findExternalStoreUsersReorderIndices(TE.get());
3737 if (!ExternalUserReorderIndices.empty()) {
3738 VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
3739 ExternalUserReorderMap.try_emplace(TE.get(),
3740 std::move(ExternalUserReorderIndices));
3741 }
3742
3743 // Patterns like [fadd,fsub] can be combined into a single instruction in
3744 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
3745 // to take into account their order when looking for the most used order.
3746 if (TE->isAltShuffle()) {
3747 VectorType *VecTy =
3748 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
3749 unsigned Opcode0 = TE->getOpcode();
3750 unsigned Opcode1 = TE->getAltOpcode();
3751 // The opcode mask selects between the two opcodes.
3752 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
3753 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
3754 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
3755 OpcodeMask.set(Lane);
3756 // If this pattern is supported by the target then we consider the order.
3757 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
3758 VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
3759 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
3760 }
3761 // TODO: Check the reverse order too.
3762 }
3763
3764 if (Optional<OrdersType> CurrentOrder =
3765 getReorderingData(*TE, /*TopToBottom=*/true)) {
3766 // Do not include ordering for nodes used in the alt opcode vectorization,
3767  // better to reorder them during the bottom-to-top stage. Following the
3768  // order here would reorder the whole graph, even though it is actually
3769  // profitable just to reorder the subgraph that starts from the alternate
3770  // opcode vectorization node. Such nodes already end up with a shuffle
3771  // instruction, and it is enough to change that shuffle rather than
3772  // rotate the scalars for the whole graph.
3773 unsigned Cnt = 0;
3774 const TreeEntry *UserTE = TE.get();
3775 while (UserTE && Cnt < RecursionMaxDepth) {
3776 if (UserTE->UserTreeIndices.size() != 1)
3777 break;
3778 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
3779 return EI.UserTE->State == TreeEntry::Vectorize &&
3780 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
3781 }))
3782 return;
3783 UserTE = UserTE->UserTreeIndices.back().UserTE;
3784 ++Cnt;
3785 }
3786 VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
3787 if (TE->State != TreeEntry::Vectorize)
3788 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
3789 }
3790 });
3791
3792 // Reorder the graph nodes according to their vectorization factor.
3793 for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
3794 VF /= 2) {
3795 auto It = VFToOrderedEntries.find(VF);
3796 if (It == VFToOrderedEntries.end())
3797 continue;
3798  // Try to find the most profitable order. We are just looking for the most
3799  // used order and reorder scalar elements in the nodes according to this
3800  // most used order.
3801 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
3802 // All operands are reordered and used only in this node - propagate the
3803 // most used order to the user node.
3804 MapVector<OrdersType, unsigned,
3805 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
3806 OrdersUses;
3807 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
3808 for (const TreeEntry *OpTE : OrderedEntries) {
3809  // No need to reorder these nodes; we still need to extend them and use a
3810  // shuffle, just merge the reordering shuffle and the reuse shuffle.
3811 if (!OpTE->ReuseShuffleIndices.empty())
3812 continue;
3813 // Count number of orders uses.
3814 const auto &Order = [OpTE, &GathersToOrders,
3815 &AltShufflesToOrders]() -> const OrdersType & {
3816 if (OpTE->State == TreeEntry::NeedToGather) {
3817 auto It = GathersToOrders.find(OpTE);
3818 if (It != GathersToOrders.end())
3819 return It->second;
3820 }
3821 if (OpTE->isAltShuffle()) {
3822 auto It = AltShufflesToOrders.find(OpTE);
3823 if (It != AltShufflesToOrders.end())
3824 return It->second;
3825 }
3826 return OpTE->ReorderIndices;
3827 }();
3828 // First consider the order of the external scalar users.
3829 auto It = ExternalUserReorderMap.find(OpTE);
3830 if (It != ExternalUserReorderMap.end()) {
3831 const auto &ExternalUserReorderIndices = It->second;
3832 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
3833 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
3834 // No other useful reorder data in this entry.
3835 if (Order.empty())
3836 continue;
3837 }
3838 // Stores actually store the mask, not the order, need to invert.
3839 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
3840 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
3841 SmallVector<int> Mask;
3842 inversePermutation(Order, Mask);
3843 unsigned E = Order.size();
3844 OrdersType CurrentOrder(E, E);
3845 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
3846 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
3847 });
3848 fixupOrderingIndices(CurrentOrder);
3849 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
3850 } else {
3851 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
3852 }
3853 }
3854 // Set order of the user node.
3855 if (OrdersUses.empty())
3856 continue;
3857 // Choose the most used order.
3858 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
3859 unsigned Cnt = OrdersUses.front().second;
3860 for (const auto &Pair : drop_begin(OrdersUses)) {
3861 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
3862 BestOrder = Pair.first;
3863 Cnt = Pair.second;
3864 }
3865 }
3866 // Set order of the user node.
3867 if (BestOrder.empty())
3868 continue;
3869 SmallVector<int> Mask;
3870 inversePermutation(BestOrder, Mask);
3871 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
3872 unsigned E = BestOrder.size();
3873 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
3874 return I < E ? static_cast<int>(I) : UndefMaskElem;
3875 });
3876 // Do an actual reordering, if profitable.
3877 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
3878 // Just do the reordering for the nodes with the given VF.
3879 if (TE->Scalars.size() != VF) {
3880 if (TE->ReuseShuffleIndices.size() == VF) {
3881 // Need to reorder the reuses masks of the operands with smaller VF to
3882 // be able to find the match between the graph nodes and scalar
3883 // operands of the given node during vectorization/cost estimation.
3884        assert(all_of(TE->UserTreeIndices,
3885                      [VF, &TE](const EdgeInfo &EI) {
3886                        return EI.UserTE->Scalars.size() == VF ||
3887                               EI.UserTE->Scalars.size() ==
3888                                   TE->Scalars.size();
3889                      }) &&
3890               "All users must be of VF size.");
3891 // Update ordering of the operands with the smaller VF than the given
3892 // one.
3893 reorderReuses(TE->ReuseShuffleIndices, Mask);
3894 }
3895 continue;
3896 }
3897 if (TE->State == TreeEntry::Vectorize &&
3898 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
3899 InsertElementInst>(TE->getMainOp()) &&
3900 !TE->isAltShuffle()) {
3901 // Build correct orders for extract{element,value}, loads and
3902 // stores.
3903 reorderOrder(TE->ReorderIndices, Mask);
3904 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
3905 TE->reorderOperands(Mask);
3906 } else {
3907 // Reorder the node and its operands.
3908 TE->reorderOperands(Mask);
3909        assert(TE->ReorderIndices.empty() &&
3910               "Expected empty reorder sequence.");
3911 reorderScalars(TE->Scalars, Mask);
3912 }
3913 if (!TE->ReuseShuffleIndices.empty()) {
3914 // Apply reversed order to keep the original ordering of the reused
3915 // elements to avoid extra reorder indices shuffling.
3916 OrdersType CurrentOrder;
3917 reorderOrder(CurrentOrder, MaskOrder);
3918 SmallVector<int> NewReuses;
3919 inversePermutation(CurrentOrder, NewReuses);
3920 addMask(NewReuses, TE->ReuseShuffleIndices);
3921 TE->ReuseShuffleIndices.swap(NewReuses);
3922 }
3923 }
3924 }
3925}
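// Illustrative sketch of the voting in reorderTopToBottom above (hypothetical
// counts): if, for VF = 4, two ordered entries vote for {1, 0, 3, 2} and one
// votes for the identity, OrdersUses ends up as {{1, 0, 3, 2}: 2, {}: 1};
// BestOrder = {1, 0, 3, 2} wins and every node with that VF is permuted by the
// corresponding inverse mask.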
3926
3927bool BoUpSLP::canReorderOperands(
3928 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3929 ArrayRef<TreeEntry *> ReorderableGathers,
3930 SmallVectorImpl<TreeEntry *> &GatherOps) {
3931 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
3932 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
3933 return OpData.first == I &&
3934 OpData.second->State == TreeEntry::Vectorize;
3935 }))
3936 continue;
3937 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
3938 // Do not reorder if operand node is used by many user nodes.
3939 if (any_of(TE->UserTreeIndices,
3940 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
3941 return false;
3942 // Add the node to the list of the ordered nodes with the identity
3943 // order.
3944 Edges.emplace_back(I, TE);
3945 // Add ScatterVectorize nodes to the list of operands, where just
3946 // reordering of the scalars is required. Similar to the gathers, so
3947 // simply add to the list of gathered ops.
3948 // If there are reused scalars, process this node as a regular vectorize
3949 // node, just reorder reuses mask.
3950 if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
3951 GatherOps.push_back(TE);
3952 continue;
3953 }
3954 TreeEntry *Gather = nullptr;
3955 if (count_if(ReorderableGathers,
3956 [&Gather, UserTE, I](TreeEntry *TE) {
3957                   assert(TE->State != TreeEntry::Vectorize &&
3958                          "Only non-vectorized nodes are expected.");
3959 if (any_of(TE->UserTreeIndices,
3960 [UserTE, I](const EdgeInfo &EI) {
3961 return EI.UserTE == UserTE && EI.EdgeIdx == I;
3962 })) {
3963                     assert(TE->isSame(UserTE->getOperand(I)) &&
3964                            "Operand entry does not match operands.");
3965 Gather = TE;
3966 return true;
3967 }
3968 return false;
3969 }) > 1 &&
3970 !all_of(UserTE->getOperand(I), isConstant))
3971 return false;
3972 if (Gather)
3973 GatherOps.push_back(Gather);
3974 }
3975 return true;
3976}
3977
3978void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
3979 SetVector<TreeEntry *> OrderedEntries;
3980 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3981 // Find all reorderable leaf nodes with the given VF.
3982  // Currently these are vectorized loads, extracts without alternate operands +
3983  // some gathering of extracts.
3984 SmallVector<TreeEntry *> NonVectorized;
3985 for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
3986 &NonVectorized](
3987 const std::unique_ptr<TreeEntry> &TE) {
3988 if (TE->State != TreeEntry::Vectorize)
3989 NonVectorized.push_back(TE.get());
3990 if (Optional<OrdersType> CurrentOrder =
3991 getReorderingData(*TE, /*TopToBottom=*/false)) {
3992 OrderedEntries.insert(TE.get());
3993 if (TE->State != TreeEntry::Vectorize)
3994 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
3995 }
3996 });
3997
3998  // 1. Propagate the order to the graph nodes that use only reordered nodes.
3999  // I.e., if a node has operands that are reordered, try to put at least
4000  // one operand into natural order, reorder the others, and reorder the
4001  // user node itself.
4002 SmallPtrSet<const TreeEntry *, 4> Visited;
4003 while (!OrderedEntries.empty()) {
4004 // 1. Filter out only reordered nodes.
4005 // 2. If the entry has multiple uses - skip it and jump to the next node.
4006 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
4007 SmallVector<TreeEntry *> Filtered;
4008 for (TreeEntry *TE : OrderedEntries) {
4009 if (!(TE->State == TreeEntry::Vectorize ||
4010 (TE->State == TreeEntry::NeedToGather &&
4011 GathersToOrders.count(TE))) ||
4012 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4013 !all_of(drop_begin(TE->UserTreeIndices),
4014 [TE](const EdgeInfo &EI) {
4015 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4016 }) ||
4017 !Visited.insert(TE).second) {
4018 Filtered.push_back(TE);
4019 continue;
4020 }
4021  // Build a map between user nodes and their operand order to speed up the
4022  // search. The graph currently does not provide this dependency directly.
4023 for (EdgeInfo &EI : TE->UserTreeIndices) {
4024 TreeEntry *UserTE = EI.UserTE;
4025 auto It = Users.find(UserTE);
4026 if (It == Users.end())
4027 It = Users.insert({UserTE, {}}).first;
4028 It->second.emplace_back(EI.EdgeIdx, TE);
4029 }
4030 }
4031 // Erase filtered entries.
4032 for_each(Filtered,
4033 [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
4034 SmallVector<
4035 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4036 UsersVec(Users.begin(), Users.end());
4037 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
4038 return Data1.first->Idx > Data2.first->Idx;
4039 });
4040 for (auto &Data : UsersVec) {
4041 // Check that operands are used only in the User node.
4042 SmallVector<TreeEntry *> GatherOps;
4043 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
4044 GatherOps)) {
4045 for_each(Data.second,
4046 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
4047 OrderedEntries.remove(Op.second);
4048 });
4049 continue;
4050 }
4051 // All operands are reordered and used only in this node - propagate the
4052 // most used order to the user node.
4053 MapVector<OrdersType, unsigned,
4054 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4055 OrdersUses;
4056  // Do the analysis for each tree entry only once; otherwise the order of
4057  // the same node may be considered several times, even though it might
4058  // not be profitable.
4059 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4060 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
4061 for (const auto &Op : Data.second) {
4062 TreeEntry *OpTE = Op.second;
4063 if (!VisitedOps.insert(OpTE).second)
4064 continue;
4065 if (!OpTE->ReuseShuffleIndices.empty())
4066 continue;
4067 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
4068 if (OpTE->State == TreeEntry::NeedToGather)
4069 return GathersToOrders.find(OpTE)->second;
4070 return OpTE->ReorderIndices;
4071 }();
4072 unsigned NumOps = count_if(
4073 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
4074 return P.second == OpTE;
4075 });
4076 // Stores actually store the mask, not the order, need to invert.
4077 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4078 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4079 SmallVector<int> Mask;
4080 inversePermutation(Order, Mask);
4081 unsigned E = Order.size();
4082 OrdersType CurrentOrder(E, E);
4083 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4084 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
4085 });
4086 fixupOrderingIndices(CurrentOrder);
4087 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
4088 NumOps;
4089 } else {
4090 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
4091 }
4092 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
4093 const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
4094 const TreeEntry *TE) {
4095 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4096 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
4097 (IgnoreReorder && TE->Idx == 0))
4098 return true;
4099 if (TE->State == TreeEntry::NeedToGather) {
4100 auto It = GathersToOrders.find(TE);
4101 if (It != GathersToOrders.end())
4102 return !It->second.empty();
4103 return true;
4104 }
4105 return false;
4106 };
4107 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
4108 TreeEntry *UserTE = EI.UserTE;
4109 if (!VisitedUsers.insert(UserTE).second)
4110 continue;
4111 // May reorder user node if it requires reordering, has reused
4112 // scalars, is an alternate op vectorize node or its op nodes require
4113 // reordering.
4114 if (AllowsReordering(UserTE))
4115 continue;
4116  // Check if users allow reordering.
4117  // Currently look up just 1 level of operands to avoid an increase in
4118  // compile time.
4119  // Reordering is profitable if clearly more operands allow
4120  // reordering than prefer the natural order.
4121 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
4122 if (static_cast<unsigned>(count_if(
4123 Ops, [UserTE, &AllowsReordering](
4124 const std::pair<unsigned, TreeEntry *> &Op) {
4125 return AllowsReordering(Op.second) &&
4126 all_of(Op.second->UserTreeIndices,
4127 [UserTE](const EdgeInfo &EI) {
4128 return EI.UserTE == UserTE;
4129 });
4130 })) <= Ops.size() / 2)
4131 ++Res.first->second;
4132 }
4133 }
4134 // If no orders - skip current nodes and jump to the next one, if any.
4135 if (OrdersUses.empty()) {
4136 for_each(Data.second,
4137 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
4138 OrderedEntries.remove(Op.second);
4139 });
4140 continue;
4141 }
4142 // Choose the best order.
4143 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4144 unsigned Cnt = OrdersUses.front().second;
4145 for (const auto &Pair : drop_begin(OrdersUses)) {
4146 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4147 BestOrder = Pair.first;
4148 Cnt = Pair.second;
4149 }
4150 }
4151 // Set order of the user node (reordering of operands and user nodes).
4152 if (BestOrder.empty()) {
4153 for_each(Data.second,
4154 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
4155 OrderedEntries.remove(Op.second);
4156 });
4157 continue;
4158 }
4159 // Erase operands from OrderedEntries list and adjust their orders.
4160 VisitedOps.clear();
4161 SmallVector<int> Mask;
4162 inversePermutation(BestOrder, Mask);
4163 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
4164 unsigned E = BestOrder.size();
4165 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4166 return I < E ? static_cast<int>(I) : UndefMaskElem;
4167 });
4168 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
4169 TreeEntry *TE = Op.second;
4170 OrderedEntries.remove(TE);
4171 if (!VisitedOps.insert(TE).second)
4172 continue;
4173 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
4174 // Just reorder reuses indices.
4175 reorderReuses(TE->ReuseShuffleIndices, Mask);
4176 continue;
4177 }
4178 // Gathers are processed separately.
4179 if (TE->State != TreeEntry::Vectorize)
4180 continue;
4181      assert((BestOrder.size() == TE->ReorderIndices.size() ||
4182              TE->ReorderIndices.empty()) &&
4183             "Non-matching sizes of user/operand entries.");
4184 reorderOrder(TE->ReorderIndices, Mask);
4185 if (IgnoreReorder && TE == VectorizableTree.front().get())
4186 IgnoreReorder = false;
4187 }
4188 // For gathers just need to reorder its scalars.
4189 for (TreeEntry *Gather : GatherOps) {
4190      assert(Gather->ReorderIndices.empty() &&
4191             "Unexpected reordering of gathers.");
4192 if (!Gather->ReuseShuffleIndices.empty()) {
4193 // Just reorder reuses indices.
4194 reorderReuses(Gather->ReuseShuffleIndices, Mask);
4195 continue;
4196 }
4197 reorderScalars(Gather->Scalars, Mask);
4198 OrderedEntries.remove(Gather);
4199 }
4200 // Reorder operands of the user node and set the ordering for the user
4201 // node itself.
4202 if (Data.first->State != TreeEntry::Vectorize ||
4203 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
4204 Data.first->getMainOp()) ||
4205 Data.first->isAltShuffle())
4206 Data.first->reorderOperands(Mask);
4207 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
4208 Data.first->isAltShuffle()) {
4209 reorderScalars(Data.first->Scalars, Mask);
4210 reorderOrder(Data.first->ReorderIndices, MaskOrder);
4211 if (Data.first->ReuseShuffleIndices.empty() &&
4212 !Data.first->ReorderIndices.empty() &&
4213 !Data.first->isAltShuffle()) {
4214 // Insert user node to the list to try to sink reordering deeper in
4215 // the graph.
4216 OrderedEntries.insert(Data.first);
4217 }
4218 } else {
4219 reorderOrder(Data.first->ReorderIndices, Mask);
4220 }
4221 }
4222 }
4223 // If the reordering is unnecessary, just remove the reorder.
4224 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
4225 VectorizableTree.front()->ReuseShuffleIndices.empty())
4226 VectorizableTree.front()->ReorderIndices.clear();
4227}
4228
4229void BoUpSLP::buildExternalUses(
4230 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4231 // Collect the values that we need to extract from the tree.
4232 for (auto &TEPtr : VectorizableTree) {
4233 TreeEntry *Entry = TEPtr.get();
4234
4235 // No need to handle users of gathered values.
4236 if (Entry->State == TreeEntry::NeedToGather)
4237 continue;
4238
4239 // For each lane:
4240 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4241 Value *Scalar = Entry->Scalars[Lane];
4242 int FoundLane = Entry->findLaneForValue(Scalar);
4243
4244 // Check if the scalar is externally used as an extra arg.
4245 auto ExtI = ExternallyUsedValues.find(Scalar);
4246 if (ExtI != ExternallyUsedValues.end()) {
4247 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n"
; } } while (false)
4248 << Lane << " from " << *Scalar << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n"
; } } while (false)
;
4249 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
4250 }
4251 for (User *U : Scalar->users()) {
4252 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Checking user:" << *U <<
".\n"; } } while (false)
;
4253
4254 Instruction *UserInst = dyn_cast<Instruction>(U);
4255 if (!UserInst)
4256 continue;
4257
4258 if (isDeleted(UserInst))
4259 continue;
4260
4261 // Skip in-tree scalars that become vectors
4262 if (TreeEntry *UseEntry = getTreeEntry(U)) {
4263 Value *UseScalar = UseEntry->Scalars[0];
4264 // Some in-tree scalars will remain as scalar in vectorized
4265 // instructions. If that is the case, the one in Lane 0 will
4266 // be used.
4267 if (UseScalar != U ||
4268 UseEntry->State == TreeEntry::ScatterVectorize ||
4269 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
4270 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *Udo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tInternal user will be removed:"
<< *U << ".\n"; } } while (false)
4271 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tInternal user will be removed:"
<< *U << ".\n"; } } while (false)
;
4272 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state")(static_cast <bool> (UseEntry->State != TreeEntry::NeedToGather
&& "Bad state") ? void (0) : __assert_fail ("UseEntry->State != TreeEntry::NeedToGather && \"Bad state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4272, __extension__
__PRETTY_FUNCTION__))
;
4273 continue;
4274 }
4275 }
4276
4277 // Ignore users in the user ignore list.
4278 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
4279 continue;
4280
4281 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract:" << *
U << " from lane " << Lane << " from " <<
*Scalar << ".\n"; } } while (false)
4282 << Lane << " from " << *Scalar << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract:" << *
U << " from lane " << Lane << " from " <<
*Scalar << ".\n"; } } while (false)
;
4283 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
4284 }
4285 }
4286 }
4287}
4288
4289DenseMap<Value *, SmallVector<StoreInst *, 4>>
4290BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
4291 DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
4292 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
4293 Value *V = TE->Scalars[Lane];
4294 // To save compilation time we don't visit if we have too many users.
4295 static constexpr unsigned UsersLimit = 4;
4296 if (V->hasNUsesOrMore(UsersLimit))
4297 break;
4298
4299 // Collect stores per pointer object.
4300 for (User *U : V->users()) {
4301 auto *SI = dyn_cast<StoreInst>(U);
4302 if (SI == nullptr || !SI->isSimple() ||
4303 !isValidElementType(SI->getValueOperand()->getType()))
4304 continue;
4305      // Skip entry if already in the tree.
4306 if (getTreeEntry(U))
4307 continue;
4308
4309 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
4310 auto &StoresVec = PtrToStoresMap[Ptr];
4311 // For now just keep one store per pointer object per lane.
4312 // TODO: Extend this to support multiple stores per pointer per lane
4313 if (StoresVec.size() > Lane)
4314 continue;
4315 // Skip if in different BBs.
4316 if (!StoresVec.empty() &&
4317 SI->getParent() != StoresVec.back()->getParent())
4318 continue;
4319 // Make sure that the stores are of the same type.
4320 if (!StoresVec.empty() &&
4321 SI->getValueOperand()->getType() !=
4322 StoresVec.back()->getValueOperand()->getType())
4323 continue;
4324 StoresVec.push_back(SI);
4325 }
4326 }
4327 return PtrToStoresMap;
4328}
4329
4330bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
4331 OrdersType &ReorderIndices) const {
4332  // We check whether the stores in StoresVec can form a vector by sorting them
4333  // and checking whether they are consecutive.
4334
4335 // To avoid calling getPointersDiff() while sorting we create a vector of
4336 // pairs {store, offset from first} and sort this instead.
4337 SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
4338 StoreInst *S0 = StoresVec[0];
4339 StoreOffsetVec[0] = {S0, 0};
4340 Type *S0Ty = S0->getValueOperand()->getType();
4341 Value *S0Ptr = S0->getPointerOperand();
4342 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
4343 StoreInst *SI = StoresVec[Idx];
4344 Optional<int> Diff =
4345 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
4346 SI->getPointerOperand(), *DL, *SE,
4347 /*StrictCheck=*/true);
4348 // We failed to compare the pointers so just abandon this StoresVec.
4349 if (!Diff)
4350 return false;
4351 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
4352 }
4353
4354 // Sort the vector based on the pointers. We create a copy because we may
4355 // need the original later for calculating the reorder (shuffle) indices.
4356 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
4357 const std::pair<StoreInst *, int> &Pair2) {
4358 int Offset1 = Pair1.second;
4359 int Offset2 = Pair2.second;
4360 return Offset1 < Offset2;
4361 });
4362
4363 // Check if the stores are consecutive by checking if their difference is 1.
4364 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
4365 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
4366 return false;
4367
4368 // Calculate the shuffle indices according to their offset against the sorted
4369 // StoreOffsetVec.
4370 ReorderIndices.reserve(StoresVec.size());
4371 for (StoreInst *SI : StoresVec) {
4372 unsigned Idx = find_if(StoreOffsetVec,
4373 [SI](const std::pair<StoreInst *, int> &Pair) {
4374 return Pair.first == SI;
4375 }) -
4376 StoreOffsetVec.begin();
4377 ReorderIndices.push_back(Idx);
4378 }
4379 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
4380 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
4381 // same convention here.
4382 auto IsIdentityOrder = [](const OrdersType &Order) {
4383 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4384 if (Idx != Order[Idx])
4385 return false;
4386 return true;
4387 };
4388 if (IsIdentityOrder(ReorderIndices))
4389 ReorderIndices.clear();
4390
4391 return true;
4392}
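// Illustrative example of canFormVector above (hypothetical stores): if
// StoresVec writes p[2], p[0], p[1], p[3], the offsets relative to the first
// store are {0, -2, -1, 1}; after sorting they are consecutive, so the
// function returns true with ReorderIndices = {2, 0, 1, 3}.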
4393
4394#ifndef NDEBUG
4395 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
4396 for (unsigned Idx : Order)
4397 dbgs() << Idx << ", ";
4398 dbgs() << "\n";
4399}
4400#endif
4401
4402SmallVector<BoUpSLP::OrdersType, 1>
4403BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
4404 unsigned NumLanes = TE->Scalars.size();
4405
4406 DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
4407 collectUserStores(TE);
4408
4409 // Holds the reorder indices for each candidate store vector that is a user of
4410 // the current TreeEntry.
4411 SmallVector<OrdersType, 1> ExternalReorderIndices;
4412
4413 // Now inspect the stores collected per pointer and look for vectorization
4414 // candidates. For each candidate calculate the reorder index vector and push
4415 // it into `ExternalReorderIndices`
4416 for (const auto &Pair : PtrToStoresMap) {
4417 auto &StoresVec = Pair.second;
4418 // If we have fewer than NumLanes stores, then we can't form a vector.
4419 if (StoresVec.size() != NumLanes)
4420 continue;
4421
4422 // If the stores are not consecutive then abandon this StoresVec.
4423 OrdersType ReorderIndices;
4424 if (!canFormVector(StoresVec, ReorderIndices))
4425 continue;
4426
4427 // We now know that the scalars in StoresVec can form a vector instruction,
4428 // so set the reorder indices.
4429 ExternalReorderIndices.push_back(ReorderIndices);
4430 }
4431 return ExternalReorderIndices;
4432}
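// Illustrative example of findExternalStoreUsersReorderIndices above
// (hypothetical users): if every lane of TE feeds exactly one simple store and
// the four stores write p[1], p[0], p[3], p[2], collectUserStores groups them
// under the common base object, canFormVector accepts them, and the returned
// list holds the single order {1, 0, 3, 2}.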
4433
4434void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
4435 const SmallDenseSet<Value *> &UserIgnoreLst) {
4436 deleteTree();
4437 UserIgnoreList = &UserIgnoreLst;
4438 if (!allSameType(Roots))
4439 return;
4440 buildTree_rec(Roots, 0, EdgeInfo());
4441}
4442
4443void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
4444 deleteTree();
4445 if (!allSameType(Roots))
4446 return;
4447 buildTree_rec(Roots, 0, EdgeInfo());
4448}
4449
4450/// \return true if the specified list of values has only one instruction that
4451/// requires scheduling, false otherwise.
4452#ifndef NDEBUG
4453static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
4454 Value *NeedsScheduling = nullptr;
4455 for (Value *V : VL) {
4456 if (doesNotNeedToBeScheduled(V))
4457 continue;
4458 if (!NeedsScheduling) {
4459 NeedsScheduling = V;
4460 continue;
4461 }
4462 return false;
4463 }
4464 return NeedsScheduling;
4465}
4466#endif
4467
4468/// Generates a key/subkey pair for the given value to provide effective sorting
4469/// of the values and better detection of vectorizable value sequences. The
4470/// keys/subkeys can be used for better sorting of the values themselves (keys)
4471/// and within value subgroups (subkeys).
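/// Illustrative sketch (hypothetical values): two simple loads combine
/// Instruction::Load into their Key and take their SubKey from the
/// LoadsSubkeyGenerator callback, so related loads tend to sort next to each
/// other, while a volatile load hashes its own instruction identity into
/// SubKey and stays apart.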
4472static std::pair<size_t, size_t> generateKeySubkey(
4473 Value *V, const TargetLibraryInfo *TLI,
4474 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
4475 bool AllowAlternate) {
4476 hash_code Key = hash_value(V->getValueID() + 2);
4477 hash_code SubKey = hash_value(0);
4478 // Sort the loads by the distance between the pointers.
4479 if (auto *LI = dyn_cast<LoadInst>(V)) {
4480 Key = hash_combine(hash_value(Instruction::Load), Key);
4481 if (LI->isSimple())
4482 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
4483 else
4484 SubKey = hash_value(LI);
4485 } else if (isVectorLikeInstWithConstOps(V)) {
4486 // Sort extracts by the vector operands.
4487 if (isa<ExtractElementInst, UndefValue>(V))
4488 Key = hash_value(Value::UndefValueVal + 1);
4489 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
4490 if (!isUndefVector(EI->getVectorOperand()) &&
4491 !isa<UndefValue>(EI->getIndexOperand()))
4492 SubKey = hash_value(EI->getVectorOperand());
4493 }
4494 } else if (auto *I = dyn_cast<Instruction>(V)) {
4495 // Sort other instructions just by the opcodes except for CMPInst.
4496 // For CMP also sort by the predicate kind.
4497 if ((isa<BinaryOperator, CastInst>(I)) &&
4498 isValidForAlternation(I->getOpcode())) {
4499 if (AllowAlternate)
4500 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
4501 else
4502 Key = hash_combine(hash_value(I->getOpcode()), Key);
4503 SubKey = hash_combine(
4504 hash_value(I->getOpcode()), hash_value(I->getType()),
4505 hash_value(isa<BinaryOperator>(I)
4506 ? I->getType()
4507 : cast<CastInst>(I)->getOperand(0)->getType()));
4508 // For casts, look through the only operand to improve compile time.
4509 if (isa<CastInst>(I)) {
4510 std::pair<size_t, size_t> OpVals =
4511 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
4512                              /*AllowAlternate=*/true);
4513 Key = hash_combine(OpVals.first, Key);
4514 SubKey = hash_combine(OpVals.first, SubKey);
4515 }
4516 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
4517 CmpInst::Predicate Pred = CI->getPredicate();
4518 if (CI->isCommutative())
4519 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
4520 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
4521 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
4522 hash_value(SwapPred),
4523 hash_value(CI->getOperand(0)->getType()));
4524 } else if (auto *Call = dyn_cast<CallInst>(I)) {
4525 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
4526 if (isTriviallyVectorizable(ID)) {
4527 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
4528 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
4529 SubKey = hash_combine(hash_value(I->getOpcode()),
4530 hash_value(Call->getCalledFunction()));
4531 } else {
4532 Key = hash_combine(hash_value(Call), Key);
4533 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
4534 }
4535 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
4536 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
4537 hash_value(Op.Tag), SubKey);
4538 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
4539 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
4540 SubKey = hash_value(Gep->getPointerOperand());
4541 else
4542 SubKey = hash_value(Gep);
4543 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
4544 !isa<ConstantInt>(I->getOperand(1))) {
4545 // Do not try to vectorize instructions with potentially high cost.
4546 SubKey = hash_value(I);
4547 } else {
4548 SubKey = hash_value(I->getOpcode());
4549 }
4550 Key = hash_combine(hash_value(I->getParent()), Key);
4551 }
4552 return std::make_pair(Key, SubKey);
4553}
4554
4555void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
4556 const EdgeInfo &UserTreeIdx) {
4557  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
4558
4559 SmallVector<int> ReuseShuffleIndicies;
4560 SmallVector<Value *> UniqueValues;
4561 auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
4562 &UserTreeIdx,
4563 this](const InstructionsState &S) {
4564 // Check that every instruction appears once in this bundle.
4565 DenseMap<Value *, unsigned> UniquePositions;
4566 for (Value *V : VL) {
4567 if (isConstant(V)) {
4568 ReuseShuffleIndicies.emplace_back(
4569 isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
4570 UniqueValues.emplace_back(V);
4571 continue;
4572 }
4573 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
4574 ReuseShuffleIndicies.emplace_back(Res.first->second);
4575 if (Res.second)
4576 UniqueValues.emplace_back(V);
4577 }
4578 size_t NumUniqueScalarValues = UniqueValues.size();
4579 if (NumUniqueScalarValues == VL.size()) {
4580 ReuseShuffleIndicies.clear();
4581 } else {
4582 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Shuffle for reused scalars.\n"
; } } while (false)
;
4583 if (NumUniqueScalarValues <= 1 ||
4584 (UniquePositions.size() == 1 && all_of(UniqueValues,
4585 [](Value *V) {
4586 return isa<UndefValue>(V) ||
4587 !isConstant(V);
4588 })) ||
4589 !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
4590 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Scalar used twice in bundle.\n"
; } } while (false)
;
4591 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4592 return false;
4593 }
4594 VL = UniqueValues;
4595 }
4596 return true;
4597 };
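// Illustrative sketch (editorial, not from the original source): for a
// hypothetical bundle VL = {A, B, A, B}, the lambda above produces
// UniqueValues = {A, B} and ReuseShuffleIndicies = {0, 1, 0, 1}; since 2 is a
// power of two, the bundle is kept and VL is narrowed to the unique scalars.
// For VL = {A, B, A, C} the three unique scalars are not a power of two, so
// the bundle is recorded as a gather node instead.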
4598
4599 InstructionsState S = getSameOpcode(VL);
4600
4601 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
4602 // a load), in which case peek through to include it in the tree, without
4603 // ballooning over-budget.
4604 if (Depth >= RecursionMaxDepth &&
4605 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
4606 VL.size() >= 4 &&
4607 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
4608 return match(I,
4609 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
4610 cast<Instruction>(I)->getOpcode() ==
4611 cast<Instruction>(S.MainOp)->getOpcode();
4612 })))) {
4613 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
4614 if (TryToFindDuplicates(S))
4615 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4616 ReuseShuffleIndicies);
4617 return;
4618 }
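// Illustrative note (editorial, not from the original source): the exception
// above lets same-opcode bundles of at least four loads, or one-use
// zext/sext-of-one-use-load chains, be pulled into the tree even at the depth
// limit, since widening loads is usually cheap; any other opcode at this
// depth is recorded as a gather node.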
4619
4620 // Don't handle scalable vectors
4621 if (S.getOpcode() == Instruction::ExtractElement &&
4622 isa<ScalableVectorType>(
4623 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
4624 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
4625 if (TryToFindDuplicates(S))
4626 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4627 ReuseShuffleIndicies);
4628 return;
4629 }
4630
4631 // Don't handle vectors.
4632 if (S.OpValue->getType()->isVectorTy() &&
4633 !isa<InsertElementInst>(S.OpValue)) {
4634 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
4635 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4636 return;
4637 }
4638
4639 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
4640 if (SI->getValueOperand()->getType()->isVectorTy()) {
4641 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
4642 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4643 return;
4644 }
4645
4646 // If all of the operands are identical or constant we have a simple solution.
4647 // If we deal with insert/extract instructions, they all must have constant
4648 // indices, otherwise we should gather them, not try to vectorize.
4649 // If alternate op node with 2 elements with gathered operands - do not
4650 // vectorize.
4651 auto &&NotProfitableForVectorization = [&S, this,
4652 Depth](ArrayRef<Value *> VL) {
4653 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
4654 return false;
4655 if (VectorizableTree.size() < MinTreeSize)
4656 return false;
4657 if (Depth >= RecursionMaxDepth - 1)
4658 return true;
4659 // Check if all operands are extracts, part of vector node or can build a
4660 // regular vectorize node.
4661 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
4662 for (Value *V : VL) {
4663 auto *I = cast<Instruction>(V);
4664 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
4665 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
4666 }));
4667 }
4668 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
4669 if ((IsCommutative &&
4670 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
4671 (!IsCommutative &&
4672 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
4673 return true;
4674 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
4675 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
4676 auto *I1 = cast<Instruction>(VL.front());
4677 auto *I2 = cast<Instruction>(VL.back());
4678 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
4679 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
4680 I2->getOperand(Op));
4681 if (static_cast<unsigned>(count_if(
4682 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
4683 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
4684 })) >= S.MainOp->getNumOperands() / 2)
4685 return false;
4686 if (S.MainOp->getNumOperands() > 2)
4687 return true;
4688 if (IsCommutative) {
4689 // Check permuted operands.
4690 Candidates.clear();
4691 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
4692 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
4693 I2->getOperand((Op + 1) % E));
4694 if (any_of(
4695 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
4696 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
4697 }))
4698 return false;
4699 }
4700 return true;
4701 };
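// Illustrative note (editorial, not from the original source): the lambda
// above only applies to two-element alternate-opcode bundles (e.g. one add
// and one sub) once the tree already has MinTreeSize entries. It gives up on
// such a pair when its operands neither look like existing vector material
// (instructions / vector-like instructions with constant operands) nor form
// operand pairs that score as splats under findBestRootPair, since a 2-wide
// alternate node with fully gathered operands is rarely profitable.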
4702 SmallVector<unsigned> SortedIndices;
4703 BasicBlock *BB = nullptr;
4704 bool IsScatterVectorizeUserTE =
4705 UserTreeIdx.UserTE &&
4706 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
4707 bool AreAllSameInsts =
4708 (S.getOpcode() && allSameBlock(VL)) ||
4709 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
4710 VL.size() > 2 &&
4711 all_of(VL,
4712 [&BB](Value *V) {
4713 auto *I = dyn_cast<GetElementPtrInst>(V);
4714 if (!I)
4715 return doesNotNeedToBeScheduled(V);
4716 if (!BB)
4717 BB = I->getParent();
4718 return BB == I->getParent() && I->getNumOperands() == 2;
4719 }) &&
4720 BB &&
4721 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
4722 SortedIndices));
4723 if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts ||
4724 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
4725 S.OpValue) &&
4726 !all_of(VL, isVectorLikeInstWithConstOps)) ||
4727 NotProfitableForVectorization(VL)) {
4728 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
4729 if (TryToFindDuplicates(S))
4730 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4731 ReuseShuffleIndicies);
4732 return;
4733 }
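// Illustrative note (editorial, not from the original source): besides the
// usual "same opcode, same basic block" bundles, AreAllSameInsts also accepts
// a bundle of pointer operands feeding a ScatterVectorize (masked gather)
// user: more than two pointers, every GEP with exactly two operands and all
// GEPs in one block, and the pointers sortable by sortPtrAccesses. Everything
// else caught above (all-constant, splat, insert/extract with non-constant
// indices, unprofitable 2-wide alternate nodes) becomes a gather node.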
4734
4735 // We now know that this is a vector of instructions of the same type from
4736 // the same block.
4737
4738 // Don't vectorize ephemeral values.
4739 if (!EphValues.empty()) {
4740 for (Value *V : VL) {
4741 if (EphValues.count(V)) {
4742 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
4743 << ") is ephemeral.\n");
4744 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4745 return;
4746 }
4747 }
4748 }
4749
4750 // Check if this is a duplicate of another entry.
4751 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
4752 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
4753 if (!E->isSame(VL)) {
4754 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
4755 if (TryToFindDuplicates(S))
4756 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4757 ReuseShuffleIndicies);
4758 return;
4759 }
4760 // Record the reuse of the tree node. FIXME, currently this is only used to
4761 // properly draw the graph rather than for the actual vectorization.
4762 E->UserTreeIndices.push_back(UserTreeIdx);
4763 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
4764 << ".\n");
4765 return;
4766 }
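// Illustrative note (editorial, not from the original source): a "perfect
// diamond merge" means the exact same bundle was already built as a tree
// entry, e.g. when two different users both consume the vectorized form of
// {%a, %b, %c, %d}; the existing entry is reused and only the new user edge
// is recorded, so the subtree is not rebuilt.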
4767
4768 // Check that none of the instructions in the bundle are already in the tree.
4769 for (Value *V : VL) {
4770 if (!IsScatterVectorizeUserTE && !isa<Instruction>(V))
4771 continue;
4772 if (getTreeEntry(V)) {
4773 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
4774 << ") is already in tree.\n");
4775 if (TryToFindDuplicates(S))
4776 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4777 ReuseShuffleIndicies);
4778 return;
4779 }
4780 }
4781
4782 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
4783 if (UserIgnoreList && !UserIgnoreList->empty()) {
4784 for (Value *V : VL) {
4785 if (UserIgnoreList && UserIgnoreList->contains(V)) {
4786 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
4787 if (TryToFindDuplicates(S))
4788 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4789 ReuseShuffleIndicies);
4790 return;
4791 }
4792 }
4793 }
4794
4795 // Special processing for sorted pointers for ScatterVectorize node with
4796 // constant indices only.
4797 if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&
4798 UserTreeIdx.UserTE &&
4799 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {
4800 assert(S.OpValue->getType()->isPointerTy() &&
4801 count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
4802 2 &&
4803 "Expected pointers only.");
4804 // Reset S to make it GetElementPtr kind of node.
4805 const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
4806 assert(It != VL.end() && "Expected at least one GEP.");
4807 S = getSameOpcode(*It);
4808 }
4809
4810 // Check that all of the users of the scalars that we want to vectorize are
4811 // schedulable.
4812 auto *VL0 = cast<Instruction>(S.OpValue);
4813 BB = VL0->getParent();
4814
4815 if (!DT->isReachableFromEntry(BB)) {
4816 // Don't go into unreachable blocks. They may contain instructions with
4817 // dependency cycles which confuse the final scheduling.
4818 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
4819 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4820 return;
4821 }
4822
4823 // Don't go into catchswitch blocks, which can happen with PHIs.
4824 // Such blocks can only have PHIs and the catchswitch. There is no
4825 // place to insert a shuffle if we need to, so just avoid that issue.
4826 if (isa<CatchSwitchInst>(BB->getTerminator())) {
4827 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
4828 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4829 return;
4830 }
4831
4832 // Check that every instruction appears once in this bundle.
4833 if (!TryToFindDuplicates(S))
4834 return;
4835
4836 auto &BSRef = BlocksSchedules[BB];
4837 if (!BSRef)
4838 BSRef = std::make_unique<BlockScheduling>(BB);
4839
4840 BlockScheduling &BS = *BSRef;
4841
4842 Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
4843#ifdef EXPENSIVE_CHECKS
4844 // Make sure we didn't break any internal invariants
4845 BS.verify();
4846#endif
4847 if (!Bundle) {
4848 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
4849 assert((!BS.getScheduleData(VL0) ||
4850 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
4851 "tryScheduleBundle should cancelScheduling on failure");
4852 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4853 ReuseShuffleIndicies);
4854 return;
4855 }
4856 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
4857
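// Illustrative note (editorial, not from the original source):
// tryScheduleBundle attempts to order the bundle's instructions back-to-back
// within the block's scheduling region; if the dependencies cannot be
// satisfied the bundle was already cancelled above and recorded as a gather
// node, so every case below can assume a schedulable bundle.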
4858 unsigned ShuffleOrOp = S.isAltShuffle() ?
4859 (unsigned) Instruction::ShuffleVector : S.getOpcode();
4860 switch (ShuffleOrOp) {
4861 case Instruction::PHI: {
4862 auto *PH = cast<PHINode>(VL0);
4863
4864 // Check for terminator values (e.g. invoke).
4865 for (Value *V : VL)
4866 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
4867 Instruction *Term = dyn_cast<Instruction>(Incoming);
4868 if (Term && Term->isTerminator()) {
4869 LLVM_DEBUG(dbgs()
4870 << "SLP: Need to swizzle PHINodes (terminator use).\n");
4871 BS.cancelScheduling(VL, VL0);
4872 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4873 ReuseShuffleIndicies);
4874 return;
4875 }
4876 }
4877
4878 TreeEntry *TE =
4879 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
4880 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
4881
4882 // Keeps the reordered operands to avoid code duplication.
4883 SmallVector<ValueList, 2> OperandsVec;
4884 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
4885 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
4886 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
4887 TE->setOperand(I, Operands);
4888 OperandsVec.push_back(Operands);
4889 continue;
4890 }
4891 ValueList Operands;
4892 // Prepare the operand vector.
4893 for (Value *V : VL)
4894 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
4895 PH->getIncomingBlock(I)));
4896 TE->setOperand(I, Operands);
4897 OperandsVec.push_back(Operands);
4898 }
4899 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
4900 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
4901 return;
4902 }
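// Illustrative sketch (editorial, not from the original source): for a PHI
// bundle {phi [%a0, BB0], [%a1, BB1]; phi [%b0, BB0], [%b1, BB1]} the loop
// above builds one operand list per incoming block of the first PHI, here
// {%a0, %b0} for BB0 and {%a1, %b1} for BB1, and recurses on each; operands
// coming from unreachable predecessor blocks are replaced by poison values.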
4903 case Instruction::ExtractValue:
4904 case Instruction::ExtractElement: {
4905 OrdersType CurrentOrder;
4906 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
4907 if (Reuse) {
4908 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
4909 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4910 ReuseShuffleIndicies);
4911 // This is a special case, as it does not gather, but at the same time
4912 // we are not extending buildTree_rec() towards the operands.
4913 ValueList Op0;
4914 Op0.assign(VL.size(), VL0->getOperand(0));
4915 VectorizableTree.back()->setOperand(0, Op0);
4916 return;
4917 }
4918 if (!CurrentOrder.empty()) {
4919 LLVM_DEBUG({
4920 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
4921 "with order";
4922 for (unsigned Idx : CurrentOrder)
4923 dbgs() << " " << Idx;
4924 dbgs() << "\n";
4925 });
4926 fixupOrderingIndices(CurrentOrder);
4927 // Insert new order with initial value 0, if it does not exist,
4928 // otherwise return the iterator to the existing one.
4929 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4930 ReuseShuffleIndicies, CurrentOrder);
4931 // This is a special case, as it does not gather, but at the same time
4932 // we are not extending buildTree_rec() towards the operands.
4933 ValueList Op0;
4934 Op0.assign(VL.size(), VL0->getOperand(0));
4935 VectorizableTree.back()->setOperand(0, Op0);
4936 return;
4937 }
4938 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
4939 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4940 ReuseShuffleIndicies);
4941 BS.cancelScheduling(VL, VL0);
4942 return;
4943 }
4944 case Instruction::InsertElement: {
4945 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
4946
4947 // Check that we have a buildvector and not a shuffle of 2 or more
4948 // different vectors.
4949 ValueSet SourceVectors;
4950 for (Value *V : VL) {
4951 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
4952 assert(getInsertIndex(V) != None && "Non-constant or undef index?");
4953 }
4954
4955 if (count_if(VL, [&SourceVectors](Value *V) {
4956 return !SourceVectors.contains(V);
4957 }) >= 2) {
4958 // Found 2nd source vector - cancel.
4959 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
4960 "different source vectors.\n");
4961 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
4962 BS.cancelScheduling(VL, VL0);
4963 return;
4964 }
4965
4966 auto OrdCompare = [](const std::pair<int, int> &P1,
4967 const std::pair<int, int> &P2) {
4968 return P1.first > P2.first;
4969 };
4970 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
4971 decltype(OrdCompare)>
4972 Indices(OrdCompare);
4973 for (int I = 0, E = VL.size(); I < E; ++I) {
4974 unsigned Idx = *getInsertIndex(VL[I]);
4975 Indices.emplace(Idx, I);
4976 }
4977 OrdersType CurrentOrder(VL.size(), VL.size());
4978 bool IsIdentity = true;
4979 for (int I = 0, E = VL.size(); I < E; ++I) {
4980 CurrentOrder[Indices.top().second] = I;
4981 IsIdentity &= Indices.top().second == I;
4982 Indices.pop();
4983 }
4984 if (IsIdentity)
4985 CurrentOrder.clear();
4986 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4987 None, CurrentOrder);
4988 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
4989
4990 constexpr int NumOps = 2;
4991 ValueList VectorOperands[NumOps];
4992 for (int I = 0; I < NumOps; ++I) {
4993 for (Value *V : VL)
4994 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
4995
4996 TE->setOperand(I, VectorOperands[I]);
4997 }
4998 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
4999 return;
5000 }
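// Illustrative sketch (editorial, not from the original source): for a
// buildvector bundle whose insertelement indices appear in the order
// {2, 0, 1}, the priority queue above visits them in increasing index order
// and records CurrentOrder = {2, 0, 1}; had the inserts already appeared as
// {0, 1, 2}, the permutation would be the identity and CurrentOrder is
// cleared. Only the inserted scalars (operand 1) are recursed on; the
// running vector operand is not.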
5001 case Instruction::Load: {
5002 // Check that a vectorized load would load the same memory as a scalar
5003 // load. For example, we don't want to vectorize loads that are smaller
5004 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5005 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5006 // from such a struct, we read/write packed bits disagreeing with the
5007 // unvectorized version.
5008 SmallVector<Value *> PointerOps;
5009 OrdersType CurrentOrder;
5010 TreeEntry *TE = nullptr;
5011 switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,
5012 PointerOps)) {
5013 case LoadsState::Vectorize:
5014 if (CurrentOrder.empty()) {
5015 // Original loads are consecutive and do not require reordering.
5016 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5017 ReuseShuffleIndicies);
5018 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
5019 } else {
5020 fixupOrderingIndices(CurrentOrder);
5021 // Need to reorder.
5022 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5023 ReuseShuffleIndicies, CurrentOrder);
5024 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
5025 }
5026 TE->setOperandsInOrder();
5027 break;
5028 case LoadsState::ScatterVectorize:
5029 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
5030 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
5031 UserTreeIdx, ReuseShuffleIndicies);
5032 TE->setOperandsInOrder();
5033 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
5034 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
5035 break;
5036 case LoadsState::Gather:
5037 BS.cancelScheduling(VL, VL0);
5038 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5039 ReuseShuffleIndicies);
5040#ifndef NDEBUG
5041 Type *ScalarTy = VL0->getType();
5042 if (DL->getTypeSizeInBits(ScalarTy) !=
5043 DL->getTypeAllocSizeInBits(ScalarTy))
5044 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
5045 else if (any_of(VL, [](Value *V) {
5046 return !cast<LoadInst>(V)->isSimple();
5047 }))
5048 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
5049 else
5050 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
5051#endif // NDEBUG
5052 break;
5053 }
5054 return;
5055 }
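// Illustrative note (editorial, not from the original source):
// canVectorizeLoads classifies the bundle as Vectorize when the pointers are
// consecutive, possibly after reordering (e.g. loads of a[0..3] listed as
// a[2], a[0], a[1], a[3]); as ScatterVectorize when the addresses are not
// consecutive but can themselves be vectorized, in which case an
// llvm.masked.gather is used and the pointer operands are recursed on; and
// as Gather otherwise (non-simple loads, non-packed types, or unrelated
// addresses).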
5056 case Instruction::ZExt:
5057 case Instruction::SExt:
5058 case Instruction::FPToUI:
5059 case Instruction::FPToSI:
5060 case Instruction::FPExt:
5061 case Instruction::PtrToInt:
5062 case Instruction::IntToPtr:
5063 case Instruction::SIToFP:
5064 case Instruction::UIToFP:
5065 case Instruction::Trunc:
5066 case Instruction::FPTrunc:
5067 case Instruction::BitCast: {
5068 Type *SrcTy = VL0->getOperand(0)->getType();
5069 for (Value *V : VL) {
5070 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
5071 if (Ty != SrcTy || !isValidElementType(Ty)) {
5072 BS.cancelScheduling(VL, VL0);
5073 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5074 ReuseShuffleIndicies);
5075 LLVM_DEBUG(dbgs()
5076 << "SLP: Gathering casts with different src types.\n");
5077 return;
5078 }
5079 }
5080 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5081 ReuseShuffleIndicies);
5082 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
5083
5084 TE->setOperandsInOrder();
5085 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
5086 ValueList Operands;
5087 // Prepare the operand vector.
5088 for (Value *V : VL)
5089 Operands.push_back(cast<Instruction>(V)->getOperand(i));
5090
5091 buildTree_rec(Operands, Depth + 1, {TE, i});
5092 }
5093 return;
5094 }
5095 case Instruction::ICmp:
5096 case Instruction::FCmp: {
5097 // Check that all of the compares have the same predicate.
5098 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
5099 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
5100 Type *ComparedTy = VL0->getOperand(0)->getType();
5101 for (Value *V : VL) {
5102 CmpInst *Cmp = cast<CmpInst>(V);
5103 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
5104 Cmp->getOperand(0)->getType() != ComparedTy) {
5105 BS.cancelScheduling(VL, VL0);
5106 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5107 ReuseShuffleIndicies);
5108 LLVM_DEBUG(dbgs()
5109 << "SLP: Gathering cmp with different predicate.\n");
5110 return;
5111 }
5112 }
5113
5114 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5115 ReuseShuffleIndicies);
5116 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
5117
5118 ValueList Left, Right;
5119 if (cast<CmpInst>(VL0)->isCommutative()) {
5120 // Commutative predicate - collect + sort operands of the instructions
5121 // so that each side is more likely to have the same opcode.
5122 assert(P0 == SwapP0 && "Commutative Predicate mismatch");
5123 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
5124 } else {
5125 // Collect operands - commute if it uses the swapped predicate.
5126 for (Value *V : VL) {
5127 auto *Cmp = cast<CmpInst>(V);
5128 Value *LHS = Cmp->getOperand(0);
5129 Value *RHS = Cmp->getOperand(1);
5130 if (Cmp->getPredicate() != P0)
5131 std::swap(LHS, RHS);
5132 Left.push_back(LHS);
5133 Right.push_back(RHS);
5134 }
5135 }
5136 TE->setOperand(0, Left);
5137 TE->setOperand(1, Right);
5138 buildTree_rec(Left, Depth + 1, {TE, 0});
5139 buildTree_rec(Right, Depth + 1, {TE, 1});
5140 return;
5141 }
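// Illustrative sketch (editorial, not from the original source): for a
// non-commutative predicate bundle {icmp slt %a, %b; icmp sgt %d, %c}, the
// second compare uses the swapped predicate, so its operands are commuted
// and the operand lists become Left = {%a, %c} and Right = {%b, %d}; both
// compares can then be emitted as a single 'icmp slt' on the vectorized
// operands.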
5142 case Instruction::Select:
5143 case Instruction::FNeg:
5144 case Instruction::Add:
5145 case Instruction::FAdd:
5146 case Instruction::Sub:
5147 case Instruction::FSub:
5148 case Instruction::Mul:
5149 case Instruction::FMul:
5150 case Instruction::UDiv:
5151 case Instruction::SDiv:
5152 case Instruction::FDiv:
5153 case Instruction::URem:
5154 case Instruction::SRem:
5155 case Instruction::FRem:
5156 case Instruction::Shl:
5157 case Instruction::LShr:
5158 case Instruction::AShr:
5159 case Instruction::And:
5160 case Instruction::Or:
5161 case Instruction::Xor: {
5162 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5163 ReuseShuffleIndicies);
5164 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
5165
5166 // Sort operands of the instructions so that each side is more likely to
5167 // have the same opcode.
5168 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
5169 ValueList Left, Right;
5170 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
5171 TE->setOperand(0, Left);
5172 TE->setOperand(1, Right);
5173 buildTree_rec(Left, Depth + 1, {TE, 0});
5174 buildTree_rec(Right, Depth + 1, {TE, 1});
5175 return;
5176 }
5177
5178 TE->setOperandsInOrder();
5179 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
5180 ValueList Operands;
5181 // Prepare the operand vector.
5182 for (Value *V : VL)
5183 Operands.push_back(cast<Instruction>(V)->getOperand(i));
5184
5185 buildTree_rec(Operands, Depth + 1, {TE, i});
5186 }
5187 return;
5188 }
5189 case Instruction::GetElementPtr: {
5190 // We don't combine GEPs with complicated (nested) indexing.
5191 for (Value *V : VL) {
5192 auto *I = dyn_cast<GetElementPtrInst>(V);
5193 if (!I)
5194 continue;
5195 if (I->getNumOperands() != 2) {
5196 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
5197 BS.cancelScheduling(VL, VL0);
5198 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5199 ReuseShuffleIndicies);
5200 return;
5201 }
5202 }
5203
5204 // We can't combine several GEPs into one vector if they operate on
5205 // different types.
5206 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
5207 for (Value *V : VL) {
5208 auto *GEP = dyn_cast<GEPOperator>(V);
5209 if (!GEP)
5210 continue;
5211 Type *CurTy = GEP->getSourceElementType();
5212 if (Ty0 != CurTy) {
5213 LLVM_DEBUG(dbgs()
5214 << "SLP: not-vectorizable GEP (different types).\n");
5215 BS.cancelScheduling(VL, VL0);
5216 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5217 ReuseShuffleIndicies);
5218 return;
5219 }
5220 }
5221
5222 // We don't combine GEPs with non-constant indexes.
5223 Type *Ty1 = VL0->getOperand(1)->getType();
5224 for (Value *V : VL) {
5225 auto *I = dyn_cast<GetElementPtrInst>(V);
5226 if (!I)
5227 continue;
5228 auto *Op = I->getOperand(1);
5229 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
5230 (Op->getType() != Ty1 &&
5231 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
5232 Op->getType()->getScalarSizeInBits() >
5233 DL->getIndexSizeInBits(
5234 V->getType()->getPointerAddressSpace())))) {
5235 LLVM_DEBUG(dbgs()
5236 << "SLP: not-vectorizable GEP (non-constant indexes).\n");
5237 BS.cancelScheduling(VL, VL0);
5238 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5239 ReuseShuffleIndicies);
5240 return;
5241 }
5242 }
5243
5244 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5245 ReuseShuffleIndicies);
5246 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
5247 SmallVector<ValueList, 2> Operands(2);
5248 // Prepare the operand vector for pointer operands.
5249 for (Value *V : VL) {
5250 auto *GEP = dyn_cast<GetElementPtrInst>(V);
5251 if (!GEP) {
5252 Operands.front().push_back(V);
5253 continue;
5254 }
5255 Operands.front().push_back(GEP->getPointerOperand());
5256 }
5257 TE->setOperand(0, Operands.front());
5258 // Need to cast all indices to the same type before vectorization to
5259 // avoid crash.
5260 // Required to be able to find correct matches between different gather
5261 // nodes and reuse the vectorized values rather than trying to gather them
5262 // again.
5263 int IndexIdx = 1;
5264 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
5265 Type *Ty = all_of(VL,
5266 [VL0Ty, IndexIdx](Value *V) {
5267 auto *GEP = dyn_cast<GetElementPtrInst>(V);
5268 if (!GEP)
5269 return true;
5270 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
5271 })
5272 ? VL0Ty
5273 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
5274 ->getPointerOperandType()
5275 ->getScalarType());
5276 // Prepare the operand vector.
5277 for (Value *V : VL) {
5278 auto *I = dyn_cast<GetElementPtrInst>(V);
5279 if (!I) {
5280 Operands.back().push_back(
5281 ConstantInt::get(Ty, 0, /*isSigned=*/false));
5282 continue;
5283 }
5284 auto *Op = I->getOperand(IndexIdx);
5285 auto *CI = dyn_cast<ConstantInt>(Op);
5286 if (!CI)
5287 Operands.back().push_back(Op);
5288 else
5289 Operands.back().push_back(ConstantExpr::getIntegerCast(
5290 CI, Ty, CI->getValue().isSignBitSet()));
5291 }
5292 TE->setOperand(IndexIdx, Operands.back());
5293
5294 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
5295 buildTree_rec(Operands[I], Depth + 1, {TE, I});
5296 return;
5297 }
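// Illustrative sketch (editorial, not from the original source): for a GEP
// bundle {getelementptr i32, ptr %p, i64 1; getelementptr i32, ptr %p, i32 2}
// the index types differ, so both constant indices are cast to the pointer
// index type from the DataLayout (assume i64 here), giving an index operand
// list of {i64 1, i64 2}; a non-GEP member of the bundle contributes the
// value itself as the pointer operand and a zero of that type as its index.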
5298 case Instruction::Store: {
5299 // Check if the stores are consecutive or if we need to swizzle them.
5300 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
5301 // Avoid types that are padded when being allocated as scalars, while
5302 // being packed together in a vector (such as i1).
5303 if (DL->getTypeSizeInBits(ScalarTy) !=
5304 DL->getTypeAllocSizeInBits(ScalarTy)) {
5305 BS.cancelScheduling(VL, VL0);
5306 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5307 ReuseShuffleIndicies);
5308 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
5309 return;
5310 }
5311 // Make sure all stores in the bundle are simple - we can't vectorize
5312 // atomic or volatile stores.
5313 SmallVector<Value *, 4> PointerOps(VL.size());
5314 ValueList Operands(VL.size());
5315 auto POIter = PointerOps.begin();
5316 auto OIter = Operands.begin();
5317 for (Value *V : VL) {
5318 auto *SI = cast<StoreInst>(V);
5319 if (!SI->isSimple()) {
5320 BS.cancelScheduling(VL, VL0);
5321 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5322 ReuseShuffleIndicies);
5323 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
5324 return;
5325 }
5326 *POIter = SI->getPointerOperand();
5327 *OIter = SI->getValueOperand();
5328 ++POIter;
5329 ++OIter;
5330 }
5331
5332 OrdersType CurrentOrder;
5333 // Check the order of pointer operands.
5334 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
5335 Value *Ptr0;
5336 Value *PtrN;
5337 if (CurrentOrder.empty()) {
5338 Ptr0 = PointerOps.front();
5339 PtrN = PointerOps.back();
5340 } else {
5341 Ptr0 = PointerOps[CurrentOrder.front()];
5342 PtrN = PointerOps[CurrentOrder.back()];
5343 }
5344 Optional<int> Dist =
5345 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5346 // Check that the sorted pointer operands are consecutive.
5347 if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
5348 if (CurrentOrder.empty()) {
5349 // Original stores are consecutive and do not require reordering.
5350 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
5351 UserTreeIdx, ReuseShuffleIndicies);
5352 TE->setOperandsInOrder();
5353 buildTree_rec(Operands, Depth + 1, {TE, 0});
5354 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
5355 } else {
5356 fixupOrderingIndices(CurrentOrder);
5357 TreeEntry *TE =
5358 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5359 ReuseShuffleIndicies, CurrentOrder);
5360 TE->setOperandsInOrder();
5361 buildTree_rec(Operands, Depth + 1, {TE, 0});
5362 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
5363 }
5364 return;
5365 }
5366 }
5367
5368 BS.cancelScheduling(VL, VL0);
5369 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5370 ReuseShuffleIndicies);
5371 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
5372 return;
5373 }
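// Illustrative sketch (editorial, not from the original source): for i32
// stores to a[1], a[3], a[0], a[2], sortPtrAccesses produces an order that
// sorts the pointers, and getPointersDiff between the lowest (&a[0]) and
// highest (&a[3]) address returns 3 == VL.size() - 1, so the bundle is kept
// as a vector of jumbled stores with CurrentOrder describing the
// permutation. Stores to a[0], a[1], a[2], a[4] give a distance of 4, so the
// consecutiveness check fails and the bundle is gathered as non-consecutive.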
5374 case Instruction::Call: {
5375 // Check if the calls are all to the same vectorizable intrinsic or
5376 // library function.
5377 CallInst *CI = cast<CallInst>(VL0);
5378 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5379
5380 VFShape Shape = VFShape::get(
5381 *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
5382 false /*HasGlobalPred*/);
5383 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
5384
5385 if (!VecFunc && !isTriviallyVectorizable(ID)) {
5386 BS.cancelScheduling(VL, VL0);
5387 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5388 ReuseShuffleIndicies);
5389 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
5390 return;
5391 }
5392 Function *F = CI->getCalledFunction();
5393 unsigned NumArgs = CI->arg_size();
5394 SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
5395 for (unsigned j = 0; j != NumArgs; ++j)
5396 if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
5397 ScalarArgs[j] = CI->getArgOperand(j);
5398 for (Value *V : VL) {
5399 CallInst *CI2 = dyn_cast<CallInst>(V);
5400 if (!CI2 || CI2->getCalledFunction() != F ||
5401 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
5402 (VecFunc &&
5403 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
5404 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
5405 BS.cancelScheduling(VL, VL0);
5406 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5407 ReuseShuffleIndicies);
5408 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
5409 << "\n");
5410 return;
5411 }
5412 // Some intrinsics have scalar arguments and should be same in order for
5413 // them to be vectorized.
5414 for (unsigned j = 0; j != NumArgs; ++j) {
5415 if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
5416 Value *A1J = CI2->getArgOperand(j);
5417 if (ScalarArgs[j] != A1J) {
5418 BS.cancelScheduling(VL, VL0);
5419 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5420 ReuseShuffleIndicies);
5421 LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
5422 << " argument " << ScalarArgs[j] << "!=" << A1J
5423 << "\n");
5424 return;
5425 }
5426 }
5427 }
5428 // Verify that the bundle operands are identical between the two calls.
5429 if (CI->hasOperandBundles() &&
5430 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
5431 CI->op_begin() + CI->getBundleOperandsEndIndex(),
5432 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
5433 BS.cancelScheduling(VL, VL0);
5434 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5435 ReuseShuffleIndicies);
5436 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
5437 << *CI << "!=" << *V << '\n');
5438 return;
5439 }
5440 }
5441
5442 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5443 ReuseShuffleIndicies);
5444 TE->setOperandsInOrder();
5445 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
5446 // For scalar operands no need to create an entry since no need to
5447 // vectorize it.
5448 if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
5449 continue;
5450 ValueList Operands;
5451 // Prepare the operand vector.
5452 for (Value *V : VL) {
5453 auto *CI2 = cast<CallInst>(V);
5454 Operands.push_back(CI2->getArgOperand(i));
5455 }
5456 buildTree_rec(Operands, Depth + 1, {TE, i});
5457 }
5458 return;
5459 }
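// Illustrative note (editorial, not from the original source): intrinsics
// with scalar operands must agree on those operands across the whole bundle;
// e.g. calls to llvm.powi can only be vectorized together if every call uses
// the same exponent, because the exponent stays scalar in the vector form,
// and such scalar arguments are deliberately not recursed on as operands.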
5460 case Instruction::ShuffleVector: {
5461 // If this is not an alternate sequence of opcode like add-sub
5462 // then do not vectorize this instruction.
5463 if (!S.isAltShuffle()) {
5464 BS.cancelScheduling(VL, VL0);
5465 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
5466 ReuseShuffleIndicies);
5467 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
5468 return;
5469 }
5470 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
5471 ReuseShuffleIndicies);
5472 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
5473
5474 // Reorder operands if reordering would enable vectorization.
5475 auto *CI = dyn_cast<CmpInst>(VL0);
5476 if (isa<BinaryOperator>(VL0) || CI) {
5477 ValueList Left, Right;
5478 if (!CI || all_of(VL, [](Value *V) {
5479 return cast<CmpInst>(V)->isCommutative();
5480 })) {
5481 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
5482 } else {
5483 CmpInst::Predicate P0 = CI->getPredicate();
5484 CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
5485 assert(P0 != AltP0 &&
5486 "Expected different main/alternate predicates.");
5487 CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
5488 Value *BaseOp0 = VL0->getOperand(0);
5489 Value *BaseOp1 = VL0->getOperand(1);
5490 // Collect operands - commute if it uses the swapped predicate or
5491 // alternate operation.
5492 for (Value *V : VL) {
5493 auto *Cmp = cast<CmpInst>(V);
5494 Value *LHS = Cmp->getOperand(0);