1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
121 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
125static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
134static cl::opt<bool> ShouldStartVectorizeHorAtStore(
135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
142 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
147 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
150static cl::opt<unsigned>
151 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
154/// Limits the size of scheduling regions in a block.
155/// It avoids long compile times for _very_ large blocks where vector
156/// instructions are spread over a wide range.
157/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
163 "slp-min-reg-size", cl::init(128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
167 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
171 "slp-min-tree-size", cl::init(3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
177 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
180// The maximum depth that the look-ahead score heuristic will explore
181// when it is probing among candidates for vectorization tree roots.
182// The higher this value, the higher the compilation time overhead. But unlike
183// the similar limit for operand reordering, this one is used less frequently,
184// hence the impact of a higher value is less noticeable.
185static cl::opt<int> RootLookAheadMaxDepth(
186 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
190 "slp-min-strided-loads", cl::init(2), cl::Hidden,
191 cl::desc("The minimum number of loads, which should be considered strided, "
192 "if the stride is > 1 or is runtime value"));
193
195 "slp-max-stride", cl::init(8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
203 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
210// Limit of the number of uses for potentially transformed instructions/values,
211// used in checks to avoid compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
228/// The most important thing to filter here are types which are invalid in LLVM
229/// vectors. We also filter target specific types which have absolutely no
230/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// be inevitably scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ScalarTy->getScalarType(),
253 VF * getNumElements(ScalarTy));
254}
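// For example, getWidenedType(i32, 4) yields <4 x i32>. Under REVEC, where
// ScalarTy may itself be a vector, getWidenedType(<2 x float>, 4) yields
// <8 x float>: the element type is the scalar type of ScalarTy and the element
// count is VF * getNumElements(ScalarTy).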
255
256static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
257 SmallVectorImpl<int> &Mask) {
258 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
259 // But an element has a different meaning for SLP (scalar) and REVEC
260 // (vector). We need to expand Mask into masks which shufflevector can use
261 // directly.
262 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
263 for (unsigned I : seq<unsigned>(Mask.size()))
264 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
265 I * VecTyNumElements, VecTyNumElements)))
266 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
267 : Mask[I] * VecTyNumElements + J;
268 Mask.swap(NewMask);
269}
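// For example, with VecTyNumElements = 2 and Mask = {1, 0}, the expanded mask
// becomes {2, 3, 0, 1}: each scalar index I is widened to the VecTyNumElements
// consecutive indices starting at Mask[I] * VecTyNumElements, and poison
// entries expand to poison entries.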
270
271/// \returns True if the value is a constant (but not globals/constant
272/// expressions).
273static bool isConstant(Value *V) {
274 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
275}
276
277/// Checks if \p V is one of vector-like instructions, i.e. undef,
278/// insertelement/extractelement with constant indices for fixed vector type or
279/// extractvalue instruction.
280static bool isVectorLikeInstWithConstOps(Value *V) {
281 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
282 !isa<ExtractValueInst, UndefValue>(V))
283 return false;
284 auto *I = dyn_cast<Instruction>(V);
285 if (!I || isa<ExtractValueInst>(I))
286 return true;
287 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
288 return false;
289 if (isa<ExtractElementInst>(I))
290 return isConstant(I->getOperand(1));
291 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
292 return isConstant(I->getOperand(2));
293}
294
295/// Returns power-of-2 number of elements in a single register (part), given the
296/// total number of elements \p Size and number of registers (parts) \p
297/// NumParts.
298static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
299 return PowerOf2Ceil(divideCeil(Size, NumParts));
300}
301
302/// Returns correct remaining number of elements, considering total amount \p
303/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
304/// and current register (part) \p Part.
305static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
306 unsigned Part) {
307 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
308}
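// For example, for Size = 12 and NumParts = 2, getPartNumElems returns
// PowerOf2Ceil(divideCeil(12, 2)) = 8, and getNumElems returns 8 for part 0
// and min(8, 12 - 8) = 4 for part 1.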
309
310#if !defined(NDEBUG)
311/// Print a short descriptor of the instruction bundle suitable for debug output.
312static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
313 std::string Result;
314 raw_string_ostream OS(Result);
315 if (Idx >= 0)
316 OS << "Idx: " << Idx << ", ";
317 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
318 OS.flush();
319 return Result;
320}
321#endif
322
323/// \returns true if all of the instructions in \p VL are in the same block or
324/// false otherwise.
325static bool allSameBlock(ArrayRef<Value *> VL) {
326 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
327 if (!I0)
328 return false;
329 if (all_of(VL, isVectorLikeInstWithConstOps))
330 return true;
331
332 BasicBlock *BB = I0->getParent();
333 for (int I = 1, E = VL.size(); I < E; I++) {
334 auto *II = dyn_cast<Instruction>(VL[I]);
335 if (!II)
336 return false;
337
338 if (BB != II->getParent())
339 return false;
340 }
341 return true;
342}
343
344/// \returns True if all of the values in \p VL are constants (but not
345/// globals/constant expressions).
346static bool allConstant(ArrayRef<Value *> VL) {
347 // Constant expressions and globals can't be vectorized like normal integer/FP
348 // constants.
349 return all_of(VL, isConstant);
350}
351
352/// \returns True if all of the values in \p VL are identical or some of them
353/// are UndefValue.
354static bool isSplat(ArrayRef<Value *> VL) {
355 Value *FirstNonUndef = nullptr;
356 for (Value *V : VL) {
357 if (isa<UndefValue>(V))
358 continue;
359 if (!FirstNonUndef) {
360 FirstNonUndef = V;
361 continue;
362 }
363 if (V != FirstNonUndef)
364 return false;
365 }
366 return FirstNonUndef != nullptr;
367}
368
369/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
370static bool isCommutative(Instruction *I) {
371 if (auto *Cmp = dyn_cast<CmpInst>(I))
372 return Cmp->isCommutative();
373 if (auto *BO = dyn_cast<BinaryOperator>(I))
374 return BO->isCommutative() ||
375 (BO->getOpcode() == Instruction::Sub &&
376 !BO->hasNUsesOrMore(UsesLimit) &&
377 all_of(
378 BO->uses(),
379 [](const Use &U) {
380 // Commutative, if icmp eq/ne sub, 0
381 ICmpInst::Predicate Pred;
382 if (match(U.getUser(),
383 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
384 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
385 return true;
386 // Commutative, if abs(sub nsw, true) or abs(sub, false).
387 ConstantInt *Flag;
388 return match(U.getUser(),
389 m_Intrinsic<Intrinsic::abs>(
390 m_Specific(U.get()), m_ConstantInt(Flag))) &&
391 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
392 Flag->isOne());
393 })) ||
394 (BO->getOpcode() == Instruction::FSub &&
395 !BO->hasNUsesOrMore(UsesLimit) &&
396 all_of(BO->uses(), [](const Use &U) {
397 return match(U.getUser(),
398 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
399 }));
400 return I->isCommutative();
401}
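// For example, a 'sub %x, %y' whose only use is 'icmp eq (sub %x, %y), 0' is
// treated as commutative here: %x - %y == 0 holds exactly when %y - %x == 0,
// so the operands may be swapped during operand reordering.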
402
403template <typename T>
404static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
405 unsigned Offset) {
406 static_assert(std::is_same_v<T, InsertElementInst> ||
407 std::is_same_v<T, ExtractElementInst>,
408 "unsupported T");
409 int Index = Offset;
410 if (const auto *IE = dyn_cast<T>(Inst)) {
411 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
412 if (!VT)
413 return std::nullopt;
414 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
415 if (!CI)
416 return std::nullopt;
417 if (CI->getValue().uge(VT->getNumElements()))
418 return std::nullopt;
419 Index *= VT->getNumElements();
420 Index += CI->getZExtValue();
421 return Index;
422 }
423 return std::nullopt;
424}
425
426/// \returns inserting or extracting index of InsertElement, ExtractElement or
427/// InsertValue instruction, using Offset as base offset for index.
428/// \returns std::nullopt if the index is not an immediate.
429static std::optional<unsigned> getElementIndex(const Value *Inst,
430 unsigned Offset = 0) {
431 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
432 return Index;
433 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
434 return Index;
435
436 int Index = Offset;
437
438 const auto *IV = dyn_cast<InsertValueInst>(Inst);
439 if (!IV)
440 return std::nullopt;
441
442 Type *CurrentType = IV->getType();
443 for (unsigned I : IV->indices()) {
444 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
445 Index *= ST->getNumElements();
446 CurrentType = ST->getElementType(I);
447 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
448 Index *= AT->getNumElements();
449 CurrentType = AT->getElementType();
450 } else {
451 return std::nullopt;
452 }
453 Index += I;
454 }
455 return Index;
456}
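// For example, for 'insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2' the
// returned index is 1 * 3 + 2 = 5, i.e. the flattened position of element
// (1, 2) in the 2 x 3 aggregate.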
457
458namespace {
459/// Specifies the way the mask should be analyzed for undefs/poisonous elements
460/// in the shuffle mask.
461enum class UseMask {
462 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
463 ///< check for the mask elements for the first argument (mask
464 ///< indices are in range [0:VF)).
465 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
466 ///< for the mask elements for the second argument (mask indices
467 ///< are in range [VF:2*VF))
468 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
469 ///< future shuffle elements and mark them as ones as being used
470 ///< in future. Non-undef elements are considered as unused since
471 ///< they're already marked as used in the mask.
472};
473} // namespace
474
475/// Prepares a use bitset for the given mask either for the first argument or
476/// for the second.
477static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
478 UseMask MaskArg) {
479 SmallBitVector UseMask(VF, true);
480 for (auto [Idx, Value] : enumerate(Mask)) {
481 if (Value == PoisonMaskElem) {
482 if (MaskArg == UseMask::UndefsAsMask)
483 UseMask.reset(Idx);
484 continue;
485 }
486 if (MaskArg == UseMask::FirstArg && Value < VF)
487 UseMask.reset(Value);
488 else if (MaskArg == UseMask::SecondArg && Value >= VF)
489 UseMask.reset(Value - VF);
490 }
491 return UseMask;
492}
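// For example, for VF = 4 and Mask = {0, 5, 1, PoisonMaskElem}:
// - FirstArg clears bits 0 and 1 (lanes of the first vector that are used),
// - SecondArg clears bit 1 (mask value 5 maps to lane 5 - VF = 1),
// - UndefsAsMask clears bit 3 (the position holding the poison element).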
493
494/// Checks if the given value is actually an undefined constant vector.
495/// Also, if the \p UseMask is not empty, tries to check if the non-masked
496/// elements actually mask the insertelement buildvector, if any.
497template <bool IsPoisonOnly = false>
498static SmallBitVector isUndefVector(const Value *V,
499 const SmallBitVector &UseMask = {}) {
500 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
501 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
502 if (isa<T>(V))
503 return Res;
504 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
505 if (!VecTy)
506 return Res.reset();
507 auto *C = dyn_cast<Constant>(V);
508 if (!C) {
509 if (!UseMask.empty()) {
510 const Value *Base = V;
511 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
512 Base = II->getOperand(0);
513 if (isa<T>(II->getOperand(1)))
514 continue;
515 std::optional<unsigned> Idx = getElementIndex(II);
516 if (!Idx) {
517 Res.reset();
518 return Res;
519 }
520 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
521 Res.reset(*Idx);
522 }
523 // TODO: Add analysis for shuffles here too.
524 if (V == Base) {
525 Res.reset();
526 } else {
527 SmallBitVector SubMask(UseMask.size(), false);
528 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
529 }
530 } else {
531 Res.reset();
532 }
533 return Res;
534 }
535 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
536 if (Constant *Elem = C->getAggregateElement(I))
537 if (!isa<T>(Elem) &&
538 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
539 Res.reset(I);
540 }
541 return Res;
542}
543
544/// Checks if the vector of instructions can be represented as a shuffle, like:
545/// %x0 = extractelement <4 x i8> %x, i32 0
546/// %x3 = extractelement <4 x i8> %x, i32 3
547/// %y1 = extractelement <4 x i8> %y, i32 1
548/// %y2 = extractelement <4 x i8> %y, i32 2
549/// %x0x0 = mul i8 %x0, %x0
550/// %x3x3 = mul i8 %x3, %x3
551/// %y1y1 = mul i8 %y1, %y1
552/// %y2y2 = mul i8 %y2, %y2
553/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
554/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
555/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
556/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
557/// ret <4 x i8> %ins4
558/// can be transformed into:
559/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
560/// i32 6>
561/// %2 = mul <4 x i8> %1, %1
562/// ret <4 x i8> %2
563/// Mask will return the Shuffle Mask equivalent to the extracted elements.
564/// TODO: Can we split off and reuse the shuffle mask detection from
565/// ShuffleVectorInst/getShuffleCost?
566static std::optional<TargetTransformInfo::ShuffleKind>
567isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
568 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
569 if (It == VL.end())
570 return std::nullopt;
571 unsigned Size =
572 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
573 auto *EI = dyn_cast<ExtractElementInst>(V);
574 if (!EI)
575 return S;
576 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
577 if (!VTy)
578 return S;
579 return std::max(S, VTy->getNumElements());
580 });
581
582 Value *Vec1 = nullptr;
583 Value *Vec2 = nullptr;
584 bool HasNonUndefVec = any_of(VL, [](Value *V) {
585 auto *EE = dyn_cast<ExtractElementInst>(V);
586 if (!EE)
587 return false;
588 Value *Vec = EE->getVectorOperand();
589 if (isa<UndefValue>(Vec))
590 return false;
591 return isGuaranteedNotToBePoison(Vec);
592 });
593 enum ShuffleMode { Unknown, Select, Permute };
594 ShuffleMode CommonShuffleMode = Unknown;
595 Mask.assign(VL.size(), PoisonMaskElem);
596 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
597 // Undef can be represented as an undef element in a vector.
598 if (isa<UndefValue>(VL[I]))
599 continue;
600 auto *EI = cast<ExtractElementInst>(VL[I]);
601 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
602 return std::nullopt;
603 auto *Vec = EI->getVectorOperand();
604 // We can extractelement from undef or poison vector.
605 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
606 continue;
607 // All vector operands must have the same number of vector elements.
608 if (isa<UndefValue>(Vec)) {
609 Mask[I] = I;
610 } else {
611 if (isa<UndefValue>(EI->getIndexOperand()))
612 continue;
613 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
614 if (!Idx)
615 return std::nullopt;
616 // Undefined behavior if Idx is negative or >= Size.
617 if (Idx->getValue().uge(Size))
618 continue;
619 unsigned IntIdx = Idx->getValue().getZExtValue();
620 Mask[I] = IntIdx;
621 }
622 if (isUndefVector(Vec).all() && HasNonUndefVec)
623 continue;
624 // For correct shuffling we have to have at most 2 different vector operands
625 // in all extractelement instructions.
626 if (!Vec1 || Vec1 == Vec) {
627 Vec1 = Vec;
628 } else if (!Vec2 || Vec2 == Vec) {
629 Vec2 = Vec;
630 Mask[I] += Size;
631 } else {
632 return std::nullopt;
633 }
634 if (CommonShuffleMode == Permute)
635 continue;
636 // If the extract index is not the same as the operation number, it is a
637 // permutation.
638 if (Mask[I] % Size != I) {
639 CommonShuffleMode = Permute;
640 continue;
641 }
642 CommonShuffleMode = Select;
643 }
644 // If we're not crossing lanes in different vectors, consider it as blending.
645 if (CommonShuffleMode == Select && Vec2)
646 return TargetTransformInfo::SK_Select;
647 // If Vec2 was never used, we have a permutation of a single vector; otherwise
648 // we have a permutation of 2 vectors.
649 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
650 : TargetTransformInfo::SK_PermuteSingleSrc;
651}
652
653/// \returns True if Extract{Value,Element} instruction extracts element Idx.
654static std::optional<unsigned> getExtractIndex(Instruction *E) {
655 unsigned Opcode = E->getOpcode();
656 assert((Opcode == Instruction::ExtractElement ||
657 Opcode == Instruction::ExtractValue) &&
658 "Expected extractelement or extractvalue instruction.");
659 if (Opcode == Instruction::ExtractElement) {
660 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
661 if (!CI)
662 return std::nullopt;
663 return CI->getZExtValue();
664 }
665 auto *EI = cast<ExtractValueInst>(E);
666 if (EI->getNumIndices() != 1)
667 return std::nullopt;
668 return *EI->idx_begin();
669}
670
671namespace {
672
673/// Main data required for vectorization of instructions.
674struct InstructionsState {
675 /// The very first instruction in the list with the main opcode.
676 Value *OpValue = nullptr;
677
678 /// The main/alternate instruction.
679 Instruction *MainOp = nullptr;
680 Instruction *AltOp = nullptr;
681
682 /// The main/alternate opcodes for the list of instructions.
683 unsigned getOpcode() const {
684 return MainOp ? MainOp->getOpcode() : 0;
685 }
686
687 unsigned getAltOpcode() const {
688 return AltOp ? AltOp->getOpcode() : 0;
689 }
690
691 /// Some of the instructions in the list have alternate opcodes.
692 bool isAltShuffle() const { return AltOp != MainOp; }
693
694 bool isOpcodeOrAlt(Instruction *I) const {
695 unsigned CheckedOpcode = I->getOpcode();
696 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
697 }
698
699 InstructionsState() = delete;
700 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
701 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
702};
703
704} // end anonymous namespace
705
706/// Chooses the correct key for scheduling data. If \p Op has the same (or
707/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
708/// OpValue.
709static Value *isOneOf(const InstructionsState &S, Value *Op) {
710 auto *I = dyn_cast<Instruction>(Op);
711 if (I && S.isOpcodeOrAlt(I))
712 return Op;
713 return S.OpValue;
714}
715
716/// \returns true if \p Opcode is allowed as part of the main/alternate
717/// instruction for SLP vectorization.
718///
719/// Example of unsupported opcode is SDIV that can potentially cause UB if the
720/// "shuffled out" lane would result in division by zero.
721static bool isValidForAlternation(unsigned Opcode) {
722 if (Instruction::isIntDivRem(Opcode))
723 return false;
724
725 return true;
726}
727
728static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
729 const TargetLibraryInfo &TLI,
730 unsigned BaseIndex = 0);
731
732/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
733/// compatible instructions or constants, or just some other regular values.
734static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
735 Value *Op1, const TargetLibraryInfo &TLI) {
736 return (isConstant(BaseOp0) && isConstant(Op0)) ||
737 (isConstant(BaseOp1) && isConstant(Op1)) ||
738 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
739 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
740 BaseOp0 == Op0 || BaseOp1 == Op1 ||
741 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
742 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
743}
744
745/// \returns true if a compare instruction \p CI has similar "look" and
746/// same predicate as \p BaseCI, "as is" or with its operands and predicate
747/// swapped, false otherwise.
748static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
749 const TargetLibraryInfo &TLI) {
750 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
751 "Assessing comparisons of different types?");
752 CmpInst::Predicate BasePred = BaseCI->getPredicate();
753 CmpInst::Predicate Pred = CI->getPredicate();
754 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
755
756 Value *BaseOp0 = BaseCI->getOperand(0);
757 Value *BaseOp1 = BaseCI->getOperand(1);
758 Value *Op0 = CI->getOperand(0);
759 Value *Op1 = CI->getOperand(1);
760
761 return (BasePred == Pred &&
762 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
763 (BasePred == SwappedPred &&
764 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
765}
766
767/// \returns analysis of the Instructions in \p VL described in
768/// InstructionsState, the Opcode that we suppose the whole list
769/// could be vectorized even if its structure is diverse.
770static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
771 const TargetLibraryInfo &TLI,
772 unsigned BaseIndex) {
773 // Make sure these are all Instructions.
774 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
775 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
776
777 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
778 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
779 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
780 CmpInst::Predicate BasePred =
781 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
782 : CmpInst::BAD_ICMP_PREDICATE;
783 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
784 unsigned AltOpcode = Opcode;
785 unsigned AltIndex = BaseIndex;
786
787 bool SwappedPredsCompatible = [&]() {
788 if (!IsCmpOp)
789 return false;
790 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
791 UniquePreds.insert(BasePred);
792 UniqueNonSwappedPreds.insert(BasePred);
793 for (Value *V : VL) {
794 auto *I = dyn_cast<CmpInst>(V);
795 if (!I)
796 return false;
797 CmpInst::Predicate CurrentPred = I->getPredicate();
798 CmpInst::Predicate SwappedCurrentPred =
799 CmpInst::getSwappedPredicate(CurrentPred);
800 UniqueNonSwappedPreds.insert(CurrentPred);
801 if (!UniquePreds.contains(CurrentPred) &&
802 !UniquePreds.contains(SwappedCurrentPred))
803 UniquePreds.insert(CurrentPred);
804 }
805 // Total number of predicates > 2, but if consider swapped predicates
806 // compatible only 2, consider swappable predicates as compatible opcodes,
807 // not alternate.
808 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
809 }();
810 // Check for one alternate opcode from another BinaryOperator.
811 // TODO - generalize to support all operators (types, calls etc.).
812 auto *IBase = cast<Instruction>(VL[BaseIndex]);
813 Intrinsic::ID BaseID = 0;
814 SmallVector<VFInfo> BaseMappings;
815 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
816 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
817 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
818 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
819 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
820 }
821 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
822 auto *I = cast<Instruction>(VL[Cnt]);
823 unsigned InstOpcode = I->getOpcode();
824 if (IsBinOp && isa<BinaryOperator>(I)) {
825 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
826 continue;
827 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
828 isValidForAlternation(Opcode)) {
829 AltOpcode = InstOpcode;
830 AltIndex = Cnt;
831 continue;
832 }
833 } else if (IsCastOp && isa<CastInst>(I)) {
834 Value *Op0 = IBase->getOperand(0);
835 Type *Ty0 = Op0->getType();
836 Value *Op1 = I->getOperand(0);
837 Type *Ty1 = Op1->getType();
838 if (Ty0 == Ty1) {
839 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
840 continue;
841 if (Opcode == AltOpcode) {
842 assert(isValidForAlternation(Opcode) &&
843 isValidForAlternation(InstOpcode) &&
844 "Cast isn't safe for alternation, logic needs to be updated!");
845 AltOpcode = InstOpcode;
846 AltIndex = Cnt;
847 continue;
848 }
849 }
850 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
851 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
852 Type *Ty0 = BaseInst->getOperand(0)->getType();
853 Type *Ty1 = Inst->getOperand(0)->getType();
854 if (Ty0 == Ty1) {
855 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
856 // Check for compatible operands. If the corresponding operands are not
857 // compatible - need to perform alternate vectorization.
858 CmpInst::Predicate CurrentPred = Inst->getPredicate();
859 CmpInst::Predicate SwappedCurrentPred =
860 CmpInst::getSwappedPredicate(CurrentPred);
861
862 if ((E == 2 || SwappedPredsCompatible) &&
863 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
864 continue;
865
866 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
867 continue;
868 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
869 if (AltIndex != BaseIndex) {
870 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
871 continue;
872 } else if (BasePred != CurrentPred) {
873 assert(
874 isValidForAlternation(InstOpcode) &&
875 "CmpInst isn't safe for alternation, logic needs to be updated!");
876 AltIndex = Cnt;
877 continue;
878 }
879 CmpInst::Predicate AltPred = AltInst->getPredicate();
880 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
881 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
882 continue;
883 }
884 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
885 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
886 if (Gep->getNumOperands() != 2 ||
887 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
888 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
889 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
890 if (!isVectorLikeInstWithConstOps(EI))
891 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
892 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
893 auto *BaseLI = cast<LoadInst>(IBase);
894 if (!LI->isSimple() || !BaseLI->isSimple())
895 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
896 } else if (auto *Call = dyn_cast<CallInst>(I)) {
897 auto *CallBase = cast<CallInst>(IBase);
898 if (Call->getCalledFunction() != CallBase->getCalledFunction())
899 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
900 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
901 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
902 Call->op_begin() + Call->getBundleOperandsEndIndex(),
903 CallBase->op_begin() +
904 CallBase->getBundleOperandsStartIndex())))
905 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
906 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
907 if (ID != BaseID)
908 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
909 if (!ID) {
910 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
911 if (Mappings.size() != BaseMappings.size() ||
912 Mappings.front().ISA != BaseMappings.front().ISA ||
913 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
914 Mappings.front().VectorName != BaseMappings.front().VectorName ||
915 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
916 Mappings.front().Shape.Parameters !=
917 BaseMappings.front().Shape.Parameters)
918 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
919 }
920 }
921 continue;
922 }
923 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
924 }
925
926 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
927 cast<Instruction>(VL[AltIndex]));
928}
929
930/// \returns true if all of the values in \p VL have the same type or false
931/// otherwise.
932static bool allSameType(ArrayRef<Value *> VL) {
933 Type *Ty = VL.front()->getType();
934 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
935}
936
937/// \returns True if in-tree use also needs extract. This refers to
938/// possible scalar operand in vectorized instruction.
939static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
940 TargetLibraryInfo *TLI) {
941 unsigned Opcode = UserInst->getOpcode();
942 switch (Opcode) {
943 case Instruction::Load: {
944 LoadInst *LI = cast<LoadInst>(UserInst);
945 return (LI->getPointerOperand() == Scalar);
946 }
947 case Instruction::Store: {
948 StoreInst *SI = cast<StoreInst>(UserInst);
949 return (SI->getPointerOperand() == Scalar);
950 }
951 case Instruction::Call: {
952 CallInst *CI = cast<CallInst>(UserInst);
953 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
954 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
955 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
956 Arg.value().get() == Scalar;
957 });
958 }
959 default:
960 return false;
961 }
962}
963
964/// \returns the AA location that is being accessed by the instruction.
965static MemoryLocation getLocation(Instruction *I) {
966 if (StoreInst *SI = dyn_cast<StoreInst>(I))
967 return MemoryLocation::get(SI);
968 if (LoadInst *LI = dyn_cast<LoadInst>(I))
969 return MemoryLocation::get(LI);
970 return MemoryLocation();
971}
972
973/// \returns True if the instruction is not a volatile or atomic load/store.
974static bool isSimple(Instruction *I) {
975 if (LoadInst *LI = dyn_cast<LoadInst>(I))
976 return LI->isSimple();
977 if (StoreInst *SI = dyn_cast<StoreInst>(I))
978 return SI->isSimple();
979 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
980 return !MI->isVolatile();
981 return true;
982}
983
984/// Shuffles \p Mask in accordance with the given \p SubMask.
985/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
986/// one but two input vectors.
987static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
988 bool ExtendingManyInputs = false) {
989 if (SubMask.empty())
990 return;
991 assert(
992 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
993 // Check if input scalars were extended to match the size of other node.
994 (SubMask.size() == Mask.size() &&
995 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
996 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
997 "SubMask with many inputs support must be larger than the mask.");
998 if (Mask.empty()) {
999 Mask.append(SubMask.begin(), SubMask.end());
1000 return;
1001 }
1002 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1003 int TermValue = std::min(Mask.size(), SubMask.size());
1004 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1005 if (SubMask[I] == PoisonMaskElem ||
1006 (!ExtendingManyInputs &&
1007 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1008 continue;
1009 NewMask[I] = Mask[SubMask[I]];
1010 }
1011 Mask.swap(NewMask);
1012}
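// For example, applying SubMask = {1, 0, 3, 2} to Mask = {3, 2, 1, 0} yields
// {2, 3, 0, 1}: each new element I picks Mask[SubMask[I]], i.e. the two
// shuffles are composed.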
1013
1014/// Order may have elements assigned special value (size) which is out of
1015/// bounds. Such indices only appear in places which correspond to undef values
1016/// (see canReuseExtract for details) and are used to prevent undef values from
1017/// affecting the operand ordering.
1018/// The first loop below simply finds all unused indices and then the next loop
1019/// nest assigns these indices for undef values positions.
1020/// As an example below Order has two undef positions and they have assigned
1021/// values 3 and 7 respectively:
1022/// before: 6 9 5 4 9 2 1 0
1023/// after: 6 3 5 4 7 2 1 0
1024static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1025 const unsigned Sz = Order.size();
1026 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1027 SmallBitVector MaskedIndices(Sz);
1028 for (unsigned I = 0; I < Sz; ++I) {
1029 if (Order[I] < Sz)
1030 UnusedIndices.reset(Order[I]);
1031 else
1032 MaskedIndices.set(I);
1033 }
1034 if (MaskedIndices.none())
1035 return;
1036 assert(UnusedIndices.count() == MaskedIndices.count() &&
1037 "Non-synced masked/available indices.");
1038 int Idx = UnusedIndices.find_first();
1039 int MIdx = MaskedIndices.find_first();
1040 while (MIdx >= 0) {
1041 assert(Idx >= 0 && "Indices must be synced.");
1042 Order[MIdx] = Idx;
1043 Idx = UnusedIndices.find_next(Idx);
1044 MIdx = MaskedIndices.find_next(MIdx);
1045 }
1046}
1047
1048/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1049/// Opcode1.
1050static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1051 unsigned Opcode1) {
1052 Type *ScalarTy = VL[0]->getType();
1053 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1054 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1055 for (unsigned Lane : seq<unsigned>(VL.size()))
1056 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1057 OpcodeMask.set(Lane * ScalarTyNumElements,
1058 Lane * ScalarTyNumElements + ScalarTyNumElements);
1059 return OpcodeMask;
1060}
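// For example, for VL = {add, sub, add, sub} with Opcode1 == Instruction::Sub
// (and scalar element types), the returned bitset has bits 1 and 3 set.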
1061
1062namespace llvm {
1063
1065 SmallVectorImpl<int> &Mask) {
1066 Mask.clear();
1067 const unsigned E = Indices.size();
1068 Mask.resize(E, PoisonMaskElem);
1069 for (unsigned I = 0; I < E; ++I)
1070 Mask[Indices[I]] = I;
1071}
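// For example, Indices = {2, 0, 1} produces Mask = {1, 2, 0}: element 0 of the
// reordered vector comes from position 2 of the original, so Mask[2] = 0, etc.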
1072
1073/// Reorders the list of scalars in accordance with the given \p Mask.
1074static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1075 ArrayRef<int> Mask) {
1076 assert(!Mask.empty() && "Expected non-empty mask.");
1077 SmallVector<Value *> Prev(Scalars.size(),
1078 PoisonValue::get(Scalars.front()->getType()));
1079 Prev.swap(Scalars);
1080 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1081 if (Mask[I] != PoisonMaskElem)
1082 Scalars[Mask[I]] = Prev[I];
1083}
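// For example, for Scalars = {a, b, c} and Mask = {2, 0, 1} the result is
// {b, c, a}: the element at position I moves to position Mask[I].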
1084
1085/// Checks if the provided value does not require scheduling. It does not
1086/// require scheduling if this is not an instruction or it is an instruction
1087/// that does not read/write memory and all operands are either not instructions
1088/// or phi nodes or instructions from different blocks.
1089static bool areAllOperandsNonInsts(Value *V) {
1090 auto *I = dyn_cast<Instruction>(V);
1091 if (!I)
1092 return true;
1093 return !mayHaveNonDefUseDependency(*I) &&
1094 all_of(I->operands(), [I](Value *V) {
1095 auto *IO = dyn_cast<Instruction>(V);
1096 if (!IO)
1097 return true;
1098 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1099 });
1100}
1101
1102/// Checks if the provided value does not require scheduling. It does not
1103/// require scheduling if this is not an instruction or it is an instruction
1104/// that does not read/write memory and all users are phi nodes or instructions
1105/// from different blocks.
1106static bool isUsedOutsideBlock(Value *V) {
1107 auto *I = dyn_cast<Instruction>(V);
1108 if (!I)
1109 return true;
1110 // Limits the number of uses to save compile time.
1111 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1112 all_of(I->users(), [I](User *U) {
1113 auto *IU = dyn_cast<Instruction>(U);
1114 if (!IU)
1115 return true;
1116 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1117 });
1118}
1119
1120/// Checks if the specified value does not require scheduling. It does not
1121/// require scheduling if all operands and all users do not need to be scheduled
1122/// in the current basic block.
1123static bool doesNotNeedToBeScheduled(Value *V) {
1124 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1125}
1126
1127/// Checks if the specified array of instructions does not require scheduling.
1128/// This is the case if either all instructions have operands that do not
1129/// require scheduling, or all their users do not require scheduling since they
1130/// are phis or in other basic blocks.
1131static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1132 return !VL.empty() &&
1133 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1134}
1135
1136namespace slpvectorizer {
1137
1138/// Bottom Up SLP Vectorizer.
1139class BoUpSLP {
1140 struct TreeEntry;
1141 struct ScheduleData;
1142 class ShuffleCostEstimator;
1143 class ShuffleInstructionBuilder;
1144
1145public:
1146 /// Tracks the state we can represent the loads in the given sequence.
1147 enum class LoadsState {
1148 Gather,
1149 Vectorize,
1150 ScatterVectorize,
1151 StridedVectorize
1152 };
1153
1154 using ValueList = SmallVector<Value *, 8>;
1155 using InstrList = SmallVector<Instruction *, 16>;
1156 using ValueSet = SmallPtrSet<Value *, 16>;
1157 using StoreList = SmallVector<StoreInst *, 8>;
1158 using ExtraValueToDebugLocsMap =
1159 MapVector<Value *, SmallVector<Instruction *, 2>>;
1160 using OrdersType = SmallVector<unsigned, 4>;
1161
1162 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1163 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1164 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1165 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1166 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1167 AC(AC), DB(DB), DL(DL), ORE(ORE),
1168 Builder(Se->getContext(), TargetFolder(*DL)) {
1169 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1170 // Use the vector register size specified by the target unless overridden
1171 // by a command-line option.
1172 // TODO: It would be better to limit the vectorization factor based on
1173 // data type rather than just register size. For example, x86 AVX has
1174 // 256-bit registers, but it does not support integer operations
1175 // at that width (that requires AVX2).
1176 if (MaxVectorRegSizeOption.getNumOccurrences())
1177 MaxVecRegSize = MaxVectorRegSizeOption;
1178 else
1179 MaxVecRegSize =
1180 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1181 .getFixedValue();
1182
1183 if (MinVectorRegSizeOption.getNumOccurrences())
1184 MinVecRegSize = MinVectorRegSizeOption;
1185 else
1186 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1187 }
1188
1189 /// Vectorize the tree that starts with the elements in \p VL.
1190 /// Returns the vectorized root.
1191 Value *vectorizeTree();
1192
1193 /// Vectorize the tree but with the list of externally used values \p
1194 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1195 /// generated extractvalue instructions.
1196 /// \param ReplacedExternals contains the list of replaced external values
1197 /// {scalar, replace} after emitting extractelement for external uses.
1198 Value *
1199 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1200 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1201 Instruction *ReductionRoot = nullptr);
1202
1203 /// \returns the cost incurred by unwanted spills and fills, caused by
1204 /// holding live values over call sites.
1205 InstructionCost getSpillCost() const;
1206
1207 /// \returns the vectorization cost of the subtree that starts at \p VL.
1208 /// A negative number means that this is profitable.
1209 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1210
1211 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1212 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1213 void buildTree(ArrayRef<Value *> Roots,
1214 const SmallDenseSet<Value *> &UserIgnoreLst);
1215
1216 /// Construct a vectorizable tree that starts at \p Roots.
1217 void buildTree(ArrayRef<Value *> Roots);
1218
1219 /// Returns whether the root node has in-tree uses.
1221 return !VectorizableTree.empty() &&
1222 !VectorizableTree.front()->UserTreeIndices.empty();
1223 }
1224
1225 /// Return the scalars of the root node.
1227 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1228 return VectorizableTree.front()->Scalars;
1229 }
1230
1231 /// Checks if the root graph node can be emitted with narrower bitwidth at
1232 /// codegen and returns it signedness, if so.
1234 return MinBWs.at(VectorizableTree.front().get()).second;
1235 }
1236
1237 /// Builds external uses of the vectorized scalars, i.e. the list of
1238 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1239 /// ExternallyUsedValues contains additional list of external uses to handle
1240 /// vectorization of reductions.
1241 void
1242 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1243
1244 /// Transforms graph nodes to target specific representations, if profitable.
1245 void transformNodes();
1246
1247 /// Clear the internal data structures that are created by 'buildTree'.
1248 void deleteTree() {
1249 VectorizableTree.clear();
1250 ScalarToTreeEntry.clear();
1251 MultiNodeScalars.clear();
1252 MustGather.clear();
1253 NonScheduledFirst.clear();
1254 EntryToLastInstruction.clear();
1255 ExternalUses.clear();
1256 ExternalUsesAsOriginalScalar.clear();
1257 for (auto &Iter : BlocksSchedules) {
1258 BlockScheduling *BS = Iter.second.get();
1259 BS->clear();
1260 }
1261 MinBWs.clear();
1262 ReductionBitWidth = 0;
1263 CastMaxMinBWSizes.reset();
1264 ExtraBitWidthNodes.clear();
1265 InstrElementSize.clear();
1266 UserIgnoreList = nullptr;
1267 PostponedGathers.clear();
1268 ValueToGatherNodes.clear();
1269 }
1270
1271 unsigned getTreeSize() const { return VectorizableTree.size(); }
1272
1273 /// Perform LICM and CSE on the newly generated gather sequences.
1274 void optimizeGatherSequence();
1275
1276 /// Checks if the specified gather tree entry \p TE can be represented as a
1277 /// shuffled vector entry + (possibly) permutation with other gathers. It
1278 /// implements the checks only for possibly ordered scalars (Loads,
1279 /// ExtractElement, ExtractValue), which can be part of the graph.
1280 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1281
1282 /// Sort loads into increasing pointers offsets to allow greater clustering.
1283 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1284
1285 /// Gets reordering data for the given tree entry. If the entry is vectorized
1286 /// - just return ReorderIndices, otherwise check if the scalars can be
1287 /// reordered and return the most optimal order.
1288 /// \return std::nullopt if ordering is not important, empty order, if
1289 /// identity order is important, or the actual order.
1290 /// \param TopToBottom If true, include the order of vectorized stores and
1291 /// insertelement nodes, otherwise skip them.
1292 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1293 bool TopToBottom);
1294
1295 /// Reorders the current graph to the most profitable order starting from the
1296 /// root node to the leaf nodes. The best order is chosen only from the nodes
1297 /// of the same size (vectorization factor). Smaller nodes are considered
1298 /// parts of a subgraph with a smaller VF and they are reordered independently.
1299 /// We can do this because we still need to extend smaller nodes to the wider VF
1300 /// and we can merge reordering shuffles with the widening shuffles.
1301 void reorderTopToBottom();
1302
1303 /// Reorders the current graph to the most profitable order starting from
1304 /// leaves to the root. It allows rotating small subgraphs and reducing the
1305 /// number of reshuffles if the leaf nodes use the same order. In this case we
1306 /// can merge the orders and just shuffle the user node instead of shuffling its
1307 /// operands. Plus, even if the leaf nodes have different orders, it allows
1308 /// sinking the reordering in the graph closer to the root node and merging it
1309 /// later during analysis.
1310 void reorderBottomToTop(bool IgnoreReorder = false);
1311
1312 /// \return The vector element size in bits to use when vectorizing the
1313 /// expression tree ending at \p V. If V is a store, the size is the width of
1314 /// the stored value. Otherwise, the size is the width of the largest loaded
1315 /// value reaching V. This method is used by the vectorizer to calculate
1316 /// vectorization factors.
1317 unsigned getVectorElementSize(Value *V);
1318
1319 /// Compute the minimum type sizes required to represent the entries in a
1320 /// vectorizable tree.
1321 void computeMinimumValueSizes();
1322
1323 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1324 unsigned getMaxVecRegSize() const {
1325 return MaxVecRegSize;
1326 }
1327
1328 // \returns minimum vector register size as set by cl::opt.
1329 unsigned getMinVecRegSize() const {
1330 return MinVecRegSize;
1331 }
1332
1333 unsigned getMinVF(unsigned Sz) const {
1334 return std::max(2U, getMinVecRegSize() / Sz);
1335 }
1336
1337 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1338 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1339 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1340 return MaxVF ? MaxVF : UINT_MAX;
1341 }
1342
1343 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1344 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1345 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1346 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1347 ///
1348 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1349 unsigned canMapToVector(Type *T) const;
1350
1351 /// \returns True if the VectorizableTree is both tiny and not fully
1352 /// vectorizable. We do not vectorize such trees.
1353 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1354
1355 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1356 /// can be load combined in the backend. Load combining may not be allowed in
1357 /// the IR optimizer, so we do not want to alter the pattern. For example,
1358 /// partially transforming a scalar bswap() pattern into vector code is
1359 /// effectively impossible for the backend to undo.
1360 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1361 /// may not be necessary.
1362 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1363
1364 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1365 /// can be load combined in the backend. Load combining may not be allowed in
1366 /// the IR optimizer, so we do not want to alter the pattern. For example,
1367 /// partially transforming a scalar bswap() pattern into vector code is
1368 /// effectively impossible for the backend to undo.
1369 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1370 /// may not be necessary.
1371 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1372
1373 /// Checks if the given array of loads can be represented as a vectorized,
1374 /// scatter or just simple gather.
1375 /// \param VL list of loads.
1376 /// \param VL0 main load value.
1377 /// \param Order returned order of load instructions.
1378 /// \param PointerOps returned list of pointer operands.
1379 /// \param TryRecursiveCheck used to check if long masked gather can be
1380 /// represented as a series of loads/insert subvector, if profitable.
1381 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1382 SmallVectorImpl<unsigned> &Order,
1383 SmallVectorImpl<Value *> &PointerOps,
1384 bool TryRecursiveCheck = true) const;
1385
1386 OptimizationRemarkEmitter *getORE() { return ORE; }
1387
1388 /// This structure holds any data we need about the edges being traversed
1389 /// during buildTree_rec(). We keep track of:
1390 /// (i) the user TreeEntry index, and
1391 /// (ii) the index of the edge.
1392 struct EdgeInfo {
1393 EdgeInfo() = default;
1394 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1395 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1396 /// The user TreeEntry.
1397 TreeEntry *UserTE = nullptr;
1398 /// The operand index of the use.
1399 unsigned EdgeIdx = UINT_MAX;
1400#ifndef NDEBUG
1401 friend inline raw_ostream &operator<<(raw_ostream &OS,
1402 const BoUpSLP::EdgeInfo &EI) {
1403 EI.dump(OS);
1404 return OS;
1405 }
1406 /// Debug print.
1407 void dump(raw_ostream &OS) const {
1408 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1409 << " EdgeIdx:" << EdgeIdx << "}";
1410 }
1411 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1412#endif
1413 bool operator == (const EdgeInfo &Other) const {
1414 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1415 }
1416 };
1417
1418 /// A helper class used for scoring candidates for two consecutive lanes.
1419 class LookAheadHeuristics {
1420 const TargetLibraryInfo &TLI;
1421 const DataLayout &DL;
1422 ScalarEvolution &SE;
1423 const BoUpSLP &R;
1424 int NumLanes; // Total number of lanes (aka vectorization factor).
1425 int MaxLevel; // The maximum recursion depth for accumulating score.
1426
1427 public:
1428 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1429 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1430 int MaxLevel)
1431 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1432 MaxLevel(MaxLevel) {}
1433
1434 // The hard-coded scores listed here are not very important, though it shall
1435 // be higher for better matches to improve the resulting cost. When
1436 // computing the scores of matching one sub-tree with another, we are
1437 // basically counting the number of values that are matching. So even if all
1438 // scores are set to 1, we would still get a decent matching result.
1439 // However, sometimes we have to break ties. For example we may have to
1440 // choose between matching loads vs matching opcodes. This is what these
1441 // scores are helping us with: they provide the order of preference. Also,
1442 // this is important if the scalar is externally used or used in another
1443 // tree entry node in the different lane.
1444
1445 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1446 static const int ScoreConsecutiveLoads = 4;
1447 /// The same load multiple times. This should have a better score than
1448 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1449 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1450 /// a vector load and 1.0 for a broadcast.
1451 static const int ScoreSplatLoads = 3;
1452 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1453 static const int ScoreReversedLoads = 3;
1454 /// A load candidate for masked gather.
1455 static const int ScoreMaskedGatherCandidate = 1;
1456 /// ExtractElementInst from same vector and consecutive indexes.
1457 static const int ScoreConsecutiveExtracts = 4;
1458 /// ExtractElementInst from same vector and reversed indices.
1459 static const int ScoreReversedExtracts = 3;
1460 /// Constants.
1461 static const int ScoreConstants = 2;
1462 /// Instructions with the same opcode.
1463 static const int ScoreSameOpcode = 2;
1464 /// Instructions with alt opcodes (e.g, add + sub).
1465 static const int ScoreAltOpcodes = 1;
1466 /// Identical instructions (a.k.a. splat or broadcast).
1467 static const int ScoreSplat = 1;
1468 /// Matching with an undef is preferable to failing.
1469 static const int ScoreUndef = 1;
1470 /// Score for failing to find a decent match.
1471 static const int ScoreFail = 0;
1472 /// Score if all users are vectorized.
1473 static const int ScoreAllUserVectorized = 1;
1474
1475 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1476 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1477 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1478 /// MainAltOps.
1479 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1480 ArrayRef<Value *> MainAltOps) const {
1481 if (!isValidElementType(V1->getType()) ||
1482 !isValidElementType(V2->getType()))
1483 return LookAheadHeuristics::ScoreFail;
1484
1485 if (V1 == V2) {
1486 if (isa<LoadInst>(V1)) {
1487 // Returns true if the users of V1 and V2 won't need to be extracted.
1488 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1489 // Bail out if we have too many uses to save compilation time.
1490 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1491 return false;
1492
1493 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1494 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1495 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1496 });
1497 };
1498 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1499 };
1500 // A broadcast of a load can be cheaper on some targets.
1501 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1502 ElementCount::getFixed(NumLanes)) &&
1503 ((int)V1->getNumUses() == NumLanes ||
1504 AllUsersAreInternal(V1, V2)))
1505 return LookAheadHeuristics::ScoreSplatLoads;
1506 }
1507 return LookAheadHeuristics::ScoreSplat;
1508 }
1509
1510 auto CheckSameEntryOrFail = [&]() {
1511 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1512 TE1 && TE1 == R.getTreeEntry(V2))
1513 return LookAheadHeuristics::ScoreSplatLoads;
1514 return LookAheadHeuristics::ScoreFail;
1515 };
1516
1517 auto *LI1 = dyn_cast<LoadInst>(V1);
1518 auto *LI2 = dyn_cast<LoadInst>(V2);
1519 if (LI1 && LI2) {
1520 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1521 !LI2->isSimple())
1522 return CheckSameEntryOrFail();
1523
1524 std::optional<int> Dist = getPointersDiff(
1525 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1526 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1527 if (!Dist || *Dist == 0) {
1528 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1529 getUnderlyingObject(LI2->getPointerOperand()) &&
1530 R.TTI->isLegalMaskedGather(
1531 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1532 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1533 return CheckSameEntryOrFail();
1534 }
1535 // The distance is too large - still may be profitable to use masked
1536 // loads/gathers.
1537 if (std::abs(*Dist) > NumLanes / 2)
1538 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1539 // This still will detect consecutive loads, but we might have "holes"
1540 // in some cases. It is ok for non-power-2 vectorization and may produce
1541 // better results. It should not affect current vectorization.
1542 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1543 : LookAheadHeuristics::ScoreReversedLoads;
1544 }
1545
1546 auto *C1 = dyn_cast<Constant>(V1);
1547 auto *C2 = dyn_cast<Constant>(V2);
1548 if (C1 && C2)
1549 return LookAheadHeuristics::ScoreConstants;
1550
1551 // Extracts from consecutive indexes of the same vector better score as
1552 // the extracts could be optimized away.
1553 Value *EV1;
1554 ConstantInt *Ex1Idx;
1555 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1556 // Undefs are always profitable for extractelements.
1557 // Compiler can easily combine poison and extractelement <non-poison> or
1558 // undef and extractelement <poison>. But combining undef +
1559 // extractelement <non-poison-but-may-produce-poison> requires some
1560 // extra operations.
1561 if (isa<UndefValue>(V2))
1562 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1563 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1564 : LookAheadHeuristics::ScoreSameOpcode;
1565 Value *EV2 = nullptr;
1566 ConstantInt *Ex2Idx = nullptr;
1567 if (match(V2,
1568 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1569 m_Undef())))) {
1570 // Undefs are always profitable for extractelements.
1571 if (!Ex2Idx)
1572 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1573 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1574 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1575 if (EV2 == EV1) {
1576 int Idx1 = Ex1Idx->getZExtValue();
1577 int Idx2 = Ex2Idx->getZExtValue();
1578 int Dist = Idx2 - Idx1;
1579 // The distance is too large - still may be profitable to use
1580 // shuffles.
1581 if (std::abs(Dist) == 0)
1582 return LookAheadHeuristics::ScoreSplat;
1583 if (std::abs(Dist) > NumLanes / 2)
1584 return LookAheadHeuristics::ScoreSameOpcode;
1585 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1586 : LookAheadHeuristics::ScoreReversedExtracts;
1587 }
1588 return LookAheadHeuristics::ScoreAltOpcodes;
1589 }
1590 return CheckSameEntryOrFail();
1591 }
1592
1593 auto *I1 = dyn_cast<Instruction>(V1);
1594 auto *I2 = dyn_cast<Instruction>(V2);
1595 if (I1 && I2) {
1596 if (I1->getParent() != I2->getParent())
1597 return CheckSameEntryOrFail();
1598 SmallVector<Value *, 4> Ops(MainAltOps);
1599 Ops.push_back(I1);
1600 Ops.push_back(I2);
1601 InstructionsState S = getSameOpcode(Ops, TLI);
1602 // Note: Only consider instructions with <= 2 operands to avoid
1603 // complexity explosion.
1604 if (S.getOpcode() &&
1605 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1606 !S.isAltShuffle()) &&
1607 all_of(Ops, [&S](Value *V) {
1608 return cast<Instruction>(V)->getNumOperands() ==
1609 S.MainOp->getNumOperands();
1610 }))
1611 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1612 : LookAheadHeuristics::ScoreSameOpcode;
1613 }
1614
1615 if (isa<UndefValue>(V2))
1616 return LookAheadHeuristics::ScoreUndef;
1617
1618 return CheckSameEntryOrFail();
1619 }
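// Illustrative shallow scores (hypothetical values; loads assumed simple and
// in the same basic block):
//   V1 = load a[0], V2 = load a[1]  -> pointer distance 1, so the pair scores
//                                      LookAheadHeuristics::ScoreConsecutiveLoads.
//   V1 == V2 (non-load)             -> LookAheadHeuristics::ScoreSplat.
//   V1 = add x, y, V2 = add z, w    -> same opcode in the same block, so
//                                      LookAheadHeuristics::ScoreSameOpcode.
//   V1 = add x, y, V2 = undef       -> LookAheadHeuristics::ScoreUndef.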
1620
1621 /// Go through the operands of \p LHS and \p RHS recursively until
1622 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1623 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1624 /// of \p U1 and \p U2), except at the beginning of the recursion where
1625 /// these are set to nullptr.
1626 ///
1627 /// For example:
1628 /// \verbatim
1629 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1630 /// \ / \ / \ / \ /
1631 /// + + + +
1632 /// G1 G2 G3 G4
1633 /// \endverbatim
1634 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1635 /// each level recursively, accumulating the score. It starts from matching
1636 /// the additions at level 0, then moves on to the loads (level 1). The
1637 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1638 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1639 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1640 /// Please note that the order of the operands does not matter, as we
1641 /// evaluate the score of all profitable combinations of operands. In
1642 /// other words the score of G1 and G4 is the same as G1 and G2. This
1643 /// heuristic is based on ideas described in:
1644 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1645 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1646 /// Luís F. W. Góes
1647 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1648 Instruction *U2, int CurrLevel,
1649 ArrayRef<Value *> MainAltOps) const {
1650
1651 // Get the shallow score of V1 and V2.
1652 int ShallowScoreAtThisLevel =
1653 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1654
1655 // If reached MaxLevel,
1656 // or if LHS and RHS are not instructions,
1657 // or if they are SPLAT,
1658 // or if they are not consecutive,
1659 // or if profitable to vectorize loads or extractelements, early return
1660 // the current cost.
1661 auto *I1 = dyn_cast<Instruction>(LHS);
1662 auto *I2 = dyn_cast<Instruction>(RHS);
1663 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1664 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1665 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1666 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1667 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1668 ShallowScoreAtThisLevel))
1669 return ShallowScoreAtThisLevel;
1670 assert(I1 && I2 && "Should have early exited.");
1671
1672 // Contains the I2 operand indexes that got matched with I1 operands.
1673 SmallSet<unsigned, 4> Op2Used;
1674
1675 // Recursion towards the operands of I1 and I2. We are trying all possible
1676 // operand pairs, and keeping track of the best score.
1677 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1678 OpIdx1 != NumOperands1; ++OpIdx1) {
1679 // Try to pair op1I with the best operand of I2.
1680 int MaxTmpScore = 0;
1681 unsigned MaxOpIdx2 = 0;
1682 bool FoundBest = false;
1683 // If I2 is commutative try all combinations.
1684 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1685 unsigned ToIdx = isCommutative(I2)
1686 ? I2->getNumOperands()
1687 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1688 assert(FromIdx <= ToIdx && "Bad index");
1689 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1690 // Skip operands already paired with OpIdx1.
1691 if (Op2Used.count(OpIdx2))
1692 continue;
1693 // Recursively calculate the cost at each level
1694 int TmpScore =
1695 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1696 I1, I2, CurrLevel + 1, std::nullopt);
1697 // Look for the best score.
1698 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1699 TmpScore > MaxTmpScore) {
1700 MaxTmpScore = TmpScore;
1701 MaxOpIdx2 = OpIdx2;
1702 FoundBest = true;
1703 }
1704 }
1705 if (FoundBest) {
1706 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1707 Op2Used.insert(MaxOpIdx2);
1708 ShallowScoreAtThisLevel += MaxTmpScore;
1709 }
1710 }
1711 return ShallowScoreAtThisLevel;
1712 }
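// Illustrative walk-through (hypothetical operands, following the G1/G2
// example above): at level 0 the two additions match and contribute
// ScoreSameOpcode; recursing into their operands at level 1, {A[0],A[1]} and
// {B[0],B[1]} are consecutive loads and each contributes
// ScoreConsecutiveLoads, so the accumulated score of (G1, G2) exceeds that of
// (G1, G3), whose operand pairs fail to match.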
1713 };
1714 /// A helper data structure to hold the operands of a vector of instructions.
1715 /// This supports a fixed vector length for all operand vectors.
1716 class VLOperands {
1717 /// For each operand we need (i) the value, and (ii) the opcode that it
1718 /// would be attached to if the expression was in a left-linearized form.
1719 /// This is required to avoid illegal operand reordering.
1720 /// For example:
1721 /// \verbatim
1722 /// 0 Op1
1723 /// |/
1724 /// Op1 Op2 Linearized + Op2
1725 /// \ / ----------> |/
1726 /// - -
1727 ///
1728 /// Op1 - Op2 (0 + Op1) - Op2
1729 /// \endverbatim
1730 ///
1731 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1732 ///
1733 /// Another way to think of this is to track all the operations across the
1734 /// path from the operand all the way to the root of the tree and to
1735 /// calculate the operation that corresponds to this path. For example, the
1736 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1737 /// corresponding operation is a '-' (which matches the one in the
1738 /// linearized tree, as shown above).
1739 ///
1740 /// For lack of a better term, we refer to this operation as Accumulated
1741 /// Path Operation (APO).
1742 struct OperandData {
1743 OperandData() = default;
1744 OperandData(Value *V, bool APO, bool IsUsed)
1745 : V(V), APO(APO), IsUsed(IsUsed) {}
1746 /// The operand value.
1747 Value *V = nullptr;
1748 /// TreeEntries only allow a single opcode, or an alternate sequence of
1749 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1750 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1751 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1752 /// (e.g., Add/Mul)
1753 bool APO = false;
1754 /// Helper data for the reordering function.
1755 bool IsUsed = false;
1756 };
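// Illustrative APO values (using the linearized example above): in
// (0 + Op1) - Op2, Op1 is attached to the '+' and gets APO = false, while
// Op2 is attached to the '-' and gets APO = true, so the two operands are
// never swapped into each other's position during reordering.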
1757
1758 /// During operand reordering, we are trying to select the operand at lane
1759 /// that matches best with the operand at the neighboring lane. Our
1760 /// selection is based on the type of value we are looking for. For example,
1761 /// if the neighboring lane has a load, we need to look for a load that is
1762 /// accessing a consecutive address. These strategies are summarized in the
1763 /// 'ReorderingMode' enumerator.
1764 enum class ReorderingMode {
1765 Load, ///< Matching loads to consecutive memory addresses
1766 Opcode, ///< Matching instructions based on opcode (same or alternate)
1767 Constant, ///< Matching constants
1768 Splat, ///< Matching the same instruction multiple times (broadcast)
1769 Failed, ///< We failed to create a vectorizable group
1770 };
1771
1772 using OperandDataVec = SmallVector<OperandData, 2>;
1773
1774 /// A vector of operand vectors.
1775 SmallVector<OperandDataVec, 4> OpsVec;
1776
1777 const TargetLibraryInfo &TLI;
1778 const DataLayout &DL;
1779 ScalarEvolution &SE;
1780 const BoUpSLP &R;
1781 const Loop *L = nullptr;
1782
1783 /// \returns the operand data at \p OpIdx and \p Lane.
1784 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1785 return OpsVec[OpIdx][Lane];
1786 }
1787
1788 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1789 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1790 return OpsVec[OpIdx][Lane];
1791 }
1792
1793 /// Clears the used flag for all entries.
1794 void clearUsed() {
1795 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1796 OpIdx != NumOperands; ++OpIdx)
1797 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1798 ++Lane)
1799 OpsVec[OpIdx][Lane].IsUsed = false;
1800 }
1801
1802 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1803 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1804 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1805 }
1806
1807 /// \param Lane lane of the operands under analysis.
1808 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1809 /// the best candidate.
1810 /// \param Idx operand index of the current candidate value.
1811 /// \returns The additional score due to possible broadcasting of the
1812 /// elements in the lane. It is more profitable to have a power-of-2 number
1813 /// of unique elements in the lane, as they will be vectorized with higher
1814 /// probability after removing duplicates. Currently the SLP vectorizer
1815 /// supports only vectorization of a power-of-2 number of unique scalars.
1816 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1817 Value *IdxLaneV = getData(Idx, Lane).V;
1818 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1819 return 0;
1820 SmallPtrSet<Value *, 4> Uniques;
1821 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1822 if (Ln == Lane)
1823 continue;
1824 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1825 if (!isa<Instruction>(OpIdxLnV))
1826 return 0;
1827 Uniques.insert(OpIdxLnV);
1828 }
1829 int UniquesCount = Uniques.size();
1830 int UniquesCntWithIdxLaneV =
1831 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1832 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1833 int UniquesCntWithOpIdxLaneV =
1834 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1835 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1836 return 0;
1837 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1838 UniquesCntWithOpIdxLaneV) -
1839 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1840 }
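// Illustrative calculation (hypothetical lane values): with 4 lanes and
// Uniques = {A, B, C} collected from the other lanes, a value already in
// Uniques keeps the count at 3 (PowerOf2Ceil(3) - 3 = 1), while a new value
// raises it to 4 (PowerOf2Ceil(4) - 4 = 0); the returned difference rewards
// the candidate that leaves the lane closer to a power-of-2 number of unique
// scalars.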
1841
1842 /// \param Lane lane of the operands under analysis.
1843 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1844 /// the best candidate.
1845 /// \param Idx operand index of the current candidate value.
1846 /// \returns The additional score for the scalar whose users are all
1847 /// vectorized.
1848 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1849 Value *IdxLaneV = getData(Idx, Lane).V;
1850 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1851 // Do not care about number of uses for vector-like instructions
1852 // (extractelement/extractvalue with constant indices), they are extracts
1853 // themselves and already externally used. Vectorization of such
1854 // instructions does not add extra extractelement instruction, just may
1855 // remove it.
1856 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1857 isVectorLikeInstWithConstOps(OpIdxLaneV))
1858 return LookAheadHeuristics::ScoreAllUserVectorized;
1859 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1860 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1861 return 0;
1862 return R.areAllUsersVectorized(IdxLaneI)
1863 ? LookAheadHeuristics::ScoreAllUserVectorized
1864 : 0;
1865 }
1866
1867 /// Score scaling factor for fully compatible instructions but with
1868 /// different number of external uses. Allows better selection of the
1869 /// instructions with fewer external uses.
1870 static const int ScoreScaleFactor = 10;
1871
1872 /// \returns the look-ahead score, which tells us how much the sub-trees
1873 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1874 /// score. This helps break ties in an informed way when we cannot decide on
1875 /// the order of the operands by just considering the immediate
1876 /// predecessors.
1877 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1878 int Lane, unsigned OpIdx, unsigned Idx,
1879 bool &IsUsed) {
1880 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1881 LookAheadMaxDepth);
1882 // Keep track of the instruction stack as we recurse into the operands
1883 // during the look-ahead score exploration.
1884 int Score =
1885 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1886 /*CurrLevel=*/1, MainAltOps);
1887 if (Score) {
1888 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1889 if (Score <= -SplatScore) {
1890 // Set the minimum score for splat-like sequence to avoid setting
1891 // failed state.
1892 Score = 1;
1893 } else {
1894 Score += SplatScore;
1895 // Scale score to see the difference between different operands
1896 // and similar operands but all vectorized/not all vectorized
1897 // uses. It does not affect actual selection of the best
1898 // compatible operand in general, just allows to select the
1899 // operand with all vectorized uses.
1900 Score *= ScoreScaleFactor;
1901 Score += getExternalUseScore(Lane, OpIdx, Idx);
1902 IsUsed = true;
1903 }
1904 }
1905 return Score;
1906 }
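// Illustrative scoring (hypothetical numbers): if the recursive look-ahead
// returns 3 and getSplatScore() adds 1, the final score is (3 + 1) *
// ScoreScaleFactor plus the external-use bonus; if instead the splat score
// were -3 or lower, the score would be clamped to 1 so a splat-like candidate
// is not reported as failed.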
1907
1908 /// Best defined scores per lanes between the passes. Used to choose the
1909 /// best operand (with the highest score) between the passes.
1910 /// The key - {Operand Index, Lane}.
1911 /// The value - the best score between the passes for the lane and the
1912 /// operand.
1913 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1914 BestScoresPerLanes;
1915
1916 // Search all operands in Ops[*][Lane] for the one that matches best
1917 // Ops[OpIdx][LastLane] and return its operand index.
1918 // If no good match can be found, return std::nullopt.
1919 std::optional<unsigned>
1920 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1921 ArrayRef<ReorderingMode> ReorderingModes,
1922 ArrayRef<Value *> MainAltOps) {
1923 unsigned NumOperands = getNumOperands();
1924
1925 // The operand of the previous lane at OpIdx.
1926 Value *OpLastLane = getData(OpIdx, LastLane).V;
1927
1928 // Our strategy mode for OpIdx.
1929 ReorderingMode RMode = ReorderingModes[OpIdx];
1930 if (RMode == ReorderingMode::Failed)
1931 return std::nullopt;
1932
1933 // The linearized opcode of the operand at OpIdx, Lane.
1934 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1935
1936 // The best operand index and its score.
1937 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1938 // are using the score to differentiate between the two.
1939 struct BestOpData {
1940 std::optional<unsigned> Idx;
1941 unsigned Score = 0;
1942 } BestOp;
1943 BestOp.Score =
1944 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1945 .first->second;
1946
1947 // Track if the operand must be marked as used. If the operand is set to
1948 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
1949 // we may want to reestimate the operands again on the following iterations.
1950 bool IsUsed = RMode == ReorderingMode::Splat ||
1951 RMode == ReorderingMode::Constant ||
1952 RMode == ReorderingMode::Load;
1953 // Iterate through all unused operands and look for the best.
1954 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1955 // Get the operand at Idx and Lane.
1956 OperandData &OpData = getData(Idx, Lane);
1957 Value *Op = OpData.V;
1958 bool OpAPO = OpData.APO;
1959
1960 // Skip already selected operands.
1961 if (OpData.IsUsed)
1962 continue;
1963
1964 // Skip if we are trying to move the operand to a position with a
1965 // different opcode in the linearized tree form. This would break the
1966 // semantics.
1967 if (OpAPO != OpIdxAPO)
1968 continue;
1969
1970 // Look for an operand that matches the current mode.
1971 switch (RMode) {
1972 case ReorderingMode::Load:
1973 case ReorderingMode::Opcode: {
1974 bool LeftToRight = Lane > LastLane;
1975 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1976 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1977 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1978 OpIdx, Idx, IsUsed);
1979 if (Score > static_cast<int>(BestOp.Score) ||
1980 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1981 Idx == OpIdx)) {
1982 BestOp.Idx = Idx;
1983 BestOp.Score = Score;
1984 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1985 }
1986 break;
1987 }
1988 case ReorderingMode::Constant:
1989 if (isa<Constant>(Op) ||
1990 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1991 BestOp.Idx = Idx;
1992 if (isa<Constant>(Op)) {
1993 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1994 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1995 LookAheadHeuristics::ScoreConstants;
1996 }
1997 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1998 IsUsed = false;
1999 }
2000 break;
2001 case ReorderingMode::Splat:
2002 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2003 IsUsed = Op == OpLastLane;
2004 if (Op == OpLastLane) {
2005 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2006 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2007 LookAheadHeuristics::ScoreSplat;
2008 }
2009 BestOp.Idx = Idx;
2010 }
2011 break;
2012 case ReorderingMode::Failed:
2013 llvm_unreachable("Not expected Failed reordering mode.");
2014 }
2015 }
2016
2017 if (BestOp.Idx) {
2018 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2019 return BestOp.Idx;
2020 }
2021 // If we could not find a good match return std::nullopt.
2022 return std::nullopt;
2023 }
2024
2025 /// Helper for reorderOperandVecs.
2026 /// \returns the lane that we should start reordering from. This is the one
2027 /// which has the least number of operands that can freely move about, or is
2028 /// the least profitable because it already has the most optimal set of operands.
2029 unsigned getBestLaneToStartReordering() const {
2030 unsigned Min = UINT_MAX;
2031 unsigned SameOpNumber = 0;
2032 // std::pair<unsigned, unsigned> is used to implement a simple voting
2033 // algorithm and choose the lane with the least number of operands that
2034 // can freely move about, or is less profitable because it already has the
2035 // most optimal set of operands. The first unsigned is a counter for
2036 // voting, the second unsigned is the counter of lanes with instructions
2037 // with same/alternate opcodes and same parent basic block.
2038 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2039 // Try to be closer to the original results, if we have multiple lanes
2040 // with same cost. If 2 lanes have the same cost, use the one with the
2041 // lowest index.
2042 for (int I = getNumLanes(); I > 0; --I) {
2043 unsigned Lane = I - 1;
2044 OperandsOrderData NumFreeOpsHash =
2045 getMaxNumOperandsThatCanBeReordered(Lane);
2046 // Compare the number of operands that can move and choose the one with
2047 // the least number.
2048 if (NumFreeOpsHash.NumOfAPOs < Min) {
2049 Min = NumFreeOpsHash.NumOfAPOs;
2050 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2051 HashMap.clear();
2052 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2053 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2054 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2055 // Select the most optimal lane in terms of number of operands that
2056 // should be moved around.
2057 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2058 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2059 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2060 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2061 auto *It = HashMap.find(NumFreeOpsHash.Hash);
2062 if (It == HashMap.end())
2063 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2064 else
2065 ++It->second.first;
2066 }
2067 }
2068 // Select the lane with the minimum counter.
2069 unsigned BestLane = 0;
2070 unsigned CntMin = UINT_MAX;
2071 for (const auto &Data : reverse(HashMap)) {
2072 if (Data.second.first < CntMin) {
2073 CntMin = Data.second.first;
2074 BestLane = Data.second.second;
2075 }
2076 }
2077 return BestLane;
2078 }
2079
2080 /// Data structure that helps to reorder operands.
2081 struct OperandsOrderData {
2082 /// The best number of operands with the same APOs, which can be
2083 /// reordered.
2084 unsigned NumOfAPOs = UINT_MAX;
2085 /// Number of operands with the same/alternate instruction opcode and
2086 /// parent.
2087 unsigned NumOpsWithSameOpcodeParent = 0;
2088 /// Hash for the actual operands ordering.
2089 /// Used to count operands, actually their position id and opcode
2090 /// value. It is used in the voting mechanism to find the lane with the
2091 /// least number of operands that can freely move about, or is less profitable
2092 /// because it already has the most optimal set of operands. Can be
2093 /// replaced with SmallVector<unsigned> instead but hash code is faster
2094 /// and requires less memory.
2095 unsigned Hash = 0;
2096 };
2097 /// \returns the maximum number of operands that are allowed to be reordered
2098 /// for \p Lane and the number of compatible instructions (with the same
2099 /// parent/opcode). This is used as a heuristic for selecting the first lane
2100 /// to start operand reordering.
2101 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2102 unsigned CntTrue = 0;
2103 unsigned NumOperands = getNumOperands();
2104 // Operands with the same APO can be reordered. We therefore need to count
2105 // how many of them we have for each APO, like this: Cnt[APO] = x.
2106 // Since we only have two APOs, namely true and false, we can avoid using
2107 // a map. Instead we can simply count the number of operands that
2108 // correspond to one of them (in this case the 'true' APO), and calculate
2109 // the other by subtracting it from the total number of operands.
2110 // Operands with the same instruction opcode and parent are more
2111 // profitable since we don't need to move them in many cases, with a high
2112 // probability such lane already can be vectorized effectively.
2113 bool AllUndefs = true;
2114 unsigned NumOpsWithSameOpcodeParent = 0;
2115 Instruction *OpcodeI = nullptr;
2116 BasicBlock *Parent = nullptr;
2117 unsigned Hash = 0;
2118 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2119 const OperandData &OpData = getData(OpIdx, Lane);
2120 if (OpData.APO)
2121 ++CntTrue;
2122 // Use Boyer-Moore majority voting for finding the majority opcode and
2123 // the number of times it occurs.
2124 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2125 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2126 I->getParent() != Parent) {
2127 if (NumOpsWithSameOpcodeParent == 0) {
2128 NumOpsWithSameOpcodeParent = 1;
2129 OpcodeI = I;
2130 Parent = I->getParent();
2131 } else {
2132 --NumOpsWithSameOpcodeParent;
2133 }
2134 } else {
2135 ++NumOpsWithSameOpcodeParent;
2136 }
2137 }
2138 Hash = hash_combine(
2139 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2140 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2141 }
2142 if (AllUndefs)
2143 return {};
2144 OperandsOrderData Data;
2145 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2146 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2147 Data.Hash = Hash;
2148 return Data;
2149 }
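// Illustrative count (hypothetical lanes): for a lane A[1] = C[1] - B[1] the
// operand APOs are {false, true}, so CntTrue = 1 and NumOfAPOs = max(1, 1) =
// 1, while for A[0] = B[0] + C[0] both APOs are false and NumOfAPOs = 2. The
// lane with the smaller value (the subtraction) is the better lane to start
// reordering from, matching the example in reorder() below.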
2150
2151 /// Go through the instructions in VL and append their operands.
2152 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2153 assert(!VL.empty() && "Bad VL");
2154 assert((empty() || VL.size() == getNumLanes()) &&
2155 "Expected same number of lanes");
2156 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2157 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2158 constexpr unsigned IntrinsicNumOperands = 2;
2159 if (isa<IntrinsicInst>(VL[0]))
2160 NumOperands = IntrinsicNumOperands;
2161 OpsVec.resize(NumOperands);
2162 unsigned NumLanes = VL.size();
2163 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2164 OpsVec[OpIdx].resize(NumLanes);
2165 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2166 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2167 // Our tree has just 3 nodes: the root and two operands.
2168 // It is therefore trivial to get the APO. We only need to check the
2169 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2170 // RHS operand. The LHS operand of both add and sub is never attached
2171 // to an inverse operation in the linearized form, therefore its APO
2172 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2173
2174 // Since operand reordering is performed on groups of commutative
2175 // operations or alternating sequences (e.g., +, -), we can safely
2176 // tell the inverse operations by checking commutativity.
2177 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2178 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2179 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2180 APO, false};
2181 }
2182 }
2183 }
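// Illustrative layout (hypothetical bundle VL = { a0 + b0, a1 - b1 }):
//   OpsVec[0] = { {a0, APO=false}, {a1, APO=false} }  // LHS operand per lane
//   OpsVec[1] = { {b0, APO=false}, {b1, APO=true} }   // RHS operand per lane
// Only the RHS of the subtraction is attached to an inverse operation, so it
// is the only operand with APO set.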
2184
2185 /// \returns the number of operands.
2186 unsigned getNumOperands() const { return OpsVec.size(); }
2187
2188 /// \returns the number of lanes.
2189 unsigned getNumLanes() const { return OpsVec[0].size(); }
2190
2191 /// \returns the operand value at \p OpIdx and \p Lane.
2192 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2193 return getData(OpIdx, Lane).V;
2194 }
2195
2196 /// \returns true if the data structure is empty.
2197 bool empty() const { return OpsVec.empty(); }
2198
2199 /// Clears the data.
2200 void clear() { OpsVec.clear(); }
2201
2202 /// \returns true if there are enough operands identical to \p Op to fill
2203 /// the whole vector (it is mixed with constants or loop invariant values).
2204 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2205 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2206 bool OpAPO = getData(OpIdx, Lane).APO;
2207 bool IsInvariant = L && L->isLoopInvariant(Op);
2208 unsigned Cnt = 0;
2209 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2210 if (Ln == Lane)
2211 continue;
2212 // This is set to true if we found a candidate for broadcast at Lane.
2213 bool FoundCandidate = false;
2214 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2215 OperandData &Data = getData(OpI, Ln);
2216 if (Data.APO != OpAPO || Data.IsUsed)
2217 continue;
2218 Value *OpILane = getValue(OpI, Lane);
2219 bool IsConstantOp = isa<Constant>(OpILane);
2220 // Consider the broadcast candidate if:
2221 // 1. Same value is found in one of the operands.
2222 if (Data.V == Op ||
2223 // 2. The operand in the given lane is not constant but there is a
2224 // constant operand in another lane (which can be moved to the
2225 // given lane). In this case we can represent it as a simple
2226 // permutation of constant and broadcast.
2227 (!IsConstantOp &&
2228 ((Lns > 2 && isa<Constant>(Data.V)) ||
2229 // 2.1. If we have only 2 lanes, need to check that value in the
2230 // next lane does not build same opcode sequence.
2231 (Lns == 2 &&
2232 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2233 .getOpcode() &&
2234 isa<Constant>(Data.V)))) ||
2235 // 3. The operand in the current lane is loop invariant (can be
2236 // hoisted out) and another operand is also a loop invariant
2237 // (though not a constant). In this case the whole vector can be
2238 // hoisted out.
2239 // FIXME: need to teach the cost model about this case for better
2240 // estimation.
2241 (IsInvariant && !isa<Constant>(Data.V) &&
2242 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2243 L->isLoopInvariant(Data.V))) {
2244 FoundCandidate = true;
2245 Data.IsUsed = Data.V == Op;
2246 if (Data.V == Op)
2247 ++Cnt;
2248 break;
2249 }
2250 }
2251 if (!FoundCandidate)
2252 return false;
2253 }
2254 return getNumLanes() == 2 || Cnt > 1;
2255 }
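// Illustrative decision (hypothetical operands): with Op = X at the current
// lane, finding X again in another lane increments Cnt, and a lane whose only
// match is a constant still counts as a candidate because the constant can be
// permuted into this operand index; if any lane offers no candidate at all,
// broadcasting is rejected.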
2256
2257 /// Checks if there is at least a single compatible operand in lanes other
2258 /// than \p Lane, compatible with the operand \p Op.
2259 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2260 bool OpAPO = getData(OpIdx, Lane).APO;
2261 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2262 if (Ln == Lane)
2263 continue;
2264 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2265 const OperandData &Data = getData(OpI, Ln);
2266 if (Data.APO != OpAPO || Data.IsUsed)
2267 return true;
2268 Value *OpILn = getValue(OpI, Ln);
2269 return (L && L->isLoopInvariant(OpILn)) ||
2270 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2271 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2272 }))
2273 return true;
2274 }
2275 return false;
2276 }
2277
2278 public:
2279 /// Initialize with all the operands of the instruction vector \p RootVL.
2280 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2281 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2282 L(R.LI->getLoopFor(
2283 (cast<Instruction>(RootVL.front())->getParent()))) {
2284 // Append all the operands of RootVL.
2285 appendOperandsOfVL(RootVL);
2286 }
2287
2288 /// \returns a value vector with the operands across all lanes for the
2289 /// operand at \p OpIdx.
2290 ValueList getVL(unsigned OpIdx) const {
2291 ValueList OpVL(OpsVec[OpIdx].size());
2292 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2293 "Expected same num of lanes across all operands");
2294 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2295 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2296 return OpVL;
2297 }
2298
2299 // Performs operand reordering for 2 or more operands.
2300 // The original operands are in OrigOps[OpIdx][Lane].
2301 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2302 void reorder() {
2303 unsigned NumOperands = getNumOperands();
2304 unsigned NumLanes = getNumLanes();
2305 // Each operand has its own mode. We are using this mode to help us select
2306 // the instructions for each lane, so that they match best with the ones
2307 // we have selected so far.
2308 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2309
2310 // This is a greedy single-pass algorithm. We are going over each lane
2311 // once and deciding on the best order right away with no back-tracking.
2312 // However, in order to increase its effectiveness, we start with the lane
2313 // that has operands that can move the least. For example, given the
2314 // following lanes:
2315 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2316 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2317 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2318 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2319 // we will start at Lane 1, since the operands of the subtraction cannot
2320 // be reordered. Then we will visit the rest of the lanes in a circular
2321 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2322
2323 // Find the first lane that we will start our search from.
2324 unsigned FirstLane = getBestLaneToStartReordering();
2325
2326 // Initialize the modes.
2327 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2328 Value *OpLane0 = getValue(OpIdx, FirstLane);
2329 // Keep track if we have instructions with all the same opcode on one
2330 // side.
2331 if (isa<LoadInst>(OpLane0))
2332 ReorderingModes[OpIdx] = ReorderingMode::Load;
2333 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2334 // Check if OpLane0 should be broadcast.
2335 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2336 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2337 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2338 else
2339 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2340 } else if (isa<Constant>(OpLane0))
2341 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2342 else if (isa<Argument>(OpLane0))
2343 // Our best hope is a Splat. It may save some cost in some cases.
2344 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2345 else
2346 // NOTE: This should be unreachable.
2347 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2348 }
2349
2350 // Check that we don't have same operands. No need to reorder if operands
2351 // are just perfect diamond or shuffled diamond match. Do not do it only
2352 // for possible broadcasts or non-power of 2 number of scalars (just for
2353 // now).
2354 auto &&SkipReordering = [this]() {
2355 SmallPtrSet<Value *, 4> UniqueValues;
2356 ArrayRef<OperandData> Op0 = OpsVec.front();
2357 for (const OperandData &Data : Op0)
2358 UniqueValues.insert(Data.V);
2359 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2360 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2361 return !UniqueValues.contains(Data.V);
2362 }))
2363 return false;
2364 }
2365 // TODO: Check if we can remove a check for non-power-2 number of
2366 // scalars after full support of non-power-2 vectorization.
2367 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2368 };
2369
2370 // If the initial strategy fails for any of the operand indexes, then we
2371 // perform reordering again in a second pass. This helps avoid assigning
2372 // high priority to the failed strategy, and should improve reordering for
2373 // the non-failed operand indexes.
2374 for (int Pass = 0; Pass != 2; ++Pass) {
2375 // Check if there is no need to reorder operands since they are a perfect or
2376 // shuffled diamond match.
2377 // Need to do it to avoid extra external use cost counting for
2378 // shuffled matches, which may cause regressions.
2379 if (SkipReordering())
2380 break;
2381 // Skip the second pass if the first pass did not fail.
2382 bool StrategyFailed = false;
2383 // Mark all operand data as free to use.
2384 clearUsed();
2385 // We keep the original operand order for the FirstLane, so reorder the
2386 // rest of the lanes. We are visiting the nodes in a circular fashion,
2387 // using FirstLane as the center point and increasing the radius
2388 // distance.
2389 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2390 for (unsigned I = 0; I < NumOperands; ++I)
2391 MainAltOps[I].push_back(getData(I, FirstLane).V);
2392
2393 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2394 // Visit the lane on the right and then the lane on the left.
2395 for (int Direction : {+1, -1}) {
2396 int Lane = FirstLane + Direction * Distance;
2397 if (Lane < 0 || Lane >= (int)NumLanes)
2398 continue;
2399 int LastLane = Lane - Direction;
2400 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2401 "Out of bounds");
2402 // Look for a good match for each operand.
2403 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2404 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2405 std::optional<unsigned> BestIdx = getBestOperand(
2406 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2407 // By not selecting a value, we allow the operands that follow to
2408 // select a better matching value. We will get a non-null value in
2409 // the next run of getBestOperand().
2410 if (BestIdx) {
2411 // Swap the current operand with the one returned by
2412 // getBestOperand().
2413 swap(OpIdx, *BestIdx, Lane);
2414 } else {
2415 // Enable the second pass.
2416 StrategyFailed = true;
2417 }
2418 // Try to get the alternate opcode and follow it during analysis.
2419 if (MainAltOps[OpIdx].size() != 2) {
2420 OperandData &AltOp = getData(OpIdx, Lane);
2421 InstructionsState OpS =
2422 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2423 if (OpS.getOpcode() && OpS.isAltShuffle())
2424 MainAltOps[OpIdx].push_back(AltOp.V);
2425 }
2426 }
2427 }
2428 }
2429 // Skip second pass if the strategy did not fail.
2430 if (!StrategyFailed)
2431 break;
2432 }
2433 }
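// Illustrative pass (hypothetical lanes from the comment above): starting at
// Lane 1 (C[1] - B[1]), the loop visits Lane 2, then Lane 0, then Lane 3; for
// each operand index getBestOperand() picks the value that best matches the
// previously fixed lane and swap() moves it into place, e.g. aligning all
// B[i] under one operand index and all C[i] under the other.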
2434
2435#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2436 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2437 switch (RMode) {
2438 case ReorderingMode::Load:
2439 return "Load";
2440 case ReorderingMode::Opcode:
2441 return "Opcode";
2442 case ReorderingMode::Constant:
2443 return "Constant";
2444 case ReorderingMode::Splat:
2445 return "Splat";
2446 case ReorderingMode::Failed:
2447 return "Failed";
2448 }
2449 llvm_unreachable("Unimplemented Reordering Type");
2450 }
2451
2452 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2453 raw_ostream &OS) {
2454 return OS << getModeStr(RMode);
2455 }
2456
2457 /// Debug print.
2458 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2459 printMode(RMode, dbgs());
2460 }
2461
2462 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2463 return printMode(RMode, OS);
2464 }
2465
2466 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2467 const unsigned Indent = 2;
2468 unsigned Cnt = 0;
2469 for (const OperandDataVec &OpDataVec : OpsVec) {
2470 OS << "Operand " << Cnt++ << "\n";
2471 for (const OperandData &OpData : OpDataVec) {
2472 OS.indent(Indent) << "{";
2473 if (Value *V = OpData.V)
2474 OS << *V;
2475 else
2476 OS << "null";
2477 OS << ", APO:" << OpData.APO << "}\n";
2478 }
2479 OS << "\n";
2480 }
2481 return OS;
2482 }
2483
2484 /// Debug print.
2485 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2486#endif
2487 };
2488
2489 /// Evaluate each pair in \p Candidates and return the index into \p
2490 /// Candidates of the pair with the highest score, deemed to have the best
2491 /// chance to form the root of a profitable tree to vectorize. Return
2492 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2493 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2494 std::optional<int>
2495 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2496 int Limit = LookAheadHeuristics::ScoreFail) const {
2497 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2498 RootLookAheadMaxDepth);
2499 int BestScore = Limit;
2500 std::optional<int> Index;
2501 for (int I : seq<int>(0, Candidates.size())) {
2502 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2503 Candidates[I].second,
2504 /*U1=*/nullptr, /*U2=*/nullptr,
2505 /*Level=*/1, std::nullopt);
2506 if (Score > BestScore) {
2507 BestScore = Score;
2508 Index = I;
2509 }
2510 }
2511 return Index;
2512 }
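// Illustrative use (hypothetical candidates): given the pairs (a[0], a[1])
// and (a[0], x), where a[0]/a[1] are consecutive loads and x is unrelated,
// the first pair gets the higher look-ahead score and its index is returned;
// std::nullopt is returned only if no pair scores above \p Limit.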
2513
2514 /// Checks if the instruction is marked for deletion.
2515 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2516
2517 /// Removes an instruction from its block and eventually deletes it.
2518 /// It's like Instruction::eraseFromParent() except that the actual deletion
2519 /// is delayed until BoUpSLP is destructed.
2520 void eraseInstruction(Instruction *I) {
2521 DeletedInstructions.insert(I);
2522 }
2523
2524 /// Remove instructions from the parent function and clear the operands of \p
2525 /// DeadVals instructions, marking for deletion trivially dead operands.
2526 template <typename T>
2527 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2528 SmallVector<WeakTrackingVH> DeadInsts;
2529 for (T *V : DeadVals) {
2530 auto *I = cast<Instruction>(V);
2531 DeletedInstructions.insert(I);
2532 }
2533 DenseSet<Value *> Processed;
2534 for (T *V : DeadVals) {
2535 if (!V || !Processed.insert(V).second)
2536 continue;
2537 auto *I = cast<Instruction>(V);
2538 salvageDebugInfo(*I);
2539 SmallVector<const TreeEntry *> Entries;
2540 if (const TreeEntry *Entry = getTreeEntry(I)) {
2541 Entries.push_back(Entry);
2542 auto It = MultiNodeScalars.find(I);
2543 if (It != MultiNodeScalars.end())
2544 Entries.append(It->second.begin(), It->second.end());
2545 }
2546 for (Use &U : I->operands()) {
2547 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2548 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2549 wouldInstructionBeTriviallyDead(OpI, TLI) &&
2550 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2551 return Entry->VectorizedValue == OpI;
2552 })))
2553 DeadInsts.push_back(OpI);
2554 }
2555 I->dropAllReferences();
2556 }
2557 for (T *V : DeadVals) {
2558 auto *I = cast<Instruction>(V);
2559 if (!I->getParent())
2560 continue;
2561 assert((I->use_empty() || all_of(I->uses(),
2562 [&](Use &U) {
2563 return isDeleted(
2564 cast<Instruction>(U.getUser()));
2565 })) &&
2566 "trying to erase instruction with users.");
2567 I->removeFromParent();
2568 SE->forgetValue(I);
2569 }
2570 // Process the dead instruction list until empty.
2571 while (!DeadInsts.empty()) {
2572 Value *V = DeadInsts.pop_back_val();
2573 Instruction *VI = cast_or_null<Instruction>(V);
2574 if (!VI || !VI->getParent())
2575 continue;
2576 assert(isInstructionTriviallyDead(VI, TLI) &&
2577 "Live instruction found in dead worklist!");
2578 assert(VI->use_empty() && "Instructions with uses are not dead.");
2579
2580 // Don't lose the debug info while deleting the instructions.
2581 salvageDebugInfo(*VI);
2582
2583 // Null out all of the instruction's operands to see if any operand
2584 // becomes dead as we go.
2585 for (Use &OpU : VI->operands()) {
2586 Value *OpV = OpU.get();
2587 if (!OpV)
2588 continue;
2589 OpU.set(nullptr);
2590
2591 if (!OpV->use_empty())
2592 continue;
2593
2594 // If the operand is an instruction that became dead as we nulled out
2595 // the operand, and if it is 'trivially' dead, delete it in a future
2596 // loop iteration.
2597 if (auto *OpI = dyn_cast<Instruction>(OpV))
2598 if (!DeletedInstructions.contains(OpI) &&
2599 isInstructionTriviallyDead(OpI, TLI))
2600 DeadInsts.push_back(OpI);
2601 }
2602
2603 VI->removeFromParent();
2604 DeletedInstructions.insert(VI);
2605 SE->forgetValue(VI);
2606 }
2607 }
2608
2609 /// Checks if the instruction was already analyzed for being possible
2610 /// reduction root.
2611 bool isAnalyzedReductionRoot(Instruction *I) const {
2612 return AnalyzedReductionsRoots.count(I);
2613 }
2614 /// Register given instruction as already analyzed for being possible
2615 /// reduction root.
2616 void analyzedReductionRoot(Instruction *I) {
2617 AnalyzedReductionsRoots.insert(I);
2618 }
2619 /// Checks if the provided list of reduced values was checked already for
2620 /// vectorization.
2621 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2622 return AnalyzedReductionVals.contains(hash_value(VL));
2623 }
2624 /// Adds the list of reduced values to list of already checked values for the
2625 /// vectorization.
2626 void analyzedReductionVals(ArrayRef<Value *> VL) {
2627 AnalyzedReductionVals.insert(hash_value(VL));
2628 }
2629 /// Clear the list of the analyzed reduction root instructions.
2630 void clearReductionData() {
2631 AnalyzedReductionsRoots.clear();
2632 AnalyzedReductionVals.clear();
2633 AnalyzedMinBWVals.clear();
2634 }
2635 /// Checks if the given value is gathered in one of the nodes.
2636 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2637 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2638 }
2639 /// Checks if the given value is gathered in one of the nodes.
2640 bool isGathered(const Value *V) const {
2641 return MustGather.contains(V);
2642 }
2643 /// Checks if the specified value was not scheduled.
2644 bool isNotScheduled(const Value *V) const {
2645 return NonScheduledFirst.contains(V);
2646 }
2647
2648 /// Check if the value is vectorized in the tree.
2649 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2650
2651 ~BoUpSLP();
2652
2653private:
2654 /// Determine if a node \p E can be demoted to a smaller type with a
2655 /// truncation. We collect the entries that will be demoted in ToDemote.
2656 /// \param E Node for analysis
2657 /// \param ToDemote indices of the nodes to be demoted.
2658 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2659 unsigned &BitWidth,
2660 SmallVectorImpl<unsigned> &ToDemote,
2661 DenseSet<const TreeEntry *> &Visited,
2662 unsigned &MaxDepthLevel,
2663 bool &IsProfitableToDemote,
2664 bool IsTruncRoot) const;
2665
2666 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2667 /// reordering (i.e. the operands can be reordered because they have only one
2668 /// user and are reorderable).
2669 /// \param ReorderableGathers List of all gather nodes that require reordering
2670 /// (e.g., gather of extractelements or partially vectorizable loads).
2671 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2672 /// reordering, subset of \p NonVectorized.
2673 bool
2674 canReorderOperands(TreeEntry *UserTE,
2675 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2676 ArrayRef<TreeEntry *> ReorderableGathers,
2677 SmallVectorImpl<TreeEntry *> &GatherOps);
2678
2679 /// Checks if the given \p TE is a gather node with clustered reused scalars
2680 /// and reorders it per given \p Mask.
2681 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2682
2683 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2684 /// if any. If it is not vectorized (gather node), returns nullptr.
2685 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2686 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2687 TreeEntry *TE = nullptr;
2688 const auto *It = find_if(VL, [&](Value *V) {
2689 TE = getTreeEntry(V);
2690 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2691 return true;
2692 auto It = MultiNodeScalars.find(V);
2693 if (It != MultiNodeScalars.end()) {
2694 for (TreeEntry *E : It->second) {
2695 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2696 TE = E;
2697 return true;
2698 }
2699 }
2700 }
2701 return false;
2702 });
2703 if (It != VL.end()) {
2704 assert(TE->isSame(VL) && "Expected same scalars.");
2705 return TE;
2706 }
2707 return nullptr;
2708 }
2709
2710 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2711 /// if any. If it is not vectorized (gather node), returns nullptr.
2712 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2713 unsigned OpIdx) const {
2714 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2715 const_cast<TreeEntry *>(UserTE), OpIdx);
2716 }
2717
2718 /// Checks if all users of \p I are the part of the vectorization tree.
2719 bool areAllUsersVectorized(
2720 Instruction *I,
2721 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2722
2723 /// Return information about the vector formed for the specified index
2724 /// of a vector of (the same) instruction.
2725 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2726
2727 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2728 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2729
2730 /// \returns Cast context for the given graph node.
2731 TargetTransformInfo::CastContextHint
2732 getCastContextHint(const TreeEntry &TE) const;
2733
2734 /// \returns the cost of the vectorizable entry.
2735 InstructionCost getEntryCost(const TreeEntry *E,
2736 ArrayRef<Value *> VectorizedVals,
2737 SmallPtrSetImpl<Value *> &CheckedExtracts);
2738
2739 /// This is the recursive part of buildTree.
2740 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2741 const EdgeInfo &EI);
2742
2743 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2744 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2745 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2746 /// returns false, setting \p CurrentOrder to either an empty vector or a
2747 /// non-identity permutation that allows to reuse extract instructions.
2748 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2749 /// extract order.
2750 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2751 SmallVectorImpl<unsigned> &CurrentOrder,
2752 bool ResizeAllowed = false) const;
2753
2754 /// Vectorize a single entry in the tree.
2755 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2756 /// postponed to avoid issues with def-use order.
2757 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2758
2759 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2760 /// \p E.
2761 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2762 /// postponed to avoid issues with def-use order.
2763 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2764
2765 /// Create a new vector from a list of scalar values. Produces a sequence
2766 /// which exploits values reused across lanes, and arranges the inserts
2767 /// for ease of later optimization.
2768 template <typename BVTy, typename ResTy, typename... Args>
2769 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2770
2771 /// Create a new vector from a list of scalar values. Produces a sequence
2772 /// which exploits values reused across lanes, and arranges the inserts
2773 /// for ease of later optimization.
2774 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2775
2776 /// Returns the instruction in the bundle, which can be used as a base point
2777 /// for scheduling. Usually it is the last instruction in the bundle, except
2778 /// for the case when all operands are external (in this case, it is the first
2779 /// instruction in the list).
2780 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2781
2782 /// Tries to find extractelement instructions with constant indices from fixed
2783 /// vector type and gather such instructions into a bunch, which is highly
2784 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2785 /// was successful, the matched scalars are replaced by poison values in \p VL
2786 /// for future analysis.
2787 std::optional<TargetTransformInfo::ShuffleKind>
2788 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2789 SmallVectorImpl<int> &Mask) const;
2790
2791 /// Tries to find extractelement instructions with constant indices from fixed
2792 /// vector type and gather such instructions into a bunch, which is highly
2793 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2794 /// was successful, the matched scalars are replaced by poison values in \p VL
2795 /// for future analysis.
2796 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2797 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2798 SmallVectorImpl<int> &Mask,
2799 unsigned NumParts) const;
2800
2801 /// Checks if the gathered \p VL can be represented as a single register
2802 /// shuffle(s) of previous tree entries.
2803 /// \param TE Tree entry checked for permutation.
2804 /// \param VL List of scalars (a subset of the TE scalar), checked for
2805 /// permutations. Must form single-register vector.
2806 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2807 /// commands to build the mask using the original vector value, without
2808 /// relying on the potential reordering.
2809 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2810 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2811 std::optional<TargetTransformInfo::ShuffleKind>
2812 isGatherShuffledSingleRegisterEntry(
2813 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2814 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2815 bool ForOrder);
2816
2817 /// Checks if the gathered \p VL can be represented as multi-register
2818 /// shuffle(s) of previous tree entries.
2819 /// \param TE Tree entry checked for permutation.
2820 /// \param VL List of scalars (a subset of the TE scalar), checked for
2821 /// permutations.
2822 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2823 /// commands to build the mask using the original vector value, without
2824 /// relying on the potential reordering.
2825 /// \returns per-register series of ShuffleKind, if gathered values can be
2826 /// represented as shuffles of previous tree entries. \p Mask is filled with
2827 /// the shuffle mask (also on per-register base).
2828 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2829 isGatherShuffledEntry(
2830 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2831 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2832 unsigned NumParts, bool ForOrder = false);
2833
2834 /// \returns the scalarization cost for this list of values. Assuming that
2835 /// this subtree gets vectorized, we may need to extract the values from the
2836 /// roots. This method calculates the cost of extracting the values.
2837 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2838 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2839 Type *ScalarTy) const;
2840
2841 /// Set the Builder insert point to one after the last instruction in
2842 /// the bundle
2843 void setInsertPointAfterBundle(const TreeEntry *E);
2844
2845 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2846 /// specified, the starting vector value is poison.
2847 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2848
2849 /// \returns whether the VectorizableTree is fully vectorizable and will
2850 /// be beneficial even the tree height is tiny.
2851 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2852
2853 /// Reorder commutative or alt operands to get better probability of
2854 /// generating vectorized code.
2855 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2856 SmallVectorImpl<Value *> &Left,
2857 SmallVectorImpl<Value *> &Right,
2858 const BoUpSLP &R);
2859
2860 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2861 /// users of \p TE and collects the stores. It returns the map from the store
2862 /// pointers to the collected stores.
2863 DenseMap<Value *, SmallVector<StoreInst *>>
2864 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2865
2866 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2867 /// stores in \p StoresVec can form a vector instruction. If so it returns
2868 /// true and populates \p ReorderIndices with the shuffle indices of the
2869 /// stores when compared to the sorted vector.
2870 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2871 OrdersType &ReorderIndices) const;
2872
2873 /// Iterates through the users of \p TE, looking for scalar stores that can be
2874 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2875 /// their order and builds an order index vector for each store bundle. It
2876 /// returns all these order vectors found.
2877 /// We run this after the tree has formed, otherwise we may come across user
2878 /// instructions that are not yet in the tree.
2879 SmallVector<OrdersType, 1>
2880 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2881
2882 struct TreeEntry {
2883 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2884 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2885
2886 /// \returns Common mask for reorder indices and reused scalars.
2887 SmallVector<int> getCommonMask() const {
2888 SmallVector<int> Mask;
2889 inversePermutation(ReorderIndices, Mask);
2890 ::addMask(Mask, ReuseShuffleIndices);
2891 return Mask;
2892 }
2893
2894 /// \returns true if the scalars in VL are equal to this entry.
2895 bool isSame(ArrayRef<Value *> VL) const {
2896 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2897 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2898 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2899 return VL.size() == Mask.size() &&
2900 std::equal(VL.begin(), VL.end(), Mask.begin(),
2901 [Scalars](Value *V, int Idx) {
2902 return (isa<UndefValue>(V) &&
2903 Idx == PoisonMaskElem) ||
2904 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2905 });
2906 };
2907 if (!ReorderIndices.empty()) {
2908 // TODO: implement matching if the nodes are just reordered, still can
2909 // treat the vector as the same if the list of scalars matches VL
2910 // directly, without reordering.
2911 SmallVector<int> Mask;
2912 inversePermutation(ReorderIndices, Mask);
2913 if (VL.size() == Scalars.size())
2914 return IsSame(Scalars, Mask);
2915 if (VL.size() == ReuseShuffleIndices.size()) {
2916 ::addMask(Mask, ReuseShuffleIndices);
2917 return IsSame(Scalars, Mask);
2918 }
2919 return false;
2920 }
2921 return IsSame(Scalars, ReuseShuffleIndices);
2922 }
2923
2924 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2925 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2926 UserTreeIndices.front().UserTE == UserEI.UserTE;
2927 }
2928
2929 /// \returns true if current entry has same operands as \p TE.
2930 bool hasEqualOperands(const TreeEntry &TE) const {
2931 if (TE.getNumOperands() != getNumOperands())
2932 return false;
2933 SmallBitVector Used(getNumOperands());
2934 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2935 unsigned PrevCount = Used.count();
2936 for (unsigned K = 0; K < E; ++K) {
2937 if (Used.test(K))
2938 continue;
2939 if (getOperand(K) == TE.getOperand(I)) {
2940 Used.set(K);
2941 break;
2942 }
2943 }
2944 // Check if we actually found the matching operand.
2945 if (PrevCount == Used.count())
2946 return false;
2947 }
2948 return true;
2949 }
2950
2951 /// \return Final vectorization factor for the node. Defined by the total
2952 /// number of vectorized scalars, including those, used several times in the
2953 /// entry and counted in the \a ReuseShuffleIndices, if any.
2954 unsigned getVectorFactor() const {
2955 if (!ReuseShuffleIndices.empty())
2956 return ReuseShuffleIndices.size();
2957 return Scalars.size();
2958 };
2959
2960 /// Checks if the current node is a gather node.
2961 bool isGather() const { return State == NeedToGather; }
2962
2963 /// A vector of scalars.
2964 ValueList Scalars;
2965
2966 /// The Scalars are vectorized into this value. It is initialized to Null.
2967 WeakTrackingVH VectorizedValue = nullptr;
2968
2969 /// New vector phi instructions emitted for the vectorized phi nodes.
2970 PHINode *PHI = nullptr;
2971
2972 /// Do we need to gather this sequence or vectorize it
2973 /// (either with vector instruction or with scatter/gather
2974 /// intrinsics for store/load)?
2975 enum EntryState {
2976 Vectorize, ///< The node is regularly vectorized.
2977 ScatterVectorize, ///< Masked scatter/gather node.
2978 StridedVectorize, ///< Strided loads (and stores)
2979 NeedToGather, ///< Gather/buildvector node.
2980 CombinedVectorize, ///< Vectorized node, combined with its user into more
2981 ///< complex node like select/cmp to minmax, mul/add to
2982 ///< fma, etc. Must be used for the following nodes in
2983 ///< the pattern, not the very first one.
2984 };
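// For intuition (examples, not from the original source): a bundle of
// consecutive loads typically becomes Vectorize, loads at a common (possibly
// runtime) stride become StridedVectorize, loads from unrelated pointers may
// become ScatterVectorize (masked gather), and scalars that cannot be
// vectorized together end up as a NeedToGather (buildvector) node.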
2985 EntryState State;
2986
2987 /// List of combined opcodes supported by the vectorizer.
2988 enum CombinedOpcode {
2989 NotCombinedOp = -1,
2990 MinMax = Instruction::OtherOpsEnd + 1,
2991 };
2992 CombinedOpcode CombinedOp = NotCombinedOp;
2993
2994 /// Does this sequence require some shuffling?
2995 SmallVector<int, 4> ReuseShuffleIndices;
2996
2997 /// Does this entry require reordering?
2998 SmallVector<unsigned, 4> ReorderIndices;
2999
3000 /// Points back to the VectorizableTree.
3001 ///
3002 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3003 /// to be a pointer and needs to be able to initialize the child iterator.
3004 /// Thus we need a reference back to the container to translate the indices
3005 /// to entries.
3006 VecTreeTy &Container;
3007
3008 /// The TreeEntry index containing the user of this entry. We can actually
3009 /// have multiple users so the data structure is not truly a tree.
3010 SmallVector<EdgeInfo, 1> UserTreeIndices;
3011
3012 /// The index of this treeEntry in VectorizableTree.
3013 int Idx = -1;
3014
3015 private:
3016 /// The operands of each instruction in each lane: Operands[op_index][lane].
3017 /// Note: This helps avoid the replication of the code that performs the
3018 /// reordering of operands during buildTree_rec() and vectorizeTree().
3019 SmallVector<ValueList, 2> Operands;
3020
3021 /// The main/alternate instruction.
3022 Instruction *MainOp = nullptr;
3023 Instruction *AltOp = nullptr;
3024
3025 public:
3026 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3027 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3028 if (Operands.size() < OpIdx + 1)
3029 Operands.resize(OpIdx + 1);
3030 assert(Operands[OpIdx].empty() && "Already resized?");
3031 assert(OpVL.size() <= Scalars.size() &&
3032 "Number of operands is greater than the number of scalars.");
3033 Operands[OpIdx].resize(OpVL.size());
3034 copy(OpVL, Operands[OpIdx].begin());
3035 }
3036
3037 /// Set the operands of this bundle in their original order.
3038 void setOperandsInOrder() {
3039 assert(Operands.empty() && "Already initialized?");
3040 auto *I0 = cast<Instruction>(Scalars[0]);
3041 Operands.resize(I0->getNumOperands());
3042 unsigned NumLanes = Scalars.size();
3043 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3044 OpIdx != NumOperands; ++OpIdx) {
3045 Operands[OpIdx].resize(NumLanes);
3046 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3047 auto *I = cast<Instruction>(Scalars[Lane]);
3048 assert(I->getNumOperands() == NumOperands &&
3049 "Expected same number of operands");
3050 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3051 }
3052 }
3053 }
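// For example (hypothetical bundle, not from the original source): for a
// bundle of four 'add' instructions, Operands[0] receives the four LHS values
// lane by lane and Operands[1] the four RHS values, preserving the original
// operand order of every scalar.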
3054
3055 /// Reorders operands of the node to the given mask \p Mask.
3056 void reorderOperands(ArrayRef<int> Mask) {
3057 for (ValueList &Operand : Operands)
3058 reorderScalars(Operand, Mask);
3059 }
3060
3061 /// \returns the \p OpIdx operand of this TreeEntry.
3062 ValueList &getOperand(unsigned OpIdx) {
3063 assert(OpIdx < Operands.size() && "Off bounds");
3064 return Operands[OpIdx];
3065 }
3066
3067 /// \returns the \p OpIdx operand of this TreeEntry.
3068 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3069 assert(OpIdx < Operands.size() && "Off bounds");
3070 return Operands[OpIdx];
3071 }
3072
3073 /// \returns the number of operands.
3074 unsigned getNumOperands() const { return Operands.size(); }
3075
3076 /// \return the single \p OpIdx operand.
3077 Value *getSingleOperand(unsigned OpIdx) const {
3078 assert(OpIdx < Operands.size() && "Off bounds");
3079 assert(!Operands[OpIdx].empty() && "No operand available");
3080 return Operands[OpIdx][0];
3081 }
3082
3083 /// Some of the instructions in the list have alternate opcodes.
3084 bool isAltShuffle() const { return MainOp != AltOp; }
3085
3086 bool isOpcodeOrAlt(Instruction *I) const {
3087 unsigned CheckedOpcode = I->getOpcode();
3088 return (getOpcode() == CheckedOpcode ||
3089 getAltOpcode() == CheckedOpcode);
3090 }
3091
3092 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3093 /// alternate) opcode as this entry's main/alternate instruction, the key is
3094 /// \p Op. Otherwise the key is \a MainOp.
3095 Value *isOneOf(Value *Op) const {
3096 auto *I = dyn_cast<Instruction>(Op);
3097 if (I && isOpcodeOrAlt(I))
3098 return Op;
3099 return MainOp;
3100 }
3101
3102 void setOperations(const InstructionsState &S) {
3103 MainOp = S.MainOp;
3104 AltOp = S.AltOp;
3105 }
3106
3107 Instruction *getMainOp() const {
3108 return MainOp;
3109 }
3110
3111 Instruction *getAltOp() const {
3112 return AltOp;
3113 }
3114
3115 /// The main/alternate opcodes for the list of instructions.
3116 unsigned getOpcode() const {
3117 return MainOp ? MainOp->getOpcode() : 0;
3118 }
3119
3120 unsigned getAltOpcode() const {
3121 return AltOp ? AltOp->getOpcode() : 0;
3122 }
3123
3124 /// When ReorderIndices/ReuseShuffleIndices are empty it just returns the
3125 /// position of \p V within Scalars. Otherwise, remaps it via those indices.
3126 int findLaneForValue(Value *V) const {
3127 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3128 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3129 if (!ReorderIndices.empty())
3130 FoundLane = ReorderIndices[FoundLane];
3131 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3132 if (!ReuseShuffleIndices.empty()) {
3133 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3134 find(ReuseShuffleIndices, FoundLane));
3135 }
3136 return FoundLane;
3137 }
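// Illustrative walk-through (hypothetical values, not from the original
// source): with Scalars = {A, B, C, D}, V == B sits at position 1; with
// ReorderIndices = {2, 0, 3, 1} the lane becomes ReorderIndices[1] == 0; with
// ReuseShuffleIndices = {3, 2, 0, 1, 0, 1, 2, 3} the first occurrence of 0 is
// at position 2, so findLaneForValue(B) returns 2.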
3138
3139 /// Build a shuffle mask for graph entry which represents a merge of main
3140 /// and alternate operations.
3141 void
3142 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3143 SmallVectorImpl<int> &Mask,
3144 SmallVectorImpl<Value *> *OpScalars = nullptr,
3145 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3146
3147 /// Return true if this is a non-power-of-2 node.
3148 bool isNonPowOf2Vec() const {
3149 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3150 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3151 "Reshuffling not supported with non-power-of-2 vectors yet.");
3152 return IsNonPowerOf2;
3153 }
3154
3155#ifndef NDEBUG
3156 /// Debug printer.
3157 LLVM_DUMP_METHOD void dump() const {
3158 dbgs() << Idx << ".\n";
3159 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3160 dbgs() << "Operand " << OpI << ":\n";
3161 for (const Value *V : Operands[OpI])
3162 dbgs().indent(2) << *V << "\n";
3163 }
3164 dbgs() << "Scalars: \n";
3165 for (Value *V : Scalars)
3166 dbgs().indent(2) << *V << "\n";
3167 dbgs() << "State: ";
3168 switch (State) {
3169 case Vectorize:
3170 dbgs() << "Vectorize\n";
3171 break;
3172 case ScatterVectorize:
3173 dbgs() << "ScatterVectorize\n";
3174 break;
3175 case StridedVectorize:
3176 dbgs() << "StridedVectorize\n";
3177 break;
3178 case NeedToGather:
3179 dbgs() << "NeedToGather\n";
3180 break;
3181 case CombinedVectorize:
3182 dbgs() << "CombinedVectorize\n";
3183 break;
3184 }
3185 dbgs() << "MainOp: ";
3186 if (MainOp)
3187 dbgs() << *MainOp << "\n";
3188 else
3189 dbgs() << "NULL\n";
3190 dbgs() << "AltOp: ";
3191 if (AltOp)
3192 dbgs() << *AltOp << "\n";
3193 else
3194 dbgs() << "NULL\n";
3195 dbgs() << "VectorizedValue: ";
3196 if (VectorizedValue)
3197 dbgs() << *VectorizedValue << "\n";
3198 else
3199 dbgs() << "NULL\n";
3200 dbgs() << "ReuseShuffleIndices: ";
3201 if (ReuseShuffleIndices.empty())
3202 dbgs() << "Empty";
3203 else
3204 for (int ReuseIdx : ReuseShuffleIndices)
3205 dbgs() << ReuseIdx << ", ";
3206 dbgs() << "\n";
3207 dbgs() << "ReorderIndices: ";
3208 for (unsigned ReorderIdx : ReorderIndices)
3209 dbgs() << ReorderIdx << ", ";
3210 dbgs() << "\n";
3211 dbgs() << "UserTreeIndices: ";
3212 for (const auto &EInfo : UserTreeIndices)
3213 dbgs() << EInfo << ", ";
3214 dbgs() << "\n";
3215 }
3216#endif
3217 };
3218
3219#ifndef NDEBUG
3220 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3221 InstructionCost VecCost, InstructionCost ScalarCost,
3222 StringRef Banner) const {
3223 dbgs() << "SLP: " << Banner << ":\n";
3224 E->dump();
3225 dbgs() << "SLP: Costs:\n";
3226 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3227 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3228 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3229 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3230 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3231 }
3232#endif
3233
3234 /// Create a new VectorizableTree entry.
3235 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3236 std::optional<ScheduleData *> Bundle,
3237 const InstructionsState &S,
3238 const EdgeInfo &UserTreeIdx,
3239 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3240 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3241 TreeEntry::EntryState EntryState =
3242 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3243 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3244 ReuseShuffleIndices, ReorderIndices);
3245 }
3246
3247 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3248 TreeEntry::EntryState EntryState,
3249 std::optional<ScheduleData *> Bundle,
3250 const InstructionsState &S,
3251 const EdgeInfo &UserTreeIdx,
3252 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3253 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3254 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3255 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3256 "Need to vectorize gather entry?");
3257 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3258 TreeEntry *Last = VectorizableTree.back().get();
3259 Last->Idx = VectorizableTree.size() - 1;
3260 Last->State = EntryState;
3261 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3262 ReuseShuffleIndices.end());
3263 if (ReorderIndices.empty()) {
3264 Last->Scalars.assign(VL.begin(), VL.end());
3265 Last->setOperations(S);
3266 } else {
3267 // Reorder scalars and build final mask.
3268 Last->Scalars.assign(VL.size(), nullptr);
3269 transform(ReorderIndices, Last->Scalars.begin(),
3270 [VL](unsigned Idx) -> Value * {
3271 if (Idx >= VL.size())
3272 return UndefValue::get(VL.front()->getType());
3273 return VL[Idx];
3274 });
3275 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3276 Last->setOperations(S);
3277 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3278 }
3279 if (!Last->isGather()) {
3280 for (Value *V : VL) {
3281 const TreeEntry *TE = getTreeEntry(V);
3282 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3283 "Scalar already in tree!");
3284 if (TE) {
3285 if (TE != Last)
3286 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3287 continue;
3288 }
3289 ScalarToTreeEntry[V] = Last;
3290 }
3291 // Update the scheduler bundle to point to this TreeEntry.
3292 ScheduleData *BundleMember = *Bundle;
3293 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3294 isVectorLikeInstWithConstOps(S.MainOp) ||
3295 doesNotNeedToSchedule(VL)) &&
3296 "Bundle and VL out of sync");
3297 if (BundleMember) {
3298 for (Value *V : VL) {
3299 if (doesNotNeedToBeScheduled(V))
3300 continue;
3301 if (!BundleMember)
3302 continue;
3303 BundleMember->TE = Last;
3304 BundleMember = BundleMember->NextInBundle;
3305 }
3306 }
3307 assert(!BundleMember && "Bundle and VL out of sync");
3308 } else {
3309 // Build a map for gathered scalars to the nodes where they are used.
3310 bool AllConstsOrCasts = true;
3311 for (Value *V : VL)
3312 if (!isConstant(V)) {
3313 auto *I = dyn_cast<CastInst>(V);
3314 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3315 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3316 }
3317 if (AllConstsOrCasts)
3318 CastMaxMinBWSizes =
3319 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3320 MustGather.insert(VL.begin(), VL.end());
3321 }
3322
3323 if (UserTreeIdx.UserTE) {
3324 Last->UserTreeIndices.push_back(UserTreeIdx);
3325 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3326 "Reordering isn't implemented for non-power-of-2 nodes yet");
3327 }
3328 return Last;
3329 }
3330
3331 /// -- Vectorization State --
3332 /// Holds all of the tree entries.
3333 TreeEntry::VecTreeTy VectorizableTree;
3334
3335#ifndef NDEBUG
3336 /// Debug printer.
3337 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3338 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3339 VectorizableTree[Id]->dump();
3340 dbgs() << "\n";
3341 }
3342 }
3343#endif
3344
3345 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3346
3347 const TreeEntry *getTreeEntry(Value *V) const {
3348 return ScalarToTreeEntry.lookup(V);
3349 }
3350
3351 /// Check that the operand node of an alternate node does not generate a
3352 /// buildvector sequence. If it does, it is probably not worth building an
3353 /// alternate shuffle, since the number of buildvector operands plus the
3354 /// alternate instruction would exceed the number of buildvector instructions.
3355 /// \param S the instructions state of the analyzed values.
3356 /// \param VL list of the instructions with alternate opcodes.
3357 bool areAltOperandsProfitable(const InstructionsState &S,
3358 ArrayRef<Value *> VL) const;
3359
3360 /// Checks if the specified list of the instructions/values can be vectorized
3361 /// and fills required data before actual scheduling of the instructions.
3362 TreeEntry::EntryState getScalarsVectorizationState(
3363 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3364 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3365
3366 /// Maps a specific scalar to its tree entry.
3367 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3368
3369 /// List of scalars, used in several vectorize nodes, and the list of the
3370 /// nodes.
3371 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3372
3373 /// Maps a value to the proposed vectorizable size.
3374 SmallDenseMap<Value *, unsigned> InstrElementSize;
3375
3376 /// A list of scalars that we found that we need to keep as scalars.
3377 ValueSet MustGather;
3378
3379 /// A set of first non-schedulable values.
3380 ValueSet NonScheduledFirst;
3381
3382 /// A map between the vectorized entries and the last instructions in the
3383 /// bundles. The bundles are built in use order, not in the def order of the
3384 /// instructions. So, during the vectorization process we cannot rely
3385 /// directly on the last instruction in the bundle being the last instruction
3386 /// in program order, since the basic blocks are modified; these instructions
3387 /// need to be pre-gathered beforehand.
3388 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3389
3390 /// List of gather nodes that depend on other gather/vector nodes and should
3391 /// be emitted after the vector instruction emission process to correctly
3392 /// handle the order of the vector instructions and shuffles.
3393 SetVector<const TreeEntry *> PostponedGathers;
3394
3395 using ValueToGatherNodesMap =
3396 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3397 ValueToGatherNodesMap ValueToGatherNodes;
3398
3399 /// This POD struct describes one external user in the vectorized tree.
3400 struct ExternalUser {
3401 ExternalUser(Value *S, llvm::User *U, int L)
3402 : Scalar(S), User(U), Lane(L) {}
3403
3404 // Which scalar in our function.
3405 Value *Scalar;
3406
3407 // The user that uses the scalar.
3408 llvm::User *User;
3409
3410 // Which lane does the scalar belong to.
3411 int Lane;
3412 };
3413 using UserList = SmallVector<ExternalUser, 16>;
3414
3415 /// Checks if two instructions may access the same memory.
3416 ///
3417 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3418 /// is invariant in the calling loop.
3419 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3420 Instruction *Inst2) {
3421 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3422 return true;
3423 // First check if the result is already in the cache.
3424 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3425 auto It = AliasCache.find(Key);
3426 if (It != AliasCache.end())
3427 return It->second;
3428 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3429 // Store the result in the cache.
3430 AliasCache.try_emplace(Key, Aliased);
3431 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3432 return Aliased;
3433 }
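// For example (hypothetical accesses, not from the original source): a
// volatile load paired with any store conservatively reports aliasing; for
// simple accesses the BatchAA result is cached symmetrically for both
// (Inst1, Inst2) and (Inst2, Inst1), so the reverse query is free.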
3434
3435 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3436
3437 /// Cache for alias results.
3438 /// TODO: consider moving this to the AliasAnalysis itself.
3439 DenseMap<AliasCacheKey, bool> AliasCache;
3440
3441 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3442 // globally through SLP because we don't perform any action which
3443 // invalidates capture results.
3444 BatchAAResults BatchAA;
3445
3446 /// Temporary store for deleted instructions. Instructions will be deleted
3447 /// eventually when the BoUpSLP is destructed. The deferral is required to
3448 /// ensure that there are no incorrect collisions in the AliasCache, which
3449 /// can happen if a new instruction is allocated at the same address as a
3450 /// previously deleted instruction.
3451 DenseSet<Instruction *> DeletedInstructions;
3452
3453 /// Set of the instruction, being analyzed already for reductions.
3454 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3455
3456 /// Set of hashes for the list of reduction values already being analyzed.
3457 DenseSet<size_t> AnalyzedReductionVals;
3458
3459 /// Values that have already been analyzed for minimal bitwidth and found
3460 /// to be non-profitable.
3461 DenseSet<Value *> AnalyzedMinBWVals;
3462
3463 /// A list of values that need to be extracted out of the tree.
3464 /// This list holds pairs of (Internal Scalar : External User). External User
3465 /// can be nullptr, it means that this Internal Scalar will be used later,
3466 /// after vectorization.
3467 UserList ExternalUses;
3468
3469 /// A list of GEPs which can be replaced by scalar GEPs instead of
3470 /// extractelement instructions.
3471 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3472
3473 /// Values used only by @llvm.assume calls.
3474 SmallPtrSet<const Value *, 32> EphValues;
3475
3476 /// Holds all of the instructions that we gathered, shuffle instructions and
3477 /// extractelements.
3478 SetVector<Instruction *> GatherShuffleExtractSeq;
3479
3480 /// A list of blocks that we are going to CSE.
3481 DenseSet<BasicBlock *> CSEBlocks;
3482
3483 /// Contains all scheduling relevant data for an instruction.
3484 /// A ScheduleData either represents a single instruction or a member of an
3485 /// instruction bundle (= a group of instructions which is combined into a
3486 /// vector instruction).
3487 struct ScheduleData {
3488 // The initial value for the dependency counters. It means that the
3489 // dependencies are not calculated yet.
3490 enum { InvalidDeps = -1 };
3491
3492 ScheduleData() = default;
3493
3494 void init(int BlockSchedulingRegionID, Value *OpVal) {
3495 FirstInBundle = this;
3496 NextInBundle = nullptr;
3497 NextLoadStore = nullptr;
3498 IsScheduled = false;
3499 SchedulingRegionID = BlockSchedulingRegionID;
3500 clearDependencies();
3501 OpValue = OpVal;
3502 TE = nullptr;
3503 }
3504
3505 /// Verify basic self consistency properties
3506 void verify() {
3507 if (hasValidDependencies()) {
3508 assert(UnscheduledDeps <= Dependencies && "invariant");
3509 } else {
3510 assert(UnscheduledDeps == Dependencies && "invariant");
3511 }
3512
3513 if (IsScheduled) {
3514 assert(isSchedulingEntity() &&
3515 "unexpected scheduled state");
3516 for (const ScheduleData *BundleMember = this; BundleMember;
3517 BundleMember = BundleMember->NextInBundle) {
3518 assert(BundleMember->hasValidDependencies() &&
3519 BundleMember->UnscheduledDeps == 0 &&
3520 "unexpected scheduled state");
3521 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3522 "only bundle is marked scheduled");
3523 }
3524 }
3525
3526 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3527 "all bundle members must be in same basic block");
3528 }
3529
3530 /// Returns true if the dependency information has been calculated.
3531 /// Note that dependency validity can vary between instructions within
3532 /// a single bundle.
3533 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3534
3535 /// Returns true for single instructions and for bundle representatives
3536 /// (= the head of a bundle).
3537 bool isSchedulingEntity() const { return FirstInBundle == this; }
3538
3539 /// Returns true if it represents an instruction bundle and not only a
3540 /// single instruction.
3541 bool isPartOfBundle() const {
3542 return NextInBundle != nullptr || FirstInBundle != this || TE;
3543 }
3544
3545 /// Returns true if it is ready for scheduling, i.e. it has no more
3546 /// unscheduled depending instructions/bundles.
3547 bool isReady() const {
3548 assert(isSchedulingEntity() &&
3549 "can't consider non-scheduling entity for ready list");
3550 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3551 }
3552
3553 /// Modifies the number of unscheduled dependencies for this instruction,
3554 /// and returns the number of remaining dependencies for the containing
3555 /// bundle.
3556 int incrementUnscheduledDeps(int Incr) {
3557 assert(hasValidDependencies() &&
3558 "increment of unscheduled deps would be meaningless");
3559 UnscheduledDeps += Incr;
3560 return FirstInBundle->unscheduledDepsInBundle();
3561 }
3562
3563 /// Sets the number of unscheduled dependencies to the number of
3564 /// dependencies.
3565 void resetUnscheduledDeps() {
3566 UnscheduledDeps = Dependencies;
3567 }
3568
3569 /// Clears all dependency information.
3570 void clearDependencies() {
3571 Dependencies = InvalidDeps;
3572 resetUnscheduledDeps();
3573 MemoryDependencies.clear();
3574 ControlDependencies.clear();
3575 }
3576
3577 int unscheduledDepsInBundle() const {
3578 assert(isSchedulingEntity() && "only meaningful on the bundle");
3579 int Sum = 0;
3580 for (const ScheduleData *BundleMember = this; BundleMember;
3581 BundleMember = BundleMember->NextInBundle) {
3582 if (BundleMember->UnscheduledDeps == InvalidDeps)
3583 return InvalidDeps;
3584 Sum += BundleMember->UnscheduledDeps;
3585 }
3586 return Sum;
3587 }
3588
3589 void dump(raw_ostream &os) const {
3590 if (!isSchedulingEntity()) {
3591 os << "/ " << *Inst;
3592 } else if (NextInBundle) {
3593 os << '[' << *Inst;
3594 ScheduleData *SD = NextInBundle;
3595 while (SD) {
3596 os << ';' << *SD->Inst;
3597 SD = SD->NextInBundle;
3598 }
3599 os << ']';
3600 } else {
3601 os << *Inst;
3602 }
3603 }
3604
3605 Instruction *Inst = nullptr;
3606
3607 /// Opcode of the current instruction in the schedule data.
3608 Value *OpValue = nullptr;
3609
3610 /// The TreeEntry that this instruction corresponds to.
3611 TreeEntry *TE = nullptr;
3612
3613 /// Points to the head in an instruction bundle (and always to this for
3614 /// single instructions).
3615 ScheduleData *FirstInBundle = nullptr;
3616
3617 /// Single linked list of all instructions in a bundle. Null if it is a
3618 /// single instruction.
3619 ScheduleData *NextInBundle = nullptr;
3620
3621 /// Single linked list of all memory instructions (e.g. load, store, call)
3622 /// in the block - until the end of the scheduling region.
3623 ScheduleData *NextLoadStore = nullptr;
3624
3625 /// The dependent memory instructions.
3626 /// This list is derived on demand in calculateDependencies().
3627 SmallVector<ScheduleData *, 4> MemoryDependencies;
3628
3629 /// List of instructions which this instruction could be control dependent
3630 /// on. Allowing such nodes to be scheduled below this one could introduce
3631 /// a runtime fault which didn't exist in the original program.
3632 /// e.g., this is a load or udiv following a readonly call which infinitely loops.
3633 SmallVector<ScheduleData *, 4> ControlDependencies;
3634
3635 /// This ScheduleData is in the current scheduling region if this matches
3636 /// the current SchedulingRegionID of BlockScheduling.
3637 int SchedulingRegionID = 0;
3638
3639 /// Used for getting a "good" final ordering of instructions.
3640 int SchedulingPriority = 0;
3641
3642 /// The number of dependencies. Consists of the number of users of the
3643 /// instruction plus the number of dependent memory instructions (if any).
3644 /// This value is calculated on demand.
3645 /// If InvalidDeps, the number of dependencies is not calculated yet.
3646 int Dependencies = InvalidDeps;
3647
3648 /// The number of dependencies minus the number of dependencies of scheduled
3649 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3650 /// for scheduling.
3651 /// Note that this is negative as long as Dependencies is not calculated.
3652 int UnscheduledDeps = InvalidDeps;
3653
3654 /// True if this instruction is scheduled (or considered as scheduled in the
3655 /// dry-run).
3656 bool IsScheduled = false;
3657 };
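// Worked example (hypothetical, not from the original source): an instruction
// with two in-region users plus one memory dependency recorded by
// calculateDependencies() has Dependencies == 3; every time one of those
// dependent instructions is scheduled, UnscheduledDeps drops by one, and the
// containing bundle becomes ready once the count reaches 0 for all members.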
3658
3659#ifndef NDEBUG
3660 friend inline raw_ostream &operator<<(raw_ostream &os,
3661 const BoUpSLP::ScheduleData &SD) {
3662 SD.dump(os);
3663 return os;
3664 }
3665#endif
3666
3667 friend struct GraphTraits<BoUpSLP *>;
3668 friend struct DOTGraphTraits<BoUpSLP *>;
3669
3670 /// Contains all scheduling data for a basic block.
3671 /// It does not schedule instructions that are not memory read/write
3672 /// instructions and whose operands are either constants, or arguments, or
3673 /// phis, or instructions from other blocks, or whose users are phis or live
3674 /// in other blocks. The resulting vector instructions can be placed at the
3675 /// beginning of the basic block without scheduling (if the operands do not
3676 /// need to be scheduled) or at the end of the block (if the users are
3677 /// outside of the block). This saves some compile time and memory used by
3678 /// the compiler.
3679 /// ScheduleData is assigned for each instruction between the boundaries of
3680 /// the tree entry, even for those which are not part of the graph. It is
3681 /// required to correctly follow the dependencies between the instructions
3682 /// and to schedule them correctly. ScheduleData is not allocated for
3683 /// instructions which do not require scheduling, like phis, nodes with only
3684 /// extractelements/insertelements, or nodes whose instructions have
3685 /// uses/operands outside of the block.
3686 struct BlockScheduling {
3687 BlockScheduling(BasicBlock *BB)
3688 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3689
3690 void clear() {
3691 ReadyInsts.clear();
3692 ScheduleStart = nullptr;
3693 ScheduleEnd = nullptr;
3694 FirstLoadStoreInRegion = nullptr;
3695 LastLoadStoreInRegion = nullptr;
3696 RegionHasStackSave = false;
3697
3698 // Reduce the maximum schedule region size by the size of the
3699 // previous scheduling run.
3700 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3701 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3702 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3703 ScheduleRegionSize = 0;
3704
3705 // Make a new scheduling region, i.e. all existing ScheduleData is not
3706 // in the new region yet.
3707 ++SchedulingRegionID;
3708 }
3709
3710 ScheduleData *getScheduleData(Instruction *I) {
3711 if (BB != I->getParent())
3712 // Avoid lookup if can't possibly be in map.
3713 return nullptr;
3714 ScheduleData *SD = ScheduleDataMap.lookup(I);
3715 if (SD && isInSchedulingRegion(SD))
3716 return SD;
3717 return nullptr;
3718 }
3719
3720 ScheduleData *getScheduleData(Value *V) {
3721 if (auto *I = dyn_cast<Instruction>(V))
3722 return getScheduleData(I);
3723 return nullptr;
3724 }
3725
3726 ScheduleData *getScheduleData(Value *V, Value *Key) {
3727 if (V == Key)
3728 return getScheduleData(V);
3729 auto I = ExtraScheduleDataMap.find(V);
3730 if (I != ExtraScheduleDataMap.end()) {
3731 ScheduleData *SD = I->second.lookup(Key);
3732 if (SD && isInSchedulingRegion(SD))
3733 return SD;
3734 }
3735 return nullptr;
3736 }
3737
3738 bool isInSchedulingRegion(ScheduleData *SD) const {
3739 return SD->SchedulingRegionID == SchedulingRegionID;
3740 }
3741
3742 /// Marks an instruction as scheduled and puts all dependent ready
3743 /// instructions into the ready-list.
3744 template <typename ReadyListType>
3745 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3746 SD->IsScheduled = true;
3747 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3748
3749 for (ScheduleData *BundleMember = SD; BundleMember;
3750 BundleMember = BundleMember->NextInBundle) {
3751 if (BundleMember->Inst != BundleMember->OpValue)
3752 continue;
3753
3754 // Handle the def-use chain dependencies.
3755
3756 // Decrement the unscheduled counter and insert to ready list if ready.
3757 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3758 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3759 if (OpDef && OpDef->hasValidDependencies() &&
3760 OpDef->incrementUnscheduledDeps(-1) == 0) {
3761 // There are no more unscheduled dependencies after
3762 // decrementing, so we can put the dependent instruction
3763 // into the ready list.
3764 ScheduleData *DepBundle = OpDef->FirstInBundle;
3765 assert(!DepBundle->IsScheduled &&
3766 "already scheduled bundle gets ready");
3767 ReadyList.insert(DepBundle);
3768 LLVM_DEBUG(dbgs()
3769 << "SLP: gets ready (def): " << *DepBundle << "\n");
3770 }
3771 });
3772 };
3773
3774 // If BundleMember is a vector bundle, its operands may have been
3775 // reordered during buildTree(). We therefore need to get its operands
3776 // through the TreeEntry.
3777 if (TreeEntry *TE = BundleMember->TE) {
3778 // Need to search for the lane since the tree entry can be reordered.
3779 int Lane = std::distance(TE->Scalars.begin(),
3780 find(TE->Scalars, BundleMember->Inst));
3781 assert(Lane >= 0 && "Lane not set");
3782
3783 // Since vectorization tree is being built recursively this assertion
3784 // ensures that the tree entry has all operands set before reaching
3785 // this code. Couple of exceptions known at the moment are extracts
3786 // where their second (immediate) operand is not added. Since
3787 // immediates do not affect scheduler behavior this is considered
3788 // okay.
3789 auto *In = BundleMember->Inst;
3790 assert(
3791 In &&
3792 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3793 In->getNumOperands() == TE->getNumOperands()) &&
3794 "Missed TreeEntry operands?");
3795 (void)In; // fake use to avoid build failure when assertions disabled
3796
3797 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3798 OpIdx != NumOperands; ++OpIdx)
3799 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3800 DecrUnsched(I);
3801 } else {
3802 // If BundleMember is a stand-alone instruction, no operand reordering
3803 // has taken place, so we directly access its operands.
3804 for (Use &U : BundleMember->Inst->operands())
3805 if (auto *I = dyn_cast<Instruction>(U.get()))
3806 DecrUnsched(I);
3807 }
3808 // Handle the memory dependencies.
3809 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3810 if (MemoryDepSD->hasValidDependencies() &&
3811 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3812 // There are no more unscheduled dependencies after decrementing,
3813 // so we can put the dependent instruction into the ready list.
3814 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3815 assert(!DepBundle->IsScheduled &&
3816 "already scheduled bundle gets ready");
3817 ReadyList.insert(DepBundle);
3819 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3820 }
3821 }
3822 // Handle the control dependencies.
3823 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3824 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3825 // There are no more unscheduled dependencies after decrementing,
3826 // so we can put the dependent instruction into the ready list.
3827 ScheduleData *DepBundle = DepSD->FirstInBundle;
3828 assert(!DepBundle->IsScheduled &&
3829 "already scheduled bundle gets ready");
3830 ReadyList.insert(DepBundle);
3832 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3833 }
3834 }
3835 }
3836 }
3837
3838 /// Verify basic self consistency properties of the data structure.
3839 void verify() {
3840 if (!ScheduleStart)
3841 return;
3842
3843 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3844 ScheduleStart->comesBefore(ScheduleEnd) &&
3845 "Not a valid scheduling region?");
3846
3847 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3848 auto *SD = getScheduleData(I);
3849 if (!SD)
3850 continue;
3851 assert(isInSchedulingRegion(SD) &&
3852 "primary schedule data not in window?");
3853 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3854 "entire bundle in window!");
3855 (void)SD;
3856 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3857 }
3858
3859 for (auto *SD : ReadyInsts) {
3860 assert(SD->isSchedulingEntity() && SD->isReady() &&
3861 "item in ready list not ready?");
3862 (void)SD;
3863 }
3864 }
3865
3866 void doForAllOpcodes(Value *V,
3867 function_ref<void(ScheduleData *SD)> Action) {
3868 if (ScheduleData *SD = getScheduleData(V))
3869 Action(SD);
3870 auto I = ExtraScheduleDataMap.find(V);
3871 if (I != ExtraScheduleDataMap.end())
3872 for (auto &P : I->second)
3873 if (isInSchedulingRegion(P.second))
3874 Action(P.second);
3875 }
3876
3877 /// Put all instructions into the ReadyList which are ready for scheduling.
3878 template <typename ReadyListType>
3879 void initialFillReadyList(ReadyListType &ReadyList) {
3880 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3881 doForAllOpcodes(I, [&](ScheduleData *SD) {
3882 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3883 SD->isReady()) {
3884 ReadyList.insert(SD);
3885 LLVM_DEBUG(dbgs()
3886 << "SLP: initially in ready list: " << *SD << "\n");
3887 }
3888 });
3889 }
3890 }
3891
3892 /// Build a bundle from the ScheduleData nodes corresponding to the
3893 /// scalar instruction for each lane.
3894 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3895
3896 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3897 /// cyclic dependencies. This is only a dry-run, no instructions are
3898 /// actually moved at this stage.
3899 /// \returns the scheduling bundle. The returned Optional value is not
3900 /// std::nullopt if \p VL is allowed to be scheduled.
3901 std::optional<ScheduleData *>
3902 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3903 const InstructionsState &S);
3904
3905 /// Un-bundles a group of instructions.
3906 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3907
3908 /// Allocates schedule data chunk.
3909 ScheduleData *allocateScheduleDataChunks();
3910
3911 /// Extends the scheduling region so that V is inside the region.
3912 /// \returns true if the region size is within the limit.
3913 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3914
3915 /// Initialize the ScheduleData structures for new instructions in the
3916 /// scheduling region.
3917 void initScheduleData(Instruction *FromI, Instruction *ToI,
3918 ScheduleData *PrevLoadStore,
3919 ScheduleData *NextLoadStore);
3920
3921 /// Updates the dependency information of a bundle and of all instructions/
3922 /// bundles which depend on the original bundle.
3923 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3924 BoUpSLP *SLP);
3925
3926 /// Sets all instruction in the scheduling region to un-scheduled.
3927 void resetSchedule();
3928
3929 BasicBlock *BB;
3930
3931 /// Simple memory allocation for ScheduleData.
3932 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3933
3934 /// The size of a ScheduleData array in ScheduleDataChunks.
3935 int ChunkSize;
3936
3937 /// The allocator position in the current chunk, which is the last entry
3938 /// of ScheduleDataChunks.
3939 int ChunkPos;
3940
3941 /// Attaches ScheduleData to Instruction.
3942 /// Note that the mapping survives during all vectorization iterations, i.e.
3943 /// ScheduleData structures are recycled.
3944 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3945
3946 /// Attaches ScheduleData to Instruction with the leading key.
3947 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3948 ExtraScheduleDataMap;
3949
3950 /// The ready-list for scheduling (only used for the dry-run).
3951 SetVector<ScheduleData *> ReadyInsts;
3952
3953 /// The first instruction of the scheduling region.
3954 Instruction *ScheduleStart = nullptr;
3955
3956 /// The first instruction _after_ the scheduling region.
3957 Instruction *ScheduleEnd = nullptr;
3958
3959 /// The first memory accessing instruction in the scheduling region
3960 /// (can be null).
3961 ScheduleData *FirstLoadStoreInRegion = nullptr;
3962
3963 /// The last memory accessing instruction in the scheduling region
3964 /// (can be null).
3965 ScheduleData *LastLoadStoreInRegion = nullptr;
3966
3967 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3968 /// region? Used to optimize the dependence calculation for the
3969 /// common case where there isn't.
3970 bool RegionHasStackSave = false;
3971
3972 /// The current size of the scheduling region.
3973 int ScheduleRegionSize = 0;
3974
3975 /// The maximum size allowed for the scheduling region.
3976 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3977
3978 /// The ID of the scheduling region. For a new vectorization iteration this
3979 /// is incremented which "removes" all ScheduleData from the region.
3980 /// Make sure that the initial SchedulingRegionID is greater than the
3981 /// initial SchedulingRegionID in ScheduleData (which is 0).
3982 int SchedulingRegionID = 1;
3983 };
3984
3985 /// Attaches the BlockScheduling structures to basic blocks.
3986 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3987
3988 /// Performs the "real" scheduling. Done before vectorization is actually
3989 /// performed in a basic block.
3990 void scheduleBlock(BlockScheduling *BS);
3991
3992 /// List of users to ignore during scheduling and that don't need extracting.
3993 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3994
3995 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3996 /// sorted SmallVectors of unsigned.
3997 struct OrdersTypeDenseMapInfo {
3998 static OrdersType getEmptyKey() {
3999 OrdersType V;
4000 V.push_back(~1U);
4001 return V;
4002 }
4003
4004 static OrdersType getTombstoneKey() {
4005 OrdersType V;
4006 V.push_back(~2U);
4007 return V;
4008 }
4009
4010 static unsigned getHashValue(const OrdersType &V) {
4011 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4012 }
4013
4014 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4015 return LHS == RHS;
4016 }
4017 };
4018
4019 // Analysis and block reference.
4020 Function *F;
4021 ScalarEvolution *SE;
4022 TargetTransformInfo *TTI;
4023 TargetLibraryInfo *TLI;
4024 LoopInfo *LI;
4025 DominatorTree *DT;
4026 AssumptionCache *AC;
4027 DemandedBits *DB;
4028 const DataLayout *DL;
4029 OptimizationRemarkEmitter *ORE;
4030
4031 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4032 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4033
4034 /// Instruction builder to construct the vectorized tree.
4035 IRBuilder<TargetFolder> Builder;
4036
4037 /// A map of scalar integer values to the smallest bit width with which they
4038 /// can legally be represented. The values map to (width, signed) pairs,
4039 /// where "width" indicates the minimum bit width and "signed" is True if the
4040 /// value must be signed-extended, rather than zero-extended, back to its
4041 /// original width.
4042 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4043
4044 /// Final size of the reduced vector, if the current graph represents the
4045 /// input for the reduction and it was possible to narrow the size of the
4046 /// reduction.
4047 unsigned ReductionBitWidth = 0;
4048
4049 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4050 /// type sizes, used in the tree.
4051 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4052
4053 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
4054 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4055 DenseSet<unsigned> ExtraBitWidthNodes;
4056};
4057
4058} // end namespace slpvectorizer
4059
4060template <> struct GraphTraits<BoUpSLP *> {
4061 using TreeEntry = BoUpSLP::TreeEntry;
4062
4063 /// NodeRef has to be a pointer per the GraphWriter.
4064 using NodeRef = TreeEntry *;
4065
4066 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4067
4068 /// Add the VectorizableTree to the index iterator to be able to return
4069 /// TreeEntry pointers.
4070 struct ChildIteratorType
4071 : public iterator_adaptor_base<
4072 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4073 ContainerTy &VectorizableTree;
4074
4075 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4076 ContainerTy &VT)
4077 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4078
4079 NodeRef operator*() { return I->UserTE; }
4080 };
4081
4082 static NodeRef getEntryNode(BoUpSLP &R) {
4083 return R.VectorizableTree[0].get();
4084 }
4085
4086 static ChildIteratorType child_begin(NodeRef N) {
4087 return {N->UserTreeIndices.begin(), N->Container};
4088 }
4089
4090 static ChildIteratorType child_end(NodeRef N) {
4091 return {N->UserTreeIndices.end(), N->Container};
4092 }
4093
4094 /// For the node iterator we just need to turn the TreeEntry iterator into a
4095 /// TreeEntry* iterator so that it dereferences to NodeRef.
4096 class nodes_iterator {
4097 using ItTy = ContainerTy::iterator;
4098 ItTy It;
4099
4100 public:
4101 nodes_iterator(const ItTy &It2) : It(It2) {}
4102 NodeRef operator*() { return It->get(); }
4103 nodes_iterator operator++() {
4104 ++It;
4105 return *this;
4106 }
4107 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4108 };
4109
4110 static nodes_iterator nodes_begin(BoUpSLP *R) {
4111 return nodes_iterator(R->VectorizableTree.begin());
4112 }
4113
4114 static nodes_iterator nodes_end(BoUpSLP *R) {
4115 return nodes_iterator(R->VectorizableTree.end());
4116 }
4117
4118 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4119};
4120
4121template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4122 using TreeEntry = BoUpSLP::TreeEntry;
4123
4124 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4125
4126 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4127 std::string Str;
4128 raw_string_ostream OS(Str);
4129 OS << Entry->Idx << ".\n";
4130 if (isSplat(Entry->Scalars))
4131 OS << "<splat> ";
4132 for (auto *V : Entry->Scalars) {
4133 OS << *V;
4134 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4135 return EU.Scalar == V;
4136 }))
4137 OS << " <extract>";
4138 OS << "\n";
4139 }
4140 return Str;
4141 }
4142
4143 static std::string getNodeAttributes(const TreeEntry *Entry,
4144 const BoUpSLP *) {
4145 if (Entry->isGather())
4146 return "color=red";
4147 if (Entry->State == TreeEntry::ScatterVectorize ||
4148 Entry->State == TreeEntry::StridedVectorize)
4149 return "color=blue";
4150 return "";
4151 }
4152};
4153
4154} // end namespace llvm
4155
4156 BoUpSLP::~BoUpSLP() {
4157 SmallVector<WeakTrackingVH> DeadInsts;
4158 for (auto *I : DeletedInstructions) {
4159 if (!I->getParent()) {
4160 // Temporarily insert instructions back so they can be erased from their
4161 // parent and from memory later.
4162 if (isa<PHINode>(I))
4163 // Phi nodes must be the very first instructions in the block.
4164 I->insertBefore(F->getEntryBlock(),
4165 F->getEntryBlock().getFirstNonPHIIt());
4166 else
4167 I->insertBefore(F->getEntryBlock().getTerminator());
4168 continue;
4169 }
4170 for (Use &U : I->operands()) {
4171 auto *Op = dyn_cast<Instruction>(U.get());
4172 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4173 wouldInstructionBeTriviallyDead(Op, TLI))
4174 DeadInsts.emplace_back(Op);
4175 }
4176 I->dropAllReferences();
4177 }
4178 for (auto *I : DeletedInstructions) {
4179 assert(I->use_empty() &&
4180 "trying to erase instruction with users.");
4181 I->eraseFromParent();
4182 }
4183
4184 // Cleanup any dead scalar code feeding the vectorized instructions
4185 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4186
4187#ifdef EXPENSIVE_CHECKS
4188 // If we could guarantee that this call is not extremely slow, we could
4189 // remove the ifdef limitation (see PR47712).
4190 assert(!verifyFunction(*F, &dbgs()));
4191#endif
4192}
4193
4194/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4195 /// contains the original mask for the scalars reused in the node. The
4196 /// procedure transforms this mask in accordance with the given \p Mask.
4197 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4198 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4199 "Expected non-empty mask.");
4200 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4201 Prev.swap(Reuses);
4202 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4203 if (Mask[I] != PoisonMaskElem)
4204 Reuses[Mask[I]] = Prev[I];
4205}
4206
4207 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4208 /// the original order of the scalars. The procedure transforms the provided
4209 /// order in accordance with the given \p Mask. If the resulting \p Order is
4210 /// just an identity order, \p Order is cleared.
4211 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4212 bool BottomOrder = false) {
4213 assert(!Mask.empty() && "Expected non-empty mask.");
4214 unsigned Sz = Mask.size();
4215 if (BottomOrder) {
4216 SmallVector<unsigned> PrevOrder;
4217 if (Order.empty()) {
4218 PrevOrder.resize(Sz);
4219 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4220 } else {
4221 PrevOrder.swap(Order);
4222 }
4223 Order.assign(Sz, Sz);
4224 for (unsigned I = 0; I < Sz; ++I)
4225 if (Mask[I] != PoisonMaskElem)
4226 Order[I] = PrevOrder[Mask[I]];
4227 if (all_of(enumerate(Order), [&](const auto &Data) {
4228 return Data.value() == Sz || Data.index() == Data.value();
4229 })) {
4230 Order.clear();
4231 return;
4232 }
4233 fixupOrderingIndices(Order);
4234 return;
4235 }
4236 SmallVector<int> MaskOrder;
4237 if (Order.empty()) {
4238 MaskOrder.resize(Sz);
4239 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4240 } else {
4241 inversePermutation(Order, MaskOrder);
4242 }
4243 reorderReuses(MaskOrder, Mask);
4244 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4245 Order.clear();
4246 return;
4247 }
4248 Order.assign(Sz, Sz);
4249 for (unsigned I = 0; I < Sz; ++I)
4250 if (MaskOrder[I] != PoisonMaskElem)
4251 Order[MaskOrder[I]] = I;
4252 fixupOrderingIndices(Order);
4253}
4254
4255std::optional<BoUpSLP::OrdersType>
4256BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4257 assert(TE.isGather() && "Expected gather node only.");
4258 // Try to find subvector extract/insert patterns and reorder only such
4259 // patterns.
4260 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4261 Type *ScalarTy = GatheredScalars.front()->getType();
4262 int NumScalars = GatheredScalars.size();
4263 if (!isValidElementType(ScalarTy))
4264 return std::nullopt;
4265 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4266 int NumParts = TTI->getNumberOfParts(VecTy);
4267 if (NumParts == 0 || NumParts >= NumScalars)
4268 NumParts = 1;
4269 SmallVector<int> ExtractMask;
4270 SmallVector<int> Mask;
4271 SmallVector<SmallVector<const TreeEntry *>> Entries;
4272 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4273 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4274 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4275 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4276 /*ForOrder=*/true);
4277 // No shuffled operands - ignore.
4278 if (GatherShuffles.empty() && ExtractShuffles.empty())
4279 return std::nullopt;
4280 OrdersType CurrentOrder(NumScalars, NumScalars);
4281 if (GatherShuffles.size() == 1 &&
4282 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4283 Entries.front().front()->isSame(TE.Scalars)) {
4284 // Perfect match in the graph, will reuse the previously vectorized
4285 // node. Cost is 0.
4286 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4287 return CurrentOrder;
4288 }
4289 auto IsSplatMask = [](ArrayRef<int> Mask) {
4290 int SingleElt = PoisonMaskElem;
4291 return all_of(Mask, [&](int I) {
4292 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4293 SingleElt = I;
4294 return I == PoisonMaskElem || I == SingleElt;
4295 });
4296 };
4297 // Exclusive broadcast mask - ignore.
4298 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4299 (Entries.size() != 1 ||
4300 Entries.front().front()->ReorderIndices.empty())) ||
4301 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4302 return std::nullopt;
4303 SmallBitVector ShuffledSubMasks(NumParts);
4304 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4305 ArrayRef<int> Mask, int PartSz, int NumParts,
4306 function_ref<unsigned(unsigned)> GetVF) {
4307 for (int I : seq<int>(0, NumParts)) {
4308 if (ShuffledSubMasks.test(I))
4309 continue;
4310 const int VF = GetVF(I);
4311 if (VF == 0)
4312 continue;
4313 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4314 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4315 // Shuffle of at least 2 vectors - ignore.
4316 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4317 std::fill(Slice.begin(), Slice.end(), NumScalars);
4318 ShuffledSubMasks.set(I);
4319 continue;
4320 }
4321 // Try to include as many elements from the mask as possible.
4322 int FirstMin = INT_MAX;
4323 bool SecondVecFound = false;
4324 for (int K : seq<int>(Limit)) {
4325 int Idx = Mask[I * PartSz + K];
4326 if (Idx == PoisonMaskElem) {
4327 Value *V = GatheredScalars[I * PartSz + K];
4328 if (isConstant(V) && !isa<PoisonValue>(V)) {
4329 SecondVecFound = true;
4330 break;
4331 }
4332 continue;
4333 }
4334 if (Idx < VF) {
4335 if (FirstMin > Idx)
4336 FirstMin = Idx;
4337 } else {
4338 SecondVecFound = true;
4339 break;
4340 }
4341 }
4342 FirstMin = (FirstMin / PartSz) * PartSz;
4343 // Shuffle of at least 2 vectors - ignore.
4344 if (SecondVecFound) {
4345 std::fill(Slice.begin(), Slice.end(), NumScalars);
4346 ShuffledSubMasks.set(I);
4347 continue;
4348 }
4349 for (int K : seq<int>(Limit)) {
4350 int Idx = Mask[I * PartSz + K];
4351 if (Idx == PoisonMaskElem)
4352 continue;
4353 Idx -= FirstMin;
4354 if (Idx >= PartSz) {
4355 SecondVecFound = true;
4356 break;
4357 }
4358 if (CurrentOrder[I * PartSz + Idx] >
4359 static_cast<unsigned>(I * PartSz + K) &&
4360 CurrentOrder[I * PartSz + Idx] !=
4361 static_cast<unsigned>(I * PartSz + Idx))
4362 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4363 }
4364 // Shuffle of at least 2 vectors - ignore.
4365 if (SecondVecFound) {
4366 std::fill(Slice.begin(), Slice.end(), NumScalars);
4367 ShuffledSubMasks.set(I);
4368 continue;
4369 }
4370 }
4371 };
4372 int PartSz = getPartNumElems(NumScalars, NumParts);
4373 if (!ExtractShuffles.empty())
4374 TransformMaskToOrder(
4375 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4376 if (!ExtractShuffles[I])
4377 return 0U;
4378 unsigned VF = 0;
4379 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4380 for (unsigned Idx : seq<unsigned>(Sz)) {
4381 int K = I * PartSz + Idx;
4382 if (ExtractMask[K] == PoisonMaskElem)
4383 continue;
4384 if (!TE.ReuseShuffleIndices.empty())
4385 K = TE.ReuseShuffleIndices[K];
4386 if (!TE.ReorderIndices.empty())
4387 K = std::distance(TE.ReorderIndices.begin(),
4388 find(TE.ReorderIndices, K));
4389 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4390 if (!EI)
4391 continue;
4392 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4393 ->getElementCount()
4394 .getKnownMinValue());
4395 }
4396 return VF;
4397 });
4398 // Check special corner case - single shuffle of the same entry.
4399 if (GatherShuffles.size() == 1 && NumParts != 1) {
4400 if (ShuffledSubMasks.any())
4401 return std::nullopt;
4402 PartSz = NumScalars;
4403 NumParts = 1;
4404 }
4405 if (!Entries.empty())
4406 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4407 if (!GatherShuffles[I])
4408 return 0U;
4409 return std::max(Entries[I].front()->getVectorFactor(),
4410 Entries[I].back()->getVectorFactor());
4411 });
4412 int NumUndefs =
4413 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4414 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4415 return std::nullopt;
4416 return std::move(CurrentOrder);
4417}
4418
4419static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4420 const TargetLibraryInfo &TLI,
4421 bool CompareOpcodes = true) {
4422 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4423 return false;
4424 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4425 if (!GEP1)
4426 return false;
4427 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4428 if (!GEP2)
4429 return false;
4430 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4431 ((isConstant(GEP1->getOperand(1)) &&
4432 isConstant(GEP2->getOperand(1))) ||
4433 !CompareOpcodes ||
4434 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4435 .getOpcode());
4436}
4437
4438/// Calculates minimal alignment as a common alignment.
4439template <typename T>
4440 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4441 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4442 for (Value *V : VL.drop_front())
4443 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4444 return CommonAlignment;
4445}
4446
4447/// Check if \p Order represents reverse order.
4448 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4449 unsigned Sz = Order.size();
4450 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4451 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4452 });
4453}
4454
4455 /// Checks if the provided list of pointers \p PointerOps represents strided
4456 /// pointers for type \p ElemTy. If they are not, std::nullopt is returned.
4457 /// Otherwise, if \p Inst is not specified, a just-initialized optional value
4458 /// is returned to show that the pointers represent strided pointers. If
4459 /// \p Inst is specified, the runtime stride is materialized before the given
4460 /// \p Inst.
4461 /// \returns std::nullopt if the pointers do not have a runtime stride; nullptr or the actual stride value otherwise.
4462static std::optional<Value *>
4463 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4464 const DataLayout &DL, ScalarEvolution &SE,
4465 SmallVectorImpl<unsigned> &SortedIndices,
4466 Instruction *Inst = nullptr) {
4467 SmallVector<const SCEV *> SCEVs;
4468 const SCEV *PtrSCEVLowest = nullptr;
4469 const SCEV *PtrSCEVHighest = nullptr;
4470 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4471 // addresses).
4472 for (Value *Ptr : PointerOps) {
4473 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4474 if (!PtrSCEV)
4475 return std::nullopt;
4476 SCEVs.push_back(PtrSCEV);
4477 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4478 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4479 continue;
4480 }
4481 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4482 if (isa<SCEVCouldNotCompute>(Diff))
4483 return std::nullopt;
4484 if (Diff->isNonConstantNegative()) {
4485 PtrSCEVLowest = PtrSCEV;
4486 continue;
4487 }
4488 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4489 if (isa<SCEVCouldNotCompute>(Diff1))
4490 return std::nullopt;
4491 if (Diff1->isNonConstantNegative()) {
4492 PtrSCEVHighest = PtrSCEV;
4493 continue;
4494 }
4495 }
4496 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4497 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4498 if (isa<SCEVCouldNotCompute>(Dist))
4499 return std::nullopt;
4500 int Size = DL.getTypeStoreSize(ElemTy);
4501 auto TryGetStride = [&](const SCEV *Dist,
4502 const SCEV *Multiplier) -> const SCEV * {
4503 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4504 if (M->getOperand(0) == Multiplier)
4505 return M->getOperand(1);
4506 if (M->getOperand(1) == Multiplier)
4507 return M->getOperand(0);
4508 return nullptr;
4509 }
4510 if (Multiplier == Dist)
4511 return SE.getConstant(Dist->getType(), 1);
4512 return SE.getUDivExactExpr(Dist, Multiplier);
4513 };
4514 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4515 const SCEV *Stride = nullptr;
4516 if (Size != 1 || SCEVs.size() > 2) {
4517 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4518 Stride = TryGetStride(Dist, Sz);
4519 if (!Stride)
4520 return std::nullopt;
4521 }
4522 if (!Stride || isa<SCEVConstant>(Stride))
4523 return std::nullopt;
4524 // Iterate through all pointers and check if all distances are
4525 // unique multiples of Stride.
4526 using DistOrdPair = std::pair<int64_t, int>;
4527 auto Compare = llvm::less_first();
4528 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4529 int Cnt = 0;
4530 bool IsConsecutive = true;
4531 for (const SCEV *PtrSCEV : SCEVs) {
4532 unsigned Dist = 0;
4533 if (PtrSCEV != PtrSCEVLowest) {
4534 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4535 const SCEV *Coeff = TryGetStride(Diff, Stride);
4536 if (!Coeff)
4537 return std::nullopt;
4538 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4539 if (!SC || isa<SCEVCouldNotCompute>(SC))
4540 return std::nullopt;
4541 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4542 SE.getMulExpr(Stride, SC)))
4543 ->isZero())
4544 return std::nullopt;
4545 Dist = SC->getAPInt().getZExtValue();
4546 }
4547 // If the strides are not the same or repeated, we can't vectorize.
4548 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4549 return std::nullopt;
4550 auto Res = Offsets.emplace(Dist, Cnt);
4551 if (!Res.second)
4552 return std::nullopt;
4553 // Consecutive order if the inserted element is the last one.
4554 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4555 ++Cnt;
4556 }
4557 if (Offsets.size() != SCEVs.size())
4558 return std::nullopt;
4559 SortedIndices.clear();
4560 if (!IsConsecutive) {
4561 // Fill SortedIndices array only if it is non-consecutive.
4562 SortedIndices.resize(PointerOps.size());
4563 Cnt = 0;
4564 for (const std::pair<int64_t, int> &Pair : Offsets) {
4565 SortedIndices[Cnt] = Pair.second;
4566 ++Cnt;
4567 }
4568 }
4569 if (!Inst)
4570 return nullptr;
4571 SCEVExpander Expander(SE, DL, "strided-load-vec");
4572 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4573}
4574
4575static std::pair<InstructionCost, InstructionCost>
4576getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4577 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4578 Type *ScalarTy, VectorType *VecTy);
4579
4580/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4581/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4582/// insert-subvector pattern.
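/// For example (illustrative), a two-source permute mask <0,1,8,9,4,5,6,7> on
/// 8-element sources is equivalent to inserting a two-element subvector of the
/// second source at index 2 of the first, and may be cheaper when modeled that
/// way.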
4583static InstructionCost
4584getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4585 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
4586 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4587 int Index = 0, VectorType *SubTp = nullptr,
4588 ArrayRef<const Value *> Args = std::nullopt) {
4589 if (Kind != TTI::SK_PermuteTwoSrc)
4590 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4591 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4592 int NumSubElts;
4593 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
4594 Mask, NumSrcElts, NumSubElts, Index)) {
4595 if (Index + NumSubElts > NumSrcElts &&
4596 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4597 return TTI.getShuffleCost(
4598 TTI::SK_InsertSubvector,
4599 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4600 TTI::TCK_RecipThroughput, Index, Tp);
4601 }
4602 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4603}
4604
4605BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4606 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4607 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4608 // Check that a vectorized load would load the same memory as a scalar
4609 // load. For example, we don't want to vectorize loads that are smaller
4610 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4611 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4612 // from such a struct, we read/write packed bits disagreeing with the
4613 // unvectorized version.
4614 Type *ScalarTy = VL0->getType();
4615
4616 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4617 return LoadsState::Gather;
4618
4619 // Make sure all loads in the bundle are simple - we can't vectorize
4620 // atomic or volatile loads.
4621 PointerOps.clear();
4622 const unsigned Sz = VL.size();
4623 PointerOps.resize(Sz);
4624 auto *POIter = PointerOps.begin();
4625 for (Value *V : VL) {
4626 auto *L = cast<LoadInst>(V);
4627 if (!L->isSimple())
4628 return LoadsState::Gather;
4629 *POIter = L->getPointerOperand();
4630 ++POIter;
4631 }
4632
4633 Order.clear();
4634 auto *VecTy = getWidenedType(ScalarTy, Sz);
4635 // Check the order of pointer operands or that all pointers are the same.
4636 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4637 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4638 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4639 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4640 "supported with VectorizeNonPowerOf2");
4641 return LoadsState::Gather;
4642 }
4643
4644 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4645 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4646 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4647 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4648 return LoadsState::StridedVectorize;
4649 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4650 return arePointersCompatible(P, PointerOps.front(), *TLI);
4651 })) {
4652 if (IsSorted) {
4653 Value *Ptr0;
4654 Value *PtrN;
4655 if (Order.empty()) {
4656 Ptr0 = PointerOps.front();
4657 PtrN = PointerOps.back();
4658 } else {
4659 Ptr0 = PointerOps[Order.front()];
4660 PtrN = PointerOps[Order.back()];
4661 }
4662 std::optional<int> Diff =
4663 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4664 // Check that the sorted loads are consecutive.
4665 if (static_cast<unsigned>(*Diff) == Sz - 1)
4666 return LoadsState::Vectorize;
4667 // Quick check whether the access could be strided (distance divisible by Sz - 1).
4668 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4669 // Try to generate strided load node if:
4670 // 1. Target with strided load support is detected.
4671 // 2. The number of loads is greater than MinProfitableStridedLoads,
4672 // or the potential stride <= MaxProfitableLoadStride and the
4673 // potential stride is power-of-2 (to avoid perf regressions for the very
4674 // small number of loads) and max distance > number of loads, or potential
4675 // stride is -1.
4676 // 3. The loads are ordered, or number of unordered loads <=
4677 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4678 // (this check is to avoid extra costs for very expensive shuffles).
4679 // 4. Any pointer operand is an instruction with the users outside of the
4680 // current graph (for masked gathers extra extractelement instructions
4681 // might be required).
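 // For example (illustrative), four i32 loads at element offsets {0, 2, 4, 6}
 // give *Diff == 6 and a potential stride of 2, and may be emitted as a
 // strided load node once the remaining legality and cost checks below pass.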
4682 auto IsAnyPointerUsedOutGraph =
4683 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
4684 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
4685 return !getTreeEntry(U) && !MustGather.contains(U);
4686 });
4687 });
4688 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4689 ((Sz > MinProfitableStridedLoads ||
4690 (static_cast<unsigned>(std::abs(*Diff)) <=
4691 MaxProfitableLoadStride * Sz &&
4692 isPowerOf2_32(std::abs(*Diff)))) &&
4693 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4694 *Diff == -(static_cast<int>(Sz) - 1))) {
4695 int Stride = *Diff / static_cast<int>(Sz - 1);
4696 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4697 Align Alignment =
4698 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4699 ->getAlign();
4700 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4701 // Iterate through all pointers and check if all distances are
4702 // unique multiples of Stride.
4703 SmallSet<int, 4> Dists;
4704 for (Value *Ptr : PointerOps) {
4705 int Dist = 0;
4706 if (Ptr == PtrN)
4707 Dist = *Diff;
4708 else if (Ptr != Ptr0)
4709 Dist =
4710 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4711 // If the strides are not the same or repeated, we can't
4712 // vectorize.
4713 if (((Dist / Stride) * Stride) != Dist ||
4714 !Dists.insert(Dist).second)
4715 break;
4716 }
4717 if (Dists.size() == Sz)
4718 return LoadsState::StridedVectorize;
4719 }
4720 }
4721 }
4722 }
4723 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4724 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4725 unsigned MinVF = getMinVF(Sz);
4726 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4727 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4728 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4729 unsigned VectorizedCnt = 0;
4730 SmallVector<LoadsState> States;
4731 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4732 Cnt += VF, ++VectorizedCnt) {
4733 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4734 SmallVector<unsigned> Order;
4735 SmallVector<Value *> PointerOps;
4736 LoadsState LS =
4737 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4738 /*TryRecursiveCheck=*/false);
4739 // Check that the sorted loads are consecutive.
4740 if (LS == LoadsState::Gather)
4741 break;
4742 // If reordering is needed, treat it as a high-cost masked gather for now.
4743 if ((LS == LoadsState::Vectorize ||
4744 LS == LoadsState::StridedVectorize) &&
4745 !Order.empty() && !isReverseOrder(Order))
4746 LS = LoadsState::ScatterVectorize;
4747 States.push_back(LS);
4748 }
4749 // Can be vectorized later as a series of loads/insertelements.
4750 if (VectorizedCnt == VL.size() / VF) {
4751 // Compare masked gather cost and loads + insertsubvector costs.
4752 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4753 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4754 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4755 CostKind, ScalarTy, VecTy);
4756 InstructionCost MaskedGatherCost =
4757 TTI.getGatherScatterOpCost(
4758 Instruction::Load, VecTy,
4759 cast<LoadInst>(VL0)->getPointerOperand(),
4760 /*VariableMask=*/false, CommonAlignment, CostKind) +
4761 VectorGEPCost - ScalarGEPCost;
4762 InstructionCost VecLdCost = 0;
4763 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4764 for (auto [I, LS] : enumerate(States)) {
4765 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4766 switch (LS) {
4767 case LoadsState::Vectorize: {
4768 auto [ScalarGEPCost, VectorGEPCost] =
4769 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4770 LI0->getPointerOperand(), Instruction::Load,
4771 CostKind, ScalarTy, SubVecTy);
4772 VecLdCost += TTI.getMemoryOpCost(
4773 Instruction::Load, SubVecTy, LI0->getAlign(),
4774 LI0->getPointerAddressSpace(), CostKind,
4775 TTI::OperandValueInfo()) +
4776 VectorGEPCost - ScalarGEPCost;
4777 break;
4778 }
4779 case LoadsState::StridedVectorize: {
4780 auto [ScalarGEPCost, VectorGEPCost] =
4781 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4782 LI0->getPointerOperand(), Instruction::Load,
4783 CostKind, ScalarTy, SubVecTy);
4784 VecLdCost +=
4785 TTI.getStridedMemoryOpCost(
4786 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4787 /*VariableMask=*/false, CommonAlignment, CostKind) +
4788 VectorGEPCost - ScalarGEPCost;
4789 break;
4790 }
4791 case LoadsState::ScatterVectorize: {
4792 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4793 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4794 LI0->getPointerOperand(), Instruction::GetElementPtr,
4795 CostKind, ScalarTy, SubVecTy);
4796 VecLdCost +=
4797 TTI.getGatherScatterOpCost(
4798 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4799 /*VariableMask=*/false, CommonAlignment, CostKind) +
4800 VectorGEPCost - ScalarGEPCost;
4801 break;
4802 }
4803 case LoadsState::Gather:
4805 "Expected only consecutive, strided or masked gather loads.");
4806 }
4807 SmallVector<int> ShuffleMask(VL.size());
4808 for (int Idx : seq<int>(0, VL.size()))
4809 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4810 VecLdCost +=
4811 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
4812 ShuffleMask, CostKind, I * VF, SubVecTy);
4813 }
4814 // If masked gather cost is higher - better to vectorize, so
4815 // consider it as a gather node. It will be better estimated
4816 // later.
4817 if (MaskedGatherCost >= VecLdCost)
4818 return true;
4819 }
4820 }
4821 return false;
4822 };
4823 // TODO: need to improve analysis of the pointers, if not all of them are
4824 // GEPs or have > 2 operands, we end up with a gather node, which just
4825 // increases the cost.
4826 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4827 bool ProfitableGatherPointers =
4828 L && Sz > 2 &&
4829 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4830 return L->isLoopInvariant(V);
4831 })) <= Sz / 2;
4832 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4833 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4834 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4835 (GEP && GEP->getNumOperands() == 2 &&
4836 isa<Constant, Instruction>(GEP->getOperand(1)));
4837 })) {
4838 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4839 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4840 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4841 // Check if potential masked gather can be represented as series
4842 // of loads + insertsubvectors.
4843 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4844 // If masked gather cost is higher - better to vectorize, so
4845 // consider it as a gather node. It will be better estimated
4846 // later.
4847 return LoadsState::Gather;
4848 }
4849 return LoadsState::ScatterVectorize;
4850 }
4851 }
4852 }
4853
4854 return LoadsState::Gather;
4855}
4856
4857static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4858 const DataLayout &DL, ScalarEvolution &SE,
4859 SmallVectorImpl<unsigned> &SortedIndices) {
4860 assert(llvm::all_of(
4861 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4862 "Expected list of pointer operands.");
4863 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4864 // Ptr into, sort and return the sorted indices with values next to one
4865 // another.
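 // For example (illustrative), VL = {p+1, p, q, p+2} (element offsets from two
 // unrelated pointers p and q) clusters as p+1 -> {(p+1,0,0), (p,-1,1),
 // (p+2,1,3)} and q -> {(q,0,2)}; the first cluster's offsets become
 // consecutive once sorted.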
4866 DenseMap<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4867 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4868
4869 unsigned Cnt = 1;
4870 for (Value *Ptr : VL.drop_front()) {
4871 bool Found = any_of(Bases, [&](auto &Base) {
4872 std::optional<int> Diff =
4873 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4874 /*StrictCheck=*/true);
4875 if (!Diff)
4876 return false;
4877
4878 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4879 return true;
4880 });
4881
4882 if (!Found) {
4883 // If we haven't found enough to usefully cluster, return early.
4884 if (Bases.size() > VL.size() / 2 - 1)
4885 return false;
4886
4887 // Not found already - add a new Base
4888 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4889 }
4890 }
4891
4892 // For each of the bases sort the pointers by Offset and check if any of the
4893 // bases become consecutively allocated.
4894 bool AnyConsecutive = false;
4895 for (auto &Base : Bases) {
4896 auto &Vec = Base.second;
4897 if (Vec.size() > 1) {
4898 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4899 const std::tuple<Value *, int, unsigned> &Y) {
4900 return std::get<1>(X) < std::get<1>(Y);
4901 });
4902 int InitialOffset = std::get<1>(Vec[0]);
4903 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4904 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4905 });
4906 }
4907 }
4908
4909 // Fill SortedIndices array only if it looks worthwhile to sort the ptrs.
4910 SortedIndices.clear();
4911 if (!AnyConsecutive)
4912 return false;
4913
4914 // If we have a better order, also sort the base pointers by increasing
4915 // (variable) values if possible, to try and keep the order more regular. In
4916 // order to create a valid strict-weak order we cluster by the Root of gep
4917 // chains and sort within each.
4918 SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
4919 for (auto &Base : Bases) {
4920 Value *Strip = Base.first->stripInBoundsConstantOffsets();
4921 Value *Root = Strip;
4922 while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
4923 Root = Gep->getOperand(0);
4924 SortedBases.emplace_back(Base.first, Strip, Root);
4925 }
4926 auto *Begin = SortedBases.begin();
4927 auto *End = SortedBases.end();
4928 while (Begin != End) {
4929 Value *Root = std::get<2>(*Begin);
4930 auto *Mid = std::stable_partition(
4931 Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
4932 DenseMap<Value *, DenseMap<Value *, bool>> LessThan;
4933 for (auto I = Begin; I < Mid; ++I)
4934 LessThan.try_emplace(std::get<1>(*I));
4935 for (auto I = Begin; I < Mid; ++I) {
4936 Value *V = std::get<1>(*I);
4937 while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
4938 V = Gep->getOperand(0);
4939 if (LessThan.contains(V))
4940 LessThan[V][std::get<1>(*I)] = true;
4941 }
4942 }
4943 std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
4944 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
4945 });
4946 Begin = Mid;
4947 }
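 // For example (illustrative), bases a = gep(p, i) and b = gep(a, j) share the
 // root p, and a sorts before b because b's gep chain passes through a.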
4948
4949 // Collect the final order of sorted indices
4950 for (auto Base : SortedBases)
4951 for (auto &T : Bases[std::get<0>(Base)])
4952 SortedIndices.push_back(std::get<2>(T));
4953
4954 assert(SortedIndices.size() == VL.size() &&
4955 "Expected SortedIndices to be the size of VL");
4956 return true;
4957}
4958
4959std::optional<BoUpSLP::OrdersType>
4960BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4961 assert(TE.isGather() && "Expected gather node only.");
4962 Type *ScalarTy = TE.Scalars[0]->getType();
4963
4964 SmallVector<Value *> Ptrs;
4965 Ptrs.reserve(TE.Scalars.size());
4966 for (