LLVM 20.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107
108#define SV_NAME "slp-vectorizer"
109#define DEBUG_TYPE "SLP"
110
111STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
112
113DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
114 "Controls which SLP graphs should be vectorized.");
115
116static cl::opt<bool>
117 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
118 cl::desc("Run the SLP vectorization passes"));
119
120static cl::opt<bool>
121 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
122 cl::desc("Enable vectorization for wider vector utilization"));
123
124static cl::opt<int>
125 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
126 cl::desc("Only vectorize if you gain more than this "
127 "number "));
128
129static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
130 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
131 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
132 "heuristics and makes vectorization decision via cost modeling."));
133
134static cl::opt<bool>
135ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
136 cl::desc("Attempt to vectorize horizontal reductions"));
137
138static cl::opt<bool> ShouldStartVectorizeHorAtStore(
139 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
140 cl::desc(
141 "Attempt to vectorize horizontal reductions feeding into a store"));
142
143static cl::opt<int>
144MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
145 cl::desc("Attempt to vectorize for this register size in bits"));
146
147static cl::opt<unsigned>
148MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
149 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
150
151/// Limits the size of scheduling regions in a block.
152/// It avoids long compile times for _very_ large blocks where vector
153/// instructions are spread over a wide range.
154/// This limit is way higher than needed by real-world functions.
155static cl::opt<int>
156ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
157 cl::desc("Limit the size of the SLP scheduling region per block"));
158
159static cl::opt<int> MinVectorRegSizeOption(
160 "slp-min-reg-size", cl::init(128), cl::Hidden,
161 cl::desc("Attempt to vectorize for this register size in bits"));
162
163static cl::opt<unsigned> RecursionMaxDepth(
164 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
165 cl::desc("Limit the recursion depth when building a vectorizable tree"));
166
167static cl::opt<unsigned> MinTreeSize(
168 "slp-min-tree-size", cl::init(3), cl::Hidden,
169 cl::desc("Only vectorize small trees if they are fully vectorizable"));
170
171// The maximum depth that the look-ahead score heuristic will explore.
172// The higher this value, the higher the compilation time overhead.
173static cl::opt<int> LookAheadMaxDepth(
174 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
175 cl::desc("The maximum look-ahead depth for operand reordering scores"));
176
177// The maximum depth that the look-ahead score heuristic will explore
178// when it is probing among candidates for vectorization tree roots.
179// The higher this value, the higher the compilation time overhead but, unlike
180// the similar limit for operand ordering, this is less frequently used, hence
181// the impact of a higher value is less noticeable.
182static cl::opt<int> RootLookAheadMaxDepth(
183 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
184 cl::desc("The maximum look-ahead depth for searching best rooting option"));
185
186static cl::opt<unsigned> MinProfitableStridedLoads(
187 "slp-min-strided-loads", cl::init(2), cl::Hidden,
188 cl::desc("The minimum number of loads, which should be considered strided, "
189 "if the stride is > 1 or is a runtime value"));
190
191static cl::opt<unsigned> MaxProfitableLoadStride(
192 "slp-max-stride", cl::init(8), cl::Hidden,
193 cl::desc("The maximum stride, considered to be profitable."));
194
195static cl::opt<bool>
196 ViewSLPTree("view-slp-tree", cl::Hidden,
197 cl::desc("Display the SLP trees with Graphviz"));
198
199static cl::opt<bool> VectorizeNonPowerOf2(
200 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
201 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
202
203// Limit the number of alias checks. The limit is chosen so that
204// it has no negative effect on the llvm benchmarks.
205static const unsigned AliasedCheckLimit = 10;
206
207// Limit of the number of uses for potentially transformed instructions/values,
208// used in checks to avoid compile-time explosion.
209static constexpr int UsesLimit = 64;
210
211// Another limit for the alias checks: The maximum distance between load/store
212// instructions where alias checks are done.
213// This limit is useful for very large basic blocks.
214static const unsigned MaxMemDepDistance = 160;
215
216/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
217/// regions to be handled.
218static const int MinScheduleRegionSize = 16;
219
220/// Maximum allowed number of operands in the PHI nodes.
221static const unsigned MaxPHINumOperands = 128;
222
223/// Predicate for the element types that the SLP vectorizer supports.
224///
225/// The most important thing to filter here are types which are invalid in LLVM
226/// vectors. We also filter target specific types which have absolutely no
227/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
228/// avoids spending time checking the cost model and realizing that they will
229/// be inevitably scalarized.
230static bool isValidElementType(Type *Ty) {
231 // TODO: Support ScalableVectorType.
232 if (SLPReVec && isa<FixedVectorType>(Ty))
233 Ty = Ty->getScalarType();
234 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
235 !Ty->isPPC_FP128Ty();
236}
237
238/// Returns the type of the given value/instruction \p V. If it is a store,
239/// returns the type of its value operand; for Cmp - the types of the compare
240/// operands; and for insertelement - the type of the inserted operand.
241/// Otherwise, just the type of the value is returned.
242static Type *getValueType(Value *V) {
243 if (auto *SI = dyn_cast<StoreInst>(V))
244 return SI->getValueOperand()->getType();
245 if (auto *CI = dyn_cast<CmpInst>(V))
246 return CI->getOperand(0)->getType();
247 if (auto *IE = dyn_cast<InsertElementInst>(V))
248 return IE->getOperand(1)->getType();
249 return V->getType();
250}
251
252/// \returns the number of elements for Ty.
253static unsigned getNumElements(Type *Ty) {
254 assert(!isa<ScalableVectorType>(Ty) &&
255 "ScalableVectorType is not supported.");
256 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
257 return VecTy->getNumElements();
258 return 1;
259}
260
261/// \returns the vector type of ScalarTy based on vectorization factor.
262static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
263 return FixedVectorType::get(ScalarTy->getScalarType(),
264 VF * getNumElements(ScalarTy));
265}
266
267/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
268/// which forms a type that \p TTI splits into whole vector types during
269/// legalization.
270static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
271 Type *Ty, unsigned Sz) {
272 if (!isValidElementType(Ty))
273 return bit_ceil(Sz);
274 // Find the number of elements, which forms full vectors.
275 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
276 if (NumParts == 0 || NumParts >= Sz)
277 return bit_ceil(Sz);
278 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279}
280
281/// Returns the number of elements of the given type \p Ty, not greater than \p
282/// Sz, which forms a type that \p TTI splits into whole vector types during
283/// legalization.
284static unsigned
285getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
286 unsigned Sz) {
287 if (!isValidElementType(Ty))
288 return bit_floor(Sz);
289 // Find the number of elements, which forms full vectors.
290 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291 if (NumParts == 0 || NumParts >= Sz)
292 return bit_floor(Sz);
293 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294 if (RegVF > Sz)
295 return bit_floor(Sz);
296 return (Sz / RegVF) * RegVF;
297}
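// The two helpers above round an element count to something that legalizes
// into whole vector registers. A minimal standalone sketch of that arithmetic
// in plain C++20, with no LLVM dependencies: NumParts stands in for the value
// the real code obtains from TTI.getNumberOfParts(), and the example numbers
// are illustrative assumptions only.
#include <bit>
#include <cstdio>

static unsigned roundUpToFullVectors(unsigned Sz, unsigned NumParts) {
  unsigned PerPart = std::bit_ceil((Sz + NumParts - 1) / NumParts);
  return PerPart * NumParts; // Not less than Sz; whole registers only.
}

static unsigned roundDownToFullVectors(unsigned Sz, unsigned NumParts) {
  unsigned PerPart = std::bit_ceil((Sz + NumParts - 1) / NumParts);
  if (PerPart > Sz)
    return std::bit_floor(Sz);
  return (Sz / PerPart) * PerPart; // Not greater than Sz; whole registers only.
}

int main() {
  // 10 scalars spread over 3 registers use 4 lanes per register, so the
  // nearest whole-register element counts are 12 (rounded up) and 8 (down).
  std::printf("%u %u\n", roundUpToFullVectors(10, 3),
              roundDownToFullVectors(10, 3));
}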
298
299static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
300 SmallVectorImpl<int> &Mask) {
301 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
302 // But the element has a different meaning for SLP (scalar) and REVEC
303 // (vector). We need to expand Mask into masks which shufflevector can use
304 // directly.
305 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
306 for (unsigned I : seq<unsigned>(Mask.size()))
307 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
308 I * VecTyNumElements, VecTyNumElements)))
309 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
310 : Mask[I] * VecTyNumElements + J;
311 Mask.swap(NewMask);
312}
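// A minimal standalone sketch (plain C++, no LLVM types) of the expansion
// performed above: a REVEC mask indexes whole sub-vectors, and every entry is
// expanded into VecTyNumElements consecutive shufflevector lane indices.
// kPoison mirrors PoisonMaskElem; the example in the trailing comment is an
// illustrative assumption, not taken from the original source.
#include <cstddef>
#include <vector>

constexpr int kPoison = -1;

std::vector<int> expandSubvectorMask(const std::vector<int> &Mask,
                                     unsigned VecTyNumElements) {
  std::vector<int> NewMask(Mask.size() * VecTyNumElements);
  for (std::size_t I = 0; I < Mask.size(); ++I)
    for (unsigned J = 0; J < VecTyNumElements; ++J)
      NewMask[I * VecTyNumElements + J] =
          Mask[I] == kPoison ? kPoison
                             : Mask[I] * (int)VecTyNumElements + (int)J;
  return NewMask;
}

// expandSubvectorMask({1, kPoison, 0}, 2) == {2, 3, kPoison, kPoison, 0, 1}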
313
314/// \returns the number of groups of shufflevectors.
315/// A group has the following features:
316/// 1. All of the values in a group are shufflevectors.
317/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
318/// 3. Together, the masks of the shufflevectors use all of the elements of the source.
319/// e.g., it is 1 group (%0)
320/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
321/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
322/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
323/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
324/// it is 2 groups (%3 and %4)
325/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
326/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
327/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
328/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
329/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
330/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
331/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
332/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
333/// it is 0 groups
334/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
335/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
336/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
337/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
338static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
339 if (VL.empty())
340 return 0;
341 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
342 return 0;
343 auto *SV = cast<ShuffleVectorInst>(VL.front());
344 unsigned SVNumElements =
345 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
346 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
347 if (SVNumElements % ShuffleMaskSize != 0)
348 return 0;
349 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
350 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
351 return 0;
352 unsigned NumGroup = 0;
353 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
354 auto *SV = cast<ShuffleVectorInst>(VL[I]);
355 Value *Src = SV->getOperand(0);
356 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
357 SmallBitVector ExpectedIndex(GroupSize);
358 if (!all_of(Group, [&](Value *V) {
359 auto *SV = cast<ShuffleVectorInst>(V);
360 // From the same source.
361 if (SV->getOperand(0) != Src)
362 return false;
363 int Index;
364 if (!SV->isExtractSubvectorMask(Index))
365 return false;
366 ExpectedIndex.set(Index / ShuffleMaskSize);
367 return true;
368 }))
369 return 0;
370 if (!ExpectedIndex.all())
371 return 0;
372 ++NumGroup;
373 }
374 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
375 return NumGroup;
376}
377
378/// \returns a shufflevector mask which is used to vectorize shufflevectors
379/// e.g.,
380/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
381/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
382/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
383/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
384/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
385/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
386/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
387/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
388/// the result is
389/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
390static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
391 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
392 auto *SV = cast<ShuffleVectorInst>(VL.front());
393 unsigned SVNumElements =
394 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
395 SmallVector<int> Mask;
396 unsigned AccumulateLength = 0;
397 for (Value *V : VL) {
398 auto *SV = cast<ShuffleVectorInst>(V);
399 for (int M : SV->getShuffleMask())
400 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
401 : AccumulateLength + M);
402 AccumulateLength += SVNumElements;
403 }
404 return Mask;
405}
406
407/// \returns True if the value is a constant (but not globals/constant
408/// expressions).
409static bool isConstant(Value *V) {
410 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
411}
412
413/// Checks if \p V is one of vector-like instructions, i.e. undef,
414/// insertelement/extractelement with constant indices for fixed vector type or
415/// extractvalue instruction.
416static bool isVectorLikeInstWithConstOps(Value *V) {
417 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
418 !isa<ExtractValueInst, UndefValue>(V))
419 return false;
420 auto *I = dyn_cast<Instruction>(V);
421 if (!I || isa<ExtractValueInst>(I))
422 return true;
423 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
424 return false;
425 if (isa<ExtractElementInst>(I))
426 return isConstant(I->getOperand(1));
427 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
428 return isConstant(I->getOperand(2));
429}
430
431/// Returns power-of-2 number of elements in a single register (part), given the
432/// total number of elements \p Size and number of registers (parts) \p
433/// NumParts.
434static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
435 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
436}
437
438/// Returns correct remaining number of elements, considering total amount \p
439/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
440/// and current register (part) \p Part.
441static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
442 unsigned Part) {
443 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
444}
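// A minimal standalone sketch (plain C++20) of the register-splitting math in
// getPartNumElems and getNumElems above: Size scalars are distributed over
// NumParts registers and the last register may be only partially filled. The
// example values below are illustrative assumptions.
#include <algorithm>
#include <bit>

unsigned partNumElems(unsigned Size, unsigned NumParts) {
  return std::min(Size, std::bit_ceil((Size + NumParts - 1) / NumParts));
}

unsigned numElemsInPart(unsigned Size, unsigned PartNumElems, unsigned Part) {
  return std::min(PartNumElems, Size - Part * PartNumElems);
}

// With Size = 10 and NumParts = 3: partNumElems(10, 3) == 4, and the three
// parts hold numElemsInPart(10, 4, 0) == 4, numElemsInPart(10, 4, 1) == 4 and
// numElemsInPart(10, 4, 2) == 2 elements.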
445
446#if !defined(NDEBUG)
447/// Print a short descriptor of the instruction bundle suitable for debug output.
448static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
449 std::string Result;
450 raw_string_ostream OS(Result);
451 if (Idx >= 0)
452 OS << "Idx: " << Idx << ", ";
453 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
454 return Result;
455}
456#endif
457
458/// \returns true if all of the instructions in \p VL are in the same block or
459/// false otherwise.
460static bool allSameBlock(ArrayRef<Value *> VL) {
461 auto *It = find_if(VL, IsaPred<Instruction>);
462 if (It == VL.end())
463 return false;
464 Instruction *I0 = cast<Instruction>(*It);
465 if (all_of(VL, isVectorLikeInstWithConstOps))
466 return true;
467
468 BasicBlock *BB = I0->getParent();
469 for (Value *V : iterator_range(It, VL.end())) {
470 if (isa<PoisonValue>(V))
471 continue;
472 auto *II = dyn_cast<Instruction>(V);
473 if (!II)
474 return false;
475
476 if (BB != II->getParent())
477 return false;
478 }
479 return true;
480}
481
482/// \returns True if all of the values in \p VL are constants (but not
483/// globals/constant expressions).
484static bool allConstant(ArrayRef<Value *> VL) {
485 // Constant expressions and globals can't be vectorized like normal integer/FP
486 // constants.
487 return all_of(VL, isConstant);
488}
489
490/// \returns True if all of the values in \p VL are identical or some of them
491/// are UndefValue.
492static bool isSplat(ArrayRef<Value *> VL) {
493 Value *FirstNonUndef = nullptr;
494 for (Value *V : VL) {
495 if (isa<UndefValue>(V))
496 continue;
497 if (!FirstNonUndef) {
498 FirstNonUndef = V;
499 continue;
500 }
501 if (V != FirstNonUndef)
502 return false;
503 }
504 return FirstNonUndef != nullptr;
505}
506
507/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
508static bool isCommutative(Instruction *I) {
509 if (auto *Cmp = dyn_cast<CmpInst>(I))
510 return Cmp->isCommutative();
511 if (auto *BO = dyn_cast<BinaryOperator>(I))
512 return BO->isCommutative() ||
513 (BO->getOpcode() == Instruction::Sub &&
514 !BO->hasNUsesOrMore(UsesLimit) &&
515 all_of(
516 BO->uses(),
517 [](const Use &U) {
518 // Commutative, if icmp eq/ne sub, 0
519 CmpPredicate Pred;
520 if (match(U.getUser(),
521 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
522 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
523 return true;
524 // Commutative, if abs(sub nsw, true) or abs(sub, false).
525 ConstantInt *Flag;
526 return match(U.getUser(),
527 m_Intrinsic<Intrinsic::abs>(
528 m_Specific(U.get()), m_ConstantInt(Flag))) &&
529 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
530 Flag->isOne());
531 })) ||
532 (BO->getOpcode() == Instruction::FSub &&
533 !BO->hasNUsesOrMore(UsesLimit) &&
534 all_of(BO->uses(), [](const Use &U) {
535 return match(U.getUser(),
536 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
537 }));
538 return I->isCommutative();
539}
540
541template <typename T>
542static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
543 unsigned Offset) {
544 static_assert(std::is_same_v<T, InsertElementInst> ||
545 std::is_same_v<T, ExtractElementInst>,
546 "unsupported T");
547 int Index = Offset;
548 if (const auto *IE = dyn_cast<T>(Inst)) {
549 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
550 if (!VT)
551 return std::nullopt;
552 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
553 if (!CI)
554 return std::nullopt;
555 if (CI->getValue().uge(VT->getNumElements()))
556 return std::nullopt;
557 Index *= VT->getNumElements();
558 Index += CI->getZExtValue();
559 return Index;
560 }
561 return std::nullopt;
562}
563
564/// \returns inserting or extracting index of InsertElement, ExtractElement or
565/// InsertValue instruction, using Offset as base offset for index.
566/// \returns std::nullopt if the index is not an immediate.
567static std::optional<unsigned> getElementIndex(const Value *Inst,
568 unsigned Offset = 0) {
569 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
570 return Index;
571 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
572 return Index;
573
574 int Index = Offset;
575
576 const auto *IV = dyn_cast<InsertValueInst>(Inst);
577 if (!IV)
578 return std::nullopt;
579
580 Type *CurrentType = IV->getType();
581 for (unsigned I : IV->indices()) {
582 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
583 Index *= ST->getNumElements();
584 CurrentType = ST->getElementType(I);
585 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
586 Index *= AT->getNumElements();
587 CurrentType = AT->getElementType();
588 } else {
589 return std::nullopt;
590 }
591 Index += I;
592 }
593 return Index;
594}
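// A minimal standalone sketch of the index flattening getElementIndex performs
// for insertvalue above: every aggregate level multiplies the running index by
// that level's element count before adding the level's own index. The aggregate
// shape and indices below are illustrative assumptions.
#include <cstddef>
#include <vector>

// Dims[L] is the element count of the aggregate type at nesting level L.
std::size_t flattenAggregateIndex(const std::vector<std::size_t> &Dims,
                                  const std::vector<std::size_t> &Indices) {
  std::size_t Index = 0;
  for (std::size_t L = 0; L < Indices.size(); ++L)
    Index = Index * Dims[L] + Indices[L];
  return Index;
}

// For a [2 x [4 x float]] aggregate, the element at indices {1, 2} flattens to
// flattenAggregateIndex({2, 4}, {1, 2}) == 1 * 4 + 2 == 6.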
595
596namespace {
597/// Specifies the way the mask should be analyzed for undefs/poisonous elements
598/// in the shuffle mask.
599enum class UseMask {
600 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
601 ///< check for the mask elements for the first argument (mask
602 ///< indices are in range [0:VF)).
603 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
604 ///< for the mask elements for the second argument (mask indices
605 ///< are in range [VF:2*VF))
606 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
607 ///< future shuffle elements and mark them as ones as being used
608 ///< in future. Non-undef elements are considered as unused since
609 ///< they're already marked as used in the mask.
610};
611} // namespace
612
613/// Prepares a use bitset for the given mask either for the first argument or
614/// for the second.
615static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
616 UseMask MaskArg) {
617 SmallBitVector UseMask(VF, true);
618 for (auto [Idx, Value] : enumerate(Mask)) {
619 if (Value == PoisonMaskElem) {
620 if (MaskArg == UseMask::UndefsAsMask)
621 UseMask.reset(Idx);
622 continue;
623 }
624 if (MaskArg == UseMask::FirstArg && Value < VF)
625 UseMask.reset(Value);
626 else if (MaskArg == UseMask::SecondArg && Value >= VF)
627 UseMask.reset(Value - VF);
628 }
629 return UseMask;
630}
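// A minimal standalone sketch (plain C++) of the use-bitset construction
// above: for FirstArg/SecondArg a source lane's bit is cleared once the
// shuffle mask selects it, while UndefsAsMask records the poisoned mask
// positions themselves. The example values are illustrative assumptions.
#include <cstddef>
#include <vector>

constexpr int kPoisonElem = -1;
enum class MaskArgKind { FirstArg, SecondArg, UndefsAsMask };

std::vector<bool> buildUseBits(int VF, const std::vector<int> &Mask,
                               MaskArgKind Which) {
  std::vector<bool> Used(VF, true);
  for (std::size_t Idx = 0; Idx < Mask.size(); ++Idx) {
    int V = Mask[Idx];
    if (V == kPoisonElem) {
      if (Which == MaskArgKind::UndefsAsMask)
        Used[Idx] = false;
      continue;
    }
    if (Which == MaskArgKind::FirstArg && V < VF)
      Used[V] = false;
    else if (Which == MaskArgKind::SecondArg && V >= VF)
      Used[V - VF] = false;
  }
  return Used;
}

// With VF = 4 and Mask = {0, 5, kPoisonElem, 3}: FirstArg clears bits 0 and 3,
// SecondArg clears bit 1, and UndefsAsMask clears only bit 2.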
631
632/// Checks if the given value is actually an undefined constant vector.
633/// Also, if the \p UseMask is not empty, tries to check if the non-masked
634/// elements actually mask the insertelement buildvector, if any.
635template <bool IsPoisonOnly = false>
636static SmallBitVector isUndefVector(const Value *V,
637 const SmallBitVector &UseMask = {}) {
638 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
639 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
640 if (isa<T>(V))
641 return Res;
642 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
643 if (!VecTy)
644 return Res.reset();
645 auto *C = dyn_cast<Constant>(V);
646 if (!C) {
647 if (!UseMask.empty()) {
648 const Value *Base = V;
649 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
650 Base = II->getOperand(0);
651 if (isa<T>(II->getOperand(1)))
652 continue;
653 std::optional<unsigned> Idx = getElementIndex(II);
654 if (!Idx) {
655 Res.reset();
656 return Res;
657 }
658 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
659 Res.reset(*Idx);
660 }
661 // TODO: Add analysis for shuffles here too.
662 if (V == Base) {
663 Res.reset();
664 } else {
665 SmallBitVector SubMask(UseMask.size(), false);
666 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
667 }
668 } else {
669 Res.reset();
670 }
671 return Res;
672 }
673 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
674 if (Constant *Elem = C->getAggregateElement(I))
675 if (!isa<T>(Elem) &&
676 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
677 Res.reset(I);
678 }
679 return Res;
680}
681
682/// Checks if the vector of instructions can be represented as a shuffle, like:
683/// %x0 = extractelement <4 x i8> %x, i32 0
684/// %x3 = extractelement <4 x i8> %x, i32 3
685/// %y1 = extractelement <4 x i8> %y, i32 1
686/// %y2 = extractelement <4 x i8> %y, i32 2
687/// %x0x0 = mul i8 %x0, %x0
688/// %x3x3 = mul i8 %x3, %x3
689/// %y1y1 = mul i8 %y1, %y1
690/// %y2y2 = mul i8 %y2, %y2
691/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
692/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
693/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
694/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
695/// ret <4 x i8> %ins4
696/// can be transformed into:
697/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
698/// i32 6>
699/// %2 = mul <4 x i8> %1, %1
700/// ret <4 x i8> %2
701/// Mask will return the Shuffle Mask equivalent to the extracted elements.
702/// TODO: Can we split off and reuse the shuffle mask detection from
703/// ShuffleVectorInst/getShuffleCost?
704static std::optional<TargetTransformInfo::ShuffleKind>
705isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
706 AssumptionCache *AC) {
707 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
708 if (It == VL.end())
709 return std::nullopt;
710 unsigned Size =
711 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
712 auto *EI = dyn_cast<ExtractElementInst>(V);
713 if (!EI)
714 return S;
715 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
716 if (!VTy)
717 return S;
718 return std::max(S, VTy->getNumElements());
719 });
720
721 Value *Vec1 = nullptr;
722 Value *Vec2 = nullptr;
723 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
724 auto *EE = dyn_cast<ExtractElementInst>(V);
725 if (!EE)
726 return false;
727 Value *Vec = EE->getVectorOperand();
728 if (isa<UndefValue>(Vec))
729 return false;
730 return isGuaranteedNotToBePoison(Vec, AC);
731 });
732 enum ShuffleMode { Unknown, Select, Permute };
733 ShuffleMode CommonShuffleMode = Unknown;
734 Mask.assign(VL.size(), PoisonMaskElem);
735 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
736 // Undef can be represented as an undef element in a vector.
737 if (isa<UndefValue>(VL[I]))
738 continue;
739 auto *EI = cast<ExtractElementInst>(VL[I]);
740 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
741 return std::nullopt;
742 auto *Vec = EI->getVectorOperand();
743 // We can extractelement from undef or poison vector.
744 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
745 continue;
746 // All vector operands must have the same number of vector elements.
747 if (isa<UndefValue>(Vec)) {
748 Mask[I] = I;
749 } else {
750 if (isa<UndefValue>(EI->getIndexOperand()))
751 continue;
752 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
753 if (!Idx)
754 return std::nullopt;
755 // Undefined behavior if Idx is negative or >= Size.
756 if (Idx->getValue().uge(Size))
757 continue;
758 unsigned IntIdx = Idx->getValue().getZExtValue();
759 Mask[I] = IntIdx;
760 }
761 if (isUndefVector(Vec).all() && HasNonUndefVec)
762 continue;
763 // For correct shuffling we have to have at most 2 different vector operands
764 // in all extractelement instructions.
765 if (!Vec1 || Vec1 == Vec) {
766 Vec1 = Vec;
767 } else if (!Vec2 || Vec2 == Vec) {
768 Vec2 = Vec;
769 Mask[I] += Size;
770 } else {
771 return std::nullopt;
772 }
773 if (CommonShuffleMode == Permute)
774 continue;
775 // If the extract index is not the same as the operation number, it is a
776 // permutation.
777 if (Mask[I] % Size != I) {
778 CommonShuffleMode = Permute;
779 continue;
780 }
781 CommonShuffleMode = Select;
782 }
783 // If we're not crossing lanes in different vectors, consider it as blending.
784 if (CommonShuffleMode == Select && Vec2)
785 return TargetTransformInfo::SK_Select;
786 // If Vec2 was never used, we have a permutation of a single vector, otherwise
787 // we have permutation of 2 vectors.
788 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
789 : TargetTransformInfo::SK_PermuteSingleSrc;
790}
791
792/// \returns True if Extract{Value,Element} instruction extracts element Idx.
793static std::optional<unsigned> getExtractIndex(Instruction *E) {
794 unsigned Opcode = E->getOpcode();
795 assert((Opcode == Instruction::ExtractElement ||
796 Opcode == Instruction::ExtractValue) &&
797 "Expected extractelement or extractvalue instruction.");
798 if (Opcode == Instruction::ExtractElement) {
799 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
800 if (!CI)
801 return std::nullopt;
802 return CI->getZExtValue();
803 }
804 auto *EI = cast<ExtractValueInst>(E);
805 if (EI->getNumIndices() != 1)
806 return std::nullopt;
807 return *EI->idx_begin();
808}
809
810namespace {
811
812/// Main data required for vectorization of instructions.
813class InstructionsState {
814 /// The main/alternate instruction. MainOp is also VL0.
815 Instruction *MainOp = nullptr;
816 Instruction *AltOp = nullptr;
817
818public:
819 Instruction *getMainOp() const { return MainOp; }
820
821 Instruction *getAltOp() const { return AltOp; }
822
823 /// The main/alternate opcodes for the list of instructions.
824 unsigned getOpcode() const {
825 return MainOp ? MainOp->getOpcode() : 0;
826 }
827
828 unsigned getAltOpcode() const {
829 return AltOp ? AltOp->getOpcode() : 0;
830 }
831
832 /// Some of the instructions in the list have alternate opcodes.
833 bool isAltShuffle() const { return AltOp != MainOp; }
834
835 bool isOpcodeOrAlt(Instruction *I) const {
836 unsigned CheckedOpcode = I->getOpcode();
837 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
838 }
839
840 InstructionsState() = delete;
841 InstructionsState(Instruction *MainOp, Instruction *AltOp)
842 : MainOp(MainOp), AltOp(AltOp) {}
843 static InstructionsState invalid() { return {nullptr, nullptr}; }
844};
845
846} // end anonymous namespace
847
848/// \returns true if \p Opcode is allowed as part of the main/alternate
849/// instruction for SLP vectorization.
850///
851/// Example of unsupported opcode is SDIV that can potentially cause UB if the
852/// "shuffled out" lane would result in division by zero.
853static bool isValidForAlternation(unsigned Opcode) {
854 if (Instruction::isIntDivRem(Opcode))
855 return false;
856
857 return true;
858}
859
860static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
861 const TargetLibraryInfo &TLI);
862
863/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
864/// compatible instructions or constants, or just some other regular values.
865static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
866 Value *Op1, const TargetLibraryInfo &TLI) {
867 return (isConstant(BaseOp0) && isConstant(Op0)) ||
868 (isConstant(BaseOp1) && isConstant(Op1)) ||
869 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
870 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
871 BaseOp0 == Op0 || BaseOp1 == Op1 ||
872 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
873 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
874}
875
876/// \returns true if a compare instruction \p CI has similar "look" and
877/// same predicate as \p BaseCI, "as is" or with its operands and predicate
878/// swapped, false otherwise.
879static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
880 const TargetLibraryInfo &TLI) {
881 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
882 "Assessing comparisons of different types?");
883 CmpInst::Predicate BasePred = BaseCI->getPredicate();
884 CmpInst::Predicate Pred = CI->getPredicate();
885 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
886
887 Value *BaseOp0 = BaseCI->getOperand(0);
888 Value *BaseOp1 = BaseCI->getOperand(1);
889 Value *Op0 = CI->getOperand(0);
890 Value *Op1 = CI->getOperand(1);
891
892 return (BasePred == Pred &&
893 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
894 (BasePred == SwappedPred &&
895 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
896}
897
898/// \returns analysis of the Instructions in \p VL described in
899/// InstructionsState, the Opcode that we suppose the whole list
900/// could be vectorized even if its structure is diverse.
901static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
902 const TargetLibraryInfo &TLI) {
903 // Make sure these are all Instructions.
904 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
905 return InstructionsState::invalid();
906
907 auto *It = find_if(VL, IsaPred<Instruction>);
908 if (It == VL.end())
909 return InstructionsState::invalid();
910
911 Value *V = *It;
912 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
913 if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
914 (VL.size() == 2 && InstCnt < 2))
915 return InstructionsState::invalid();
916
917 bool IsCastOp = isa<CastInst>(V);
918 bool IsBinOp = isa<BinaryOperator>(V);
919 bool IsCmpOp = isa<CmpInst>(V);
920 CmpInst::Predicate BasePred =
921 IsCmpOp ? cast<CmpInst>(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE;
922 unsigned Opcode = cast<Instruction>(V)->getOpcode();
923 unsigned AltOpcode = Opcode;
924 unsigned AltIndex = std::distance(VL.begin(), It);
925
926 bool SwappedPredsCompatible = [&]() {
927 if (!IsCmpOp)
928 return false;
929 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
930 UniquePreds.insert(BasePred);
931 UniqueNonSwappedPreds.insert(BasePred);
932 for (Value *V : VL) {
933 auto *I = dyn_cast<CmpInst>(V);
934 if (!I)
935 return false;
936 CmpInst::Predicate CurrentPred = I->getPredicate();
937 CmpInst::Predicate SwappedCurrentPred =
938 CmpInst::getSwappedPredicate(CurrentPred);
939 UniqueNonSwappedPreds.insert(CurrentPred);
940 if (!UniquePreds.contains(CurrentPred) &&
941 !UniquePreds.contains(SwappedCurrentPred))
942 UniquePreds.insert(CurrentPred);
943 }
944 // Total number of predicates is > 2, but if we consider swapped predicates
945 // as compatible there are only 2; treat swappable predicates as compatible
946 // opcodes, not as alternates.
947 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
948 }();
949 // Check for one alternate opcode from another BinaryOperator.
950 // TODO - generalize to support all operators (types, calls etc.).
951 auto *IBase = cast<Instruction>(V);
952 Intrinsic::ID BaseID = 0;
953 SmallVector<VFInfo> BaseMappings;
954 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
955 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
956 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
957 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
958 return InstructionsState::invalid();
959 }
960 bool AnyPoison = InstCnt != VL.size();
961 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
962 auto *I = dyn_cast<Instruction>(VL[Cnt]);
963 if (!I)
964 continue;
965
966 // Cannot combine poison and divisions.
967 // TODO: do some smart analysis of the CallInsts to exclude divide-like
968 // intrinsics/functions only.
969 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
970 return InstructionsState::invalid();
971 unsigned InstOpcode = I->getOpcode();
972 if (IsBinOp && isa<BinaryOperator>(I)) {
973 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
974 continue;
975 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
976 isValidForAlternation(Opcode)) {
977 AltOpcode = InstOpcode;
978 AltIndex = Cnt;
979 continue;
980 }
981 } else if (IsCastOp && isa<CastInst>(I)) {
982 Value *Op0 = IBase->getOperand(0);
983 Type *Ty0 = Op0->getType();
984 Value *Op1 = I->getOperand(0);
985 Type *Ty1 = Op1->getType();
986 if (Ty0 == Ty1) {
987 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
988 continue;
989 if (Opcode == AltOpcode) {
990 assert(isValidForAlternation(Opcode) &&
991 isValidForAlternation(InstOpcode) &&
992 "Cast isn't safe for alternation, logic needs to be updated!");
993 AltOpcode = InstOpcode;
994 AltIndex = Cnt;
995 continue;
996 }
997 }
998 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
999 auto *BaseInst = cast<CmpInst>(V);
1000 Type *Ty0 = BaseInst->getOperand(0)->getType();
1001 Type *Ty1 = Inst->getOperand(0)->getType();
1002 if (Ty0 == Ty1) {
1003 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1004 assert(InstOpcode == AltOpcode &&
1005 "Alternate instructions are only supported by BinaryOperator "
1006 "and CastInst.");
1007 // Check for compatible operands. If the corresponding operands are not
1008 // compatible - need to perform alternate vectorization.
1009 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1010 CmpInst::Predicate SwappedCurrentPred =
1011 CmpInst::getSwappedPredicate(CurrentPred);
1012
1013 if ((E == 2 || SwappedPredsCompatible) &&
1014 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1015 continue;
1016
1017 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1018 continue;
1019 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
1020 if (AltIndex) {
1021 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1022 continue;
1023 } else if (BasePred != CurrentPred) {
1024 assert(
1025 isValidForAlternation(InstOpcode) &&
1026 "CmpInst isn't safe for alternation, logic needs to be updated!");
1027 AltIndex = Cnt;
1028 continue;
1029 }
1030 CmpInst::Predicate AltPred = AltInst->getPredicate();
1031 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1032 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1033 continue;
1034 }
1035 } else if (InstOpcode == Opcode) {
1036 assert(InstOpcode == AltOpcode &&
1037 "Alternate instructions are only supported by BinaryOperator and "
1038 "CastInst.");
1039 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1040 if (Gep->getNumOperands() != 2 ||
1041 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
1042 return InstructionsState::invalid();
1043 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1044 if (!isVectorLikeInstWithConstOps(EI))
1045 return InstructionsState::invalid();
1046 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1047 auto *BaseLI = cast<LoadInst>(IBase);
1048 if (!LI->isSimple() || !BaseLI->isSimple())
1049 return InstructionsState::invalid();
1050 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1051 auto *CallBase = cast<CallInst>(IBase);
1052 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1053 return InstructionsState::invalid();
1054 if (Call->hasOperandBundles() &&
1055 (!CallBase->hasOperandBundles() ||
1056 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1057 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1058 CallBase->op_begin() +
1059 CallBase->getBundleOperandsStartIndex())))
1060 return InstructionsState::invalid();
1061 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1062 if (ID != BaseID)
1063 return InstructionsState::invalid();
1064 if (!ID) {
1065 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1066 if (Mappings.size() != BaseMappings.size() ||
1067 Mappings.front().ISA != BaseMappings.front().ISA ||
1068 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1069 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1070 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1071 Mappings.front().Shape.Parameters !=
1072 BaseMappings.front().Shape.Parameters)
1073 return InstructionsState::invalid();
1074 }
1075 }
1076 continue;
1077 }
1078 return InstructionsState::invalid();
1079 }
1080
1081 return InstructionsState(cast<Instruction>(V),
1082 cast<Instruction>(VL[AltIndex]));
1083}
1084
1085/// \returns true if all of the values in \p VL have the same type or false
1086/// otherwise.
1087static bool allSameType(ArrayRef<Value *> VL) {
1088 Type *Ty = VL.front()->getType();
1089 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1090}
1091
1092/// \returns True if in-tree use also needs extract. This refers to
1093/// possible scalar operand in vectorized instruction.
1094static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1095 TargetLibraryInfo *TLI,
1096 const TargetTransformInfo *TTI) {
1097 if (!UserInst)
1098 return false;
1099 unsigned Opcode = UserInst->getOpcode();
1100 switch (Opcode) {
1101 case Instruction::Load: {
1102 LoadInst *LI = cast<LoadInst>(UserInst);
1103 return (LI->getPointerOperand() == Scalar);
1104 }
1105 case Instruction::Store: {
1106 StoreInst *SI = cast<StoreInst>(UserInst);
1107 return (SI->getPointerOperand() == Scalar);
1108 }
1109 case Instruction::Call: {
1110 CallInst *CI = cast<CallInst>(UserInst);
1111 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1112 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1113 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1114 Arg.value().get() == Scalar;
1115 });
1116 }
1117 default:
1118 return false;
1119 }
1120}
1121
1122/// \returns the AA location that is being accessed by the instruction.
1123static MemoryLocation getLocation(Instruction *I) {
1124 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1125 return MemoryLocation::get(SI);
1126 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1127 return MemoryLocation::get(LI);
1128 return MemoryLocation();
1129}
1130
1131/// \returns True if the instruction is not a volatile or atomic load/store.
1132static bool isSimple(Instruction *I) {
1133 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1134 return LI->isSimple();
1135 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1136 return SI->isSimple();
1137 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1138 return !MI->isVolatile();
1139 return true;
1140}
1141
1142/// Shuffles \p Mask in accordance with the given \p SubMask.
1143/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1144/// one but two input vectors.
1145static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1146 bool ExtendingManyInputs = false) {
1147 if (SubMask.empty())
1148 return;
1149 assert(
1150 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1151 // Check if input scalars were extended to match the size of other node.
1152 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1153 "SubMask with many inputs support must be larger than the mask.");
1154 if (Mask.empty()) {
1155 Mask.append(SubMask.begin(), SubMask.end());
1156 return;
1157 }
1158 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1159 int TermValue = std::min(Mask.size(), SubMask.size());
1160 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1161 if (SubMask[I] == PoisonMaskElem ||
1162 (!ExtendingManyInputs &&
1163 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1164 continue;
1165 NewMask[I] = Mask[SubMask[I]];
1166 }
1167 Mask.swap(NewMask);
1168}
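// A minimal standalone sketch (plain C++) of the common addMask case above
// (ExtendingManyInputs == false, all indices in range): SubMask selects
// positions out of the current Mask, so applying it is a gather of Mask
// through SubMask. The example in the trailing comment is illustrative.
#include <cstddef>
#include <vector>

constexpr int kPoisonIdx = -1;

std::vector<int> composeMasks(const std::vector<int> &Mask,
                              const std::vector<int> &SubMask) {
  if (Mask.empty())
    return SubMask;
  std::vector<int> NewMask(SubMask.size(), kPoisonIdx);
  for (std::size_t I = 0; I < SubMask.size(); ++I)
    if (SubMask[I] != kPoisonIdx)
      NewMask[I] = Mask[SubMask[I]];
  return NewMask;
}

// composeMasks({2, 0, 3, 1}, {3, kPoisonIdx, 0}) == {1, kPoisonIdx, 2}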
1169
1170/// Order may have elements assigned special value (size) which is out of
1171/// bounds. Such indices only appear on places which correspond to undef values
1172/// (see canReuseExtract for details) and are used in order to avoid undef
1173/// values having an effect on operand ordering.
1174/// The first loop below simply finds all unused indices and then the next loop
1175/// nest assigns these indices for undef values positions.
1176/// As an example below Order has two undef positions and they have assigned
1177/// values 3 and 7 respectively:
1178/// before: 6 9 5 4 9 2 1 0
1179/// after: 6 3 5 4 7 2 1 0
1180static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1181 const unsigned Sz = Order.size();
1182 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1183 SmallBitVector MaskedIndices(Sz);
1184 for (unsigned I = 0; I < Sz; ++I) {
1185 if (Order[I] < Sz)
1186 UnusedIndices.reset(Order[I]);
1187 else
1188 MaskedIndices.set(I);
1189 }
1190 if (MaskedIndices.none())
1191 return;
1192 assert(UnusedIndices.count() == MaskedIndices.count() &&
1193 "Non-synced masked/available indices.");
1194 int Idx = UnusedIndices.find_first();
1195 int MIdx = MaskedIndices.find_first();
1196 while (MIdx >= 0) {
1197 assert(Idx >= 0 && "Indices must be synced.");
1198 Order[MIdx] = Idx;
1199 Idx = UnusedIndices.find_next(Idx);
1200 MIdx = MaskedIndices.find_next(MIdx);
1201 }
1202}
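// A minimal standalone sketch (plain C++) of the fixup above: entries that are
// out of bounds (>= Order.size()) mark undef positions and receive the still
// unused indices in increasing order. It reproduces the example from the
// comment: {6, 9, 5, 4, 9, 2, 1, 0} becomes {6, 3, 5, 4, 7, 2, 1, 0}.
#include <vector>

void fixupOrdering(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  for (unsigned V : Order)
    if (V < Sz)
      Used[V] = true;
  unsigned NextUnused = 0;
  for (unsigned &V : Order) {
    if (V < Sz)
      continue;
    while (Used[NextUnused])
      ++NextUnused;
    V = NextUnused;
    Used[NextUnused] = true;
  }
}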
1203
1204/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1205/// Opcode1.
1206static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1207 unsigned Opcode1) {
1208 Type *ScalarTy = VL[0]->getType();
1209 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1210 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1211 for (unsigned Lane : seq<unsigned>(VL.size())) {
1212 if (isa<PoisonValue>(VL[Lane]))
1213 continue;
1214 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1215 OpcodeMask.set(Lane * ScalarTyNumElements,
1216 Lane * ScalarTyNumElements + ScalarTyNumElements);
1217 }
1218 return OpcodeMask;
1219}
1220
1221namespace llvm {
1222
1223static void inversePermutation(ArrayRef<unsigned> Indices,
1224 SmallVectorImpl<int> &Mask) {
1225 Mask.clear();
1226 const unsigned E = Indices.size();
1227 Mask.resize(E, PoisonMaskElem);
1228 for (unsigned I = 0; I < E; ++I)
1229 Mask[Indices[I]] = I;
1230}
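// A minimal standalone sketch (plain C++) of the inverse-permutation helper
// above: the mask entry at position Indices[I] receives the value I, i.e. the
// mapping is inverted. The example in the trailing comment is illustrative.
#include <vector>

constexpr int kPoisonLane = -1;

std::vector<int> invertPermutation(const std::vector<unsigned> &Indices) {
  std::vector<int> Mask(Indices.size(), kPoisonLane);
  for (unsigned I = 0; I < Indices.size(); ++I)
    Mask[Indices[I]] = (int)I;
  return Mask;
}

// invertPermutation({2, 0, 1}) == {1, 2, 0}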
1231
1232/// Reorders the list of scalars in accordance with the given \p Mask.
1233static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1234 ArrayRef<int> Mask) {
1235 assert(!Mask.empty() && "Expected non-empty mask.");
1236 SmallVector<Value *> Prev(Scalars.size(),
1237 PoisonValue::get(Scalars.front()->getType()));
1238 Prev.swap(Scalars);
1239 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1240 if (Mask[I] != PoisonMaskElem)
1241 Scalars[Mask[I]] = Prev[I];
1242}
1243
1244/// Checks if the provided value does not require scheduling. It does not
1245/// require scheduling if this is not an instruction or it is an instruction
1246/// that does not read/write memory and all operands are either not instructions
1247/// or phi nodes or instructions from different blocks.
1248static bool areAllOperandsNonInsts(Value *V) {
1249 auto *I = dyn_cast<Instruction>(V);
1250 if (!I)
1251 return true;
1252 return !mayHaveNonDefUseDependency(*I) &&
1253 all_of(I->operands(), [I](Value *V) {
1254 auto *IO = dyn_cast<Instruction>(V);
1255 if (!IO)
1256 return true;
1257 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1258 });
1259}
1260
1261/// Checks if the provided value does not require scheduling. It does not
1262/// require scheduling if this is not an instruction or it is an instruction
1263/// that does not read/write memory and all users are phi nodes or instructions
1264/// from the different blocks.
1265static bool isUsedOutsideBlock(Value *V) {
1266 auto *I = dyn_cast<Instruction>(V);
1267 if (!I)
1268 return true;
1269 // Limits the number of uses to save compile time.
1270 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1271 all_of(I->users(), [I](User *U) {
1272 auto *IU = dyn_cast<Instruction>(U);
1273 if (!IU)
1274 return true;
1275 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1276 });
1277}
1278
1279/// Checks if the specified value does not require scheduling. It does not
1280/// require scheduling if all operands and all users do not need to be scheduled
1281/// in the current basic block.
1282static bool doesNotNeedToBeScheduled(Value *V) {
1283 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1284}
1285
1286/// Checks if the specified array of instructions does not require scheduling.
1287/// It is so if all either instructions have operands that do not require
1288/// scheduling or their users do not require scheduling since they are phis or
1289/// in other basic blocks.
1290static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1291 return !VL.empty() &&
1292 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1293}
1294
1295/// Returns true if widened type of \p Ty elements with size \p Sz represents
1296/// full vector type, i.e. adding extra element results in extra parts upon type
1297/// legalization.
1298static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1299 unsigned Sz) {
1300 if (Sz <= 1)
1301 return false;
1302 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1303 return false;
1304 if (has_single_bit(Sz))
1305 return true;
1306 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1307 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1308 Sz % NumParts == 0;
1309}
1310
1311namespace slpvectorizer {
1312
1313/// Bottom Up SLP Vectorizer.
1314class BoUpSLP {
1315 struct TreeEntry;
1316 struct ScheduleData;
1317 class ShuffleCostEstimator;
1318 class ShuffleInstructionBuilder;
1319
1320public:
1321 /// Tracks the state we can represent the loads in the given sequence.
1322 enum class LoadsState {
1323 Gather,
1324 Vectorize,
1325 ScatterVectorize,
1326 StridedVectorize
1327 };
1328
1335
1336 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1337 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1338 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1339 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1340 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1341 AC(AC), DB(DB), DL(DL), ORE(ORE),
1342 Builder(Se->getContext(), TargetFolder(*DL)) {
1343 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1344 // Use the vector register size specified by the target unless overridden
1345 // by a command-line option.
1346 // TODO: It would be better to limit the vectorization factor based on
1347 // data type rather than just register size. For example, x86 AVX has
1348 // 256-bit registers, but it does not support integer operations
1349 // at that width (that requires AVX2).
1350 if (MaxVectorRegSizeOption.getNumOccurrences())
1351 MaxVecRegSize = MaxVectorRegSizeOption;
1352 else
1353 MaxVecRegSize =
1355 .getFixedValue();
1356
1357 if (MinVectorRegSizeOption.getNumOccurrences())
1358 MinVecRegSize = MinVectorRegSizeOption;
1359 else
1360 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1361 }
1362
1363 /// Vectorize the tree that starts with the elements in \p VL.
1364 /// Returns the vectorized root.
1365 Value *vectorizeTree();
1366
1367 /// Vectorize the tree but with the list of externally used values \p
1368 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1369 /// generated extractvalue instructions.
1370 Value *
1371 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1372 Instruction *ReductionRoot = nullptr);
1373
1374 /// \returns the cost incurred by unwanted spills and fills, caused by
1375 /// holding live values over call sites.
1377
1378 /// \returns the vectorization cost of the subtree that starts at \p VL.
1379 /// A negative number means that this is profitable.
1380 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1381
1382 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1383 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1384 void buildTree(ArrayRef<Value *> Roots,
1385 const SmallDenseSet<Value *> &UserIgnoreLst);
1386
1387 /// Construct a vectorizable tree that starts at \p Roots.
1388 void buildTree(ArrayRef<Value *> Roots);
1389
1390 /// Returns whether the root node has in-tree uses.
1391 bool doesRootHaveInTreeUses() const {
1392 return !VectorizableTree.empty() &&
1393 !VectorizableTree.front()->UserTreeIndices.empty();
1394 }
1395
1396 /// Return the scalars of the root node.
1397 ArrayRef<Value *> getRootNodeScalars() const {
1398 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1399 return VectorizableTree.front()->Scalars;
1400 }
1401
1402 /// Returns the type/is-signed info for the root node in the graph without
1403 /// casting.
1404 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1405 const TreeEntry &Root = *VectorizableTree.front().get();
1406 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1407 !Root.Scalars.front()->getType()->isIntegerTy())
1408 return std::nullopt;
1409 auto It = MinBWs.find(&Root);
1410 if (It != MinBWs.end())
1411 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1412 It->second.first),
1413 It->second.second);
1414 if (Root.getOpcode() == Instruction::ZExt ||
1415 Root.getOpcode() == Instruction::SExt)
1416 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1417 Root.getOpcode() == Instruction::SExt);
1418 return std::nullopt;
1419 }
1420
1421 /// Checks if the root graph node can be emitted with narrower bitwidth at
1422 /// codegen and returns its signedness, if so.
1423 bool isSignedMinBitwidthRootNode() const {
1424 return MinBWs.at(VectorizableTree.front().get()).second;
1425 }
1426
1427 /// Returns reduction type after minbitwidth analysis.
1428 FixedVectorType *getReductionType() const {
1429 if (ReductionBitWidth == 0 ||
1430 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1431 ReductionBitWidth >=
1432 DL->getTypeSizeInBits(
1433 VectorizableTree.front()->Scalars.front()->getType()))
1434 return getWidenedType(
1435 VectorizableTree.front()->Scalars.front()->getType(),
1436 VectorizableTree.front()->getVectorFactor());
1437 return getWidenedType(
1438 IntegerType::get(
1439 VectorizableTree.front()->Scalars.front()->getContext(),
1440 ReductionBitWidth),
1441 VectorizableTree.front()->getVectorFactor());
1442 }
1443
1444 /// Builds external uses of the vectorized scalars, i.e. the list of
1445 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1446 /// ExternallyUsedValues contains additional list of external uses to handle
1447 /// vectorization of reductions.
1448 void
1449 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1450
1451 /// Transforms graph nodes to target specific representations, if profitable.
1452 void transformNodes();
1453
1454 /// Clear the internal data structures that are created by 'buildTree'.
1455 void deleteTree() {
1456 VectorizableTree.clear();
1457 ScalarToTreeEntry.clear();
1458 MultiNodeScalars.clear();
1459 MustGather.clear();
1460 NonScheduledFirst.clear();
1461 EntryToLastInstruction.clear();
1462 LoadEntriesToVectorize.clear();
1463 IsGraphTransformMode = false;
1464 GatheredLoadsEntriesFirst.reset();
1465 ExternalUses.clear();
1466 ExternalUsesAsOriginalScalar.clear();
1467 for (auto &Iter : BlocksSchedules) {
1468 BlockScheduling *BS = Iter.second.get();
1469 BS->clear();
1470 }
1471 MinBWs.clear();
1472 ReductionBitWidth = 0;
1473 BaseGraphSize = 1;
1474 CastMaxMinBWSizes.reset();
1475 ExtraBitWidthNodes.clear();
1476 InstrElementSize.clear();
1477 UserIgnoreList = nullptr;
1478 PostponedGathers.clear();
1479 ValueToGatherNodes.clear();
1480 }
1481
1482 unsigned getTreeSize() const { return VectorizableTree.size(); }
1483
1484 /// Returns the base graph size, before any transformations.
1485 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1486
1486 /// Perform LICM and CSE on the newly generated gather sequences.
1487 void optimizeGatherSequence();
1488
1490 /// Does this non-empty order represent an identity order? Identity
1491 /// should be represented as an empty order, so this is used to
1492 /// decide if we can canonicalize a computed order. Undef elements
1493 /// (represented as size) are ignored.
1494 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
1495 assert(!Order.empty() && "expected non-empty order");
1496 const unsigned Sz = Order.size();
1497 return all_of(enumerate(Order), [&](const auto &P) {
1498 return P.value() == P.index() || P.value() == Sz;
1499 });
1500 }
1501
1502 /// Checks if the specified gather tree entry \p TE can be represented as a
1503 /// shuffled vector entry + (possibly) permutation with other gathers. It
1504 /// implements the checks only for possibly ordered scalars (Loads,
1505 /// ExtractElement, ExtractValue), which can be part of the graph.
1506 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1507
1508 /// Sort loads into increasing pointers offsets to allow greater clustering.
1509 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1510
1511 /// Gets reordering data for the given tree entry. If the entry is vectorized
1512 /// - just return ReorderIndices, otherwise check if the scalars can be
1513 /// reordered and return the most optimal order.
1514 /// \return std::nullopt if ordering is not important, empty order, if
1515 /// identity order is important, or the actual order.
1516 /// \param TopToBottom If true, include the order of vectorized stores and
1517 /// insertelement nodes, otherwise skip them.
1518 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1519 bool TopToBottom);
1520
1521 /// Reorders the current graph to the most profitable order starting from the
1522 /// root node to the leaf nodes. The best order is chosen only from the nodes
1523 /// of the same size (vectorization factor). Smaller nodes are considered
1524 /// parts of subgraph with smaller VF and they are reordered independently. We
1525 /// can do this because we still need to extend smaller nodes to the wider VF
1526 /// and we can merge reordering shuffles with the widening shuffles.
1527 void reorderTopToBottom();
1528
1529 /// Reorders the current graph to the most profitable order starting from
1530 /// leaves to the root. It allows us to rotate small subgraphs and reduce the
1531 /// number of reshuffles if the leaf nodes use the same order. In this case we
1532 /// can merge the orders and just shuffle the user node instead of shuffling its
1533 /// operands. Plus, even if the leaf nodes have different orders, it allows us to
1534 /// sink reordering in the graph closer to the root node and merge it later
1535 /// during analysis.
1536 void reorderBottomToTop(bool IgnoreReorder = false);
1537
1538 /// \return The vector element size in bits to use when vectorizing the
1539 /// expression tree ending at \p V. If V is a store, the size is the width of
1540 /// the stored value. Otherwise, the size is the width of the largest loaded
1541 /// value reaching V. This method is used by the vectorizer to calculate
1542 /// vectorization factors.
1543 unsigned getVectorElementSize(Value *V);
1544
1545 /// Compute the minimum type sizes required to represent the entries in a
1546 /// vectorizable tree.
1548
1549 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1550 unsigned getMaxVecRegSize() const {
1551 return MaxVecRegSize;
1552 }
1553
1554 // \returns minimum vector register size as set by cl::opt.
1555 unsigned getMinVecRegSize() const {
1556 return MinVecRegSize;
1557 }
1558
1559 unsigned getMinVF(unsigned Sz) const {
1560 return std::max(2U, getMinVecRegSize() / Sz);
1561 }
1562
1563 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1564 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1565 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1566 return MaxVF ? MaxVF : UINT_MAX;
1567 }
1568
1569 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1570 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1571 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1572 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1573 ///
1574 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1575 unsigned canMapToVector(Type *T) const;
1576
1577 /// \returns True if the VectorizableTree is both tiny and not fully
1578 /// vectorizable. We do not vectorize such trees.
1579 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1580
1581 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1582 /// It may happen if all gather nodes are loads that cannot be
1583 /// "clusterized". In this case even subgraphs cannot be vectorized more
1584 /// effectively than the base graph.
1585 bool isTreeNotExtendable() const;
1586
1587 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1588 /// can be load combined in the backend. Load combining may not be allowed in
1589 /// the IR optimizer, so we do not want to alter the pattern. For example,
1590 /// partially transforming a scalar bswap() pattern into vector code is
1591 /// effectively impossible for the backend to undo.
1592 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1593 /// may not be necessary.
1594 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1595
1596 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1597 /// can be load combined in the backend. Load combining may not be allowed in
1598 /// the IR optimizer, so we do not want to alter the pattern. For example,
1599 /// partially transforming a scalar bswap() pattern into vector code is
1600 /// effectively impossible for the backend to undo.
1601 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1602 /// may not be necessary.
1603 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1604
1605 /// Checks if the given array of loads can be represented as a vectorized
1606 /// load, a scatter (masked gather), or just a simple gather.
1607 /// \param VL list of loads.
1608 /// \param VL0 main load value.
1609 /// \param Order returned order of load instructions.
1610 /// \param PointerOps returned list of pointer operands.
1611 /// \param BestVF returns the best vectorization factor, if the recursive
1612 /// check found vectorization sequences better than a masked gather.
1613 /// \param TryRecursiveCheck used to check if a long masked gather can be
1614 /// represented as a series of loads/insert-subvector sequences, if profitable.
1615 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1616 SmallVectorImpl<unsigned> &Order,
1617 SmallVectorImpl<Value *> &PointerOps,
1618 unsigned *BestVF = nullptr,
1619 bool TryRecursiveCheck = true) const;
1620
1621 /// Registers a non-vectorizable sequence of loads.
1622 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1623 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1624 }
1625
1626 /// Checks if the given sequence of loads is known to be non-vectorizable.
1627 template <typename T>
1628 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1629 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1630 }
1631
1633
1634 /// This structure holds any data we need about the edges being traversed
1635 /// during buildTree_rec(). We keep track of:
1636 /// (i) the user TreeEntry index, and
1637 /// (ii) the index of the edge.
1638 struct EdgeInfo {
1639 EdgeInfo() = default;
1640 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1641 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1642 /// The user TreeEntry.
1643 TreeEntry *UserTE = nullptr;
1644 /// The operand index of the use.
1645 unsigned EdgeIdx = UINT_MAX;
1646#ifndef NDEBUG
1647 friend inline raw_ostream &operator<<(raw_ostream &OS,
1648 const BoUpSLP::EdgeInfo &EI) {
1649 EI.dump(OS);
1650 return OS;
1651 }
1652 /// Debug print.
1653 void dump(raw_ostream &OS) const {
1654 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1655 << " EdgeIdx:" << EdgeIdx << "}";
1656 }
1657 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1658#endif
1659 bool operator == (const EdgeInfo &Other) const {
1660 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1661 }
1662 };
1663
1664 /// A helper class used for scoring candidates for two consecutive lanes.
1665 class LookAheadHeuristics {
1666 const TargetLibraryInfo &TLI;
1667 const DataLayout &DL;
1668 ScalarEvolution &SE;
1669 const BoUpSLP &R;
1670 int NumLanes; // Total number of lanes (aka vectorization factor).
1671 int MaxLevel; // The maximum recursion depth for accumulating score.
1672
1673 public:
1674 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1675 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1676 int MaxLevel)
1677 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1678 MaxLevel(MaxLevel) {}
1679
1680 // The exact values of the hard-coded scores listed here are not very
1681 // important, though they should be higher for better matches to improve the
1682 // resulting cost. When computing the scores for matching one sub-tree with
1683 // another, we are basically counting the number of values that match. So
1684 // even if all scores were set to 1, we would still get a decent matching
1685 // result. However, sometimes we have to break ties. For example, we may have
1686 // to choose between matching loads vs. matching opcodes. This is what these
1687 // scores help us with: they provide the order of preference. Also, this is
1688 // important if the scalar is externally used or used in another tree entry
1689 // node in a different lane.
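 // For intuition (an illustrative case, not an exhaustive rule): pairing
 // load(A[0]) with load(A[1]) scores ScoreConsecutiveLoads (4); pairing
 // load(A[0]) with itself scores ScoreSplatLoads (3) when a broadcast load is
 // legal for the target; pairing it with an unrelated constant typically
 // scores ScoreFail (0), so the consecutive-load pairing wins such ties.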
1690
1691 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1692 static const int ScoreConsecutiveLoads = 4;
1693 /// The same load multiple times. This should have a better score than
1694 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1695 /// with `movddup (%reg), xmm0` which has a throughput of 0.5, versus 0.5 for
1696 /// a vector load plus 1.0 for a broadcast.
1697 static const int ScoreSplatLoads = 3;
1698 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1699 static const int ScoreReversedLoads = 3;
1700 /// A load candidate for masked gather.
1701 static const int ScoreMaskedGatherCandidate = 1;
1702 /// ExtractElementInst from same vector and consecutive indexes.
1703 static const int ScoreConsecutiveExtracts = 4;
1704 /// ExtractElementInst from same vector and reversed indices.
1705 static const int ScoreReversedExtracts = 3;
1706 /// Constants.
1707 static const int ScoreConstants = 2;
1708 /// Instructions with the same opcode.
1709 static const int ScoreSameOpcode = 2;
1710 /// Instructions with alt opcodes (e.g., add + sub).
1711 static const int ScoreAltOpcodes = 1;
1712 /// Identical instructions (a.k.a. splat or broadcast).
1713 static const int ScoreSplat = 1;
1714 /// Matching with an undef is preferable to failing.
1715 static const int ScoreUndef = 1;
1716 /// Score for failing to find a decent match.
1717 static const int ScoreFail = 0;
1718 /// Score if all users are vectorized.
1719 static const int ScoreAllUserVectorized = 1;
1720
1721 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1722 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1723 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1724 /// MainAltOps.
1725 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1726 ArrayRef<Value *> MainAltOps) const {
1727 if (!isValidElementType(V1->getType()) ||
1728 !isValidElementType(V2->getType()))
1729 return LookAheadHeuristics::ScoreFail;
1730
1731 if (V1 == V2) {
1732 if (isa<LoadInst>(V1)) {
1733 // Returns true if the users of V1 and V2 won't need to be extracted.
1734 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1735 // Bail out if we have too many uses to save compilation time.
1736 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1737 return false;
1738
1739 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1740 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1741 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1742 });
1743 };
1744 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1745 };
1746 // A broadcast of a load can be cheaper on some targets.
1747 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1748 ElementCount::getFixed(NumLanes)) &&
1749 ((int)V1->getNumUses() == NumLanes ||
1750 AllUsersAreInternal(V1, V2)))
1751 return LookAheadHeuristics::ScoreSplatLoads;
1752 }
1753 return LookAheadHeuristics::ScoreSplat;
1754 }
1755
1756 auto CheckSameEntryOrFail = [&]() {
1757 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1758 TE1 && TE1 == R.getTreeEntry(V2))
1759 return LookAheadHeuristics::ScoreSplatLoads;
1760 return LookAheadHeuristics::ScoreFail;
1761 };
1762
1763 auto *LI1 = dyn_cast<LoadInst>(V1);
1764 auto *LI2 = dyn_cast<LoadInst>(V2);
1765 if (LI1 && LI2) {
1766 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1767 !LI2->isSimple())
1768 return CheckSameEntryOrFail();
1769
1770 std::optional<int> Dist = getPointersDiff(
1771 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1772 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1773 if (!Dist || *Dist == 0) {
1774 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1775 getUnderlyingObject(LI2->getPointerOperand()) &&
1776 R.TTI->isLegalMaskedGather(
1777 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1778 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1779 return CheckSameEntryOrFail();
1780 }
1781 // The distance is too large - still may be profitable to use masked
1782 // loads/gathers.
1783 if (std::abs(*Dist) > NumLanes / 2)
1784 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1785 // This still will detect consecutive loads, but we might have "holes"
1786 // in some cases. It is ok for non-power-2 vectorization and may produce
1787 // better results. It should not affect current vectorization.
1788 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1789 : LookAheadHeuristics::ScoreReversedLoads;
1790 }
1791
1792 auto *C1 = dyn_cast<Constant>(V1);
1793 auto *C2 = dyn_cast<Constant>(V2);
1794 if (C1 && C2)
1795 return LookAheadHeuristics::ScoreConstants;
1796
1797 // Extracts from consecutive indexes of the same vector get a better score,
1798 // as the extracts could be optimized away.
1799 Value *EV1;
1800 ConstantInt *Ex1Idx;
1801 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1802 // Undefs are always profitable for extractelements.
1803 // Compiler can easily combine poison and extractelement <non-poison> or
1804 // undef and extractelement <poison>. But combining undef +
1805 // extractelement <non-poison-but-may-produce-poison> requires some
1806 // extra operations.
1807 if (isa<UndefValue>(V2))
1808 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1809 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1810 : LookAheadHeuristics::ScoreSameOpcode;
1811 Value *EV2 = nullptr;
1812 ConstantInt *Ex2Idx = nullptr;
1813 if (match(V2,
1814 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1815 m_Undef())))) {
1816 // Undefs are always profitable for extractelements.
1817 if (!Ex2Idx)
1818 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1819 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1820 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1821 if (EV2 == EV1) {
1822 int Idx1 = Ex1Idx->getZExtValue();
1823 int Idx2 = Ex2Idx->getZExtValue();
1824 int Dist = Idx2 - Idx1;
1825 // The distance is too large - still may be profitable to use
1826 // shuffles.
1827 if (std::abs(Dist) == 0)
1828 return LookAheadHeuristics::ScoreSplat;
1829 if (std::abs(Dist) > NumLanes / 2)
1830 return LookAheadHeuristics::ScoreSameOpcode;
1831 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1832 : LookAheadHeuristics::ScoreReversedExtracts;
1833 }
1834 return LookAheadHeuristics::ScoreAltOpcodes;
1835 }
1836 return CheckSameEntryOrFail();
1837 }
1838
1839 auto *I1 = dyn_cast<Instruction>(V1);
1840 auto *I2 = dyn_cast<Instruction>(V2);
1841 if (I1 && I2) {
1842 if (I1->getParent() != I2->getParent())
1843 return CheckSameEntryOrFail();
1844 SmallVector<Value *, 4> Ops(MainAltOps);
1845 Ops.push_back(I1);
1846 Ops.push_back(I2);
1847 InstructionsState S = getSameOpcode(Ops, TLI);
1848 // Note: Only consider instructions with <= 2 operands to avoid
1849 // complexity explosion.
1850 if (S.getOpcode() &&
1851 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1852 !S.isAltShuffle()) &&
1853 all_of(Ops, [&S](Value *V) {
1854 return isa<PoisonValue>(V) ||
1855 cast<Instruction>(V)->getNumOperands() ==
1856 S.getMainOp()->getNumOperands();
1857 }))
1858 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1859 : LookAheadHeuristics::ScoreSameOpcode;
1860 }
1861
1862 if (I1 && isa<PoisonValue>(V2))
1863 return LookAheadHeuristics::ScoreSameOpcode;
1864
1865 if (isa<UndefValue>(V2))
1866 return LookAheadHeuristics::ScoreUndef;
1867
1868 return CheckSameEntryOrFail();
1869 }
1870
1871 /// Go through the operands of \p LHS and \p RHS recursively until
1872 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1873 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1874 /// of \p U1 and \p U2), except at the beginning of the recursion where
1875 /// these are set to nullptr.
1876 ///
1877 /// For example:
1878 /// \verbatim
1879 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1880 /// \ / \ / \ / \ /
1881 /// + + + +
1882 /// G1 G2 G3 G4
1883 /// \endverbatim
1884 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1885 /// each level recursively, accumulating the score. It starts from matching
1886 /// the additions at level 0, then moves on to the loads (level 1). The
1887 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1888 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1889 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1890 /// Please note that the order of the operands does not matter, as we
1891 /// evaluate the score of all profitable combinations of operands. In
1892 /// other words, the score of G1 and G4 is the same as that of G1 and G2. This
1893 /// heuristic is based on ideas described in:
1894 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1895 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1896 /// Luís F. W. Góes
1897 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1898 Instruction *U2, int CurrLevel,
1899 ArrayRef<Value *> MainAltOps) const {
1900
1901 // Get the shallow score of LHS and RHS.
1902 int ShallowScoreAtThisLevel =
1903 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1904
1905 // If reached MaxLevel,
1906 // or if LHS and RHS are not instructions,
1907 // or if they are SPLAT,
1908 // or if they are not consecutive,
1909 // or if profitable to vectorize loads or extractelements, early return
1910 // the current cost.
1911 auto *I1 = dyn_cast<Instruction>(LHS);
1912 auto *I2 = dyn_cast<Instruction>(RHS);
1913 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1914 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1915 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1916 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1917 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1918 ShallowScoreAtThisLevel))
1919 return ShallowScoreAtThisLevel;
1920 assert(I1 && I2 && "Should have early exited.");
1921
1922 // Contains the I2 operand indexes that got matched with I1 operands.
1923 SmallSet<unsigned, 4> Op2Used;
1924
1925 // Recursion towards the operands of I1 and I2. We are trying all possible
1926 // operand pairs, and keeping track of the best score.
1927 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1928 OpIdx1 != NumOperands1; ++OpIdx1) {
1929 // Try to pair the operand at OpIdx1 of I1 with the best operand of I2.
1930 int MaxTmpScore = 0;
1931 unsigned MaxOpIdx2 = 0;
1932 bool FoundBest = false;
1933 // If I2 is commutative try all combinations.
1934 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1935 unsigned ToIdx = isCommutative(I2)
1936 ? I2->getNumOperands()
1937 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1938 assert(FromIdx <= ToIdx && "Bad index");
1939 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1940 // Skip operands already paired with OpIdx1.
1941 if (Op2Used.count(OpIdx2))
1942 continue;
1943 // Recursively calculate the cost at each level
1944 int TmpScore =
1945 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1946 I1, I2, CurrLevel + 1, {});
1947 // Look for the best score.
1948 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1949 TmpScore > MaxTmpScore) {
1950 MaxTmpScore = TmpScore;
1951 MaxOpIdx2 = OpIdx2;
1952 FoundBest = true;
1953 }
1954 }
1955 if (FoundBest) {
1956 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1957 Op2Used.insert(MaxOpIdx2);
1958 ShallowScoreAtThisLevel += MaxTmpScore;
1959 }
1960 }
1961 return ShallowScoreAtThisLevel;
1962 }
1963 };
1964 /// A helper data structure to hold the operands of a vector of instructions.
1965 /// This supports a fixed vector length for all operand vectors.
1966 class VLOperands {
1967 /// For each operand we need (i) the value, and (ii) the opcode that it
1968 /// would be attached to if the expression was in a left-linearized form.
1969 /// This is required to avoid illegal operand reordering.
1970 /// For example:
1971 /// \verbatim
1972 /// 0 Op1
1973 /// |/
1974 /// Op1 Op2 Linearized + Op2
1975 /// \ / ----------> |/
1976 /// - -
1977 ///
1978 /// Op1 - Op2 (0 + Op1) - Op2
1979 /// \endverbatim
1980 ///
1981 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1982 ///
1983 /// Another way to think of this is to track all the operations across the
1984 /// path from the operand all the way to the root of the tree and to
1985 /// calculate the operation that corresponds to this path. For example, the
1986 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1987 /// corresponding operation is a '-' (which matches the one in the
1988 /// linearized tree, as shown above).
1989 ///
1990 /// For lack of a better term, we refer to this operation as Accumulated
1991 /// Path Operation (APO).
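 /// For example (a made-up two-lane bundle): for
 ///   %x = sub i32 %a, %b
 ///   %y = sub i32 %c, %d
 /// lane 0 records {V:%a, APO:false} for operand 0 and {V:%b, APO:true} for
 /// operand 1, because in the left-linearized form (0 + %a) - %b the path from
 /// %b to the root crosses the RHS of a '-'.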
1992 struct OperandData {
1993 OperandData() = default;
1994 OperandData(Value *V, bool APO, bool IsUsed)
1995 : V(V), APO(APO), IsUsed(IsUsed) {}
1996 /// The operand value.
1997 Value *V = nullptr;
1998 /// TreeEntries only allow a single opcode, or an alternate sequence of
1999 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2000 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2001 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2002 /// (e.g., Add/Mul).
2003 bool APO = false;
2004 /// Helper data for the reordering function.
2005 bool IsUsed = false;
2006 };
2007
2008 /// During operand reordering, we are trying to select the operand at lane
2009 /// that matches best with the operand at the neighboring lane. Our
2010 /// selection is based on the type of value we are looking for. For example,
2011 /// if the neighboring lane has a load, we need to look for a load that is
2012 /// accessing a consecutive address. These strategies are summarized in the
2013 /// 'ReorderingMode' enumerator.
2014 enum class ReorderingMode {
2015 Load, ///< Matching loads to consecutive memory addresses
2016 Opcode, ///< Matching instructions based on opcode (same or alternate)
2017 Constant, ///< Matching constants
2018 Splat, ///< Matching the same instruction multiple times (broadcast)
2019 Failed, ///< We failed to create a vectorizable group
2020 };
2021
2022 using OperandDataVec = SmallVector<OperandData, 2>;
2023
2024 /// A vector of operand vectors.
2025 SmallVector<OperandDataVec, 4> OpsVec;
2026 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2027 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2028 unsigned ArgSize = 0;
2029
2030 const TargetLibraryInfo &TLI;
2031 const DataLayout &DL;
2032 ScalarEvolution &SE;
2033 const BoUpSLP &R;
2034 const Loop *L = nullptr;
2035
2036 /// \returns the operand data at \p OpIdx and \p Lane.
2037 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2038 return OpsVec[OpIdx][Lane];
2039 }
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2042 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2043 return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// Clears the used flag for all entries.
2047 void clearUsed() {
2048 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2049 OpIdx != NumOperands; ++OpIdx)
2050 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2051 ++Lane)
2052 OpsVec[OpIdx][Lane].IsUsed = false;
2053 }
2054
2055 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2056 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2057 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2058 }
2059
2060 /// \param Lane lane of the operands under analysis.
2061 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2062 /// the best candidate.
2063 /// \param Idx operand index of the current candidate value.
2064 /// \returns The additional score due to possible broadcasting of the
2065 /// elements in the lane. It is more profitable to have a power-of-2 number
2066 /// of unique elements in the lane, since it will be vectorized with higher
2067 /// probability after removing duplicates. Currently the SLP vectorizer
2068 /// supports only vectorization of a power-of-2 number of unique scalars.
2069 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2070 const SmallBitVector &UsedLanes) const {
2071 Value *IdxLaneV = getData(Idx, Lane).V;
2072 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2073 isa<ExtractElementInst>(IdxLaneV))
2074 return 0;
2075 SmallDenseMap<Value *, unsigned, 4> Uniques;
2076 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2077 if (Ln == Lane)
2078 continue;
2079 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2080 if (!isa<Instruction>(OpIdxLnV))
2081 return 0;
2082 Uniques.try_emplace(OpIdxLnV, Ln);
2083 }
2084 unsigned UniquesCount = Uniques.size();
2085 auto IdxIt = Uniques.find(IdxLaneV);
2086 unsigned UniquesCntWithIdxLaneV =
2087 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2088 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2089 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2090 unsigned UniquesCntWithOpIdxLaneV =
2091 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2092 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2093 return 0;
2094 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2095 UniquesCntWithOpIdxLaneV,
2096 UniquesCntWithOpIdxLaneV -
2097 bit_floor(UniquesCntWithOpIdxLaneV)) -
2098 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2099 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2100 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2101 }
2102
2103 /// \param Lane lane of the operands under analysis.
2104 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2105 /// the best candidate.
2106 /// \param Idx operand index of the current candidate value.
2107 /// \returns The additional score for the scalar whose users are all
2108 /// vectorized.
2109 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2110 Value *IdxLaneV = getData(Idx, Lane).V;
2111 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2112 // Do not care about number of uses for vector-like instructions
2113 // (extractelement/extractvalue with constant indices), they are extracts
2114 // themselves and already externally used. Vectorization of such
2115 // instructions does not add extra extractelement instruction, just may
2116 // remove it.
2117 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2118 isVectorLikeInstWithConstOps(OpIdxLaneV))
2119 return LookAheadHeuristics::ScoreAllUserVectorized;
2120 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2121 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2122 return 0;
2123 return R.areAllUsersVectorized(IdxLaneI)
2124 ? LookAheadHeuristics::ScoreAllUserVectorized
2125 : 0;
2126 }
2127
2128 /// Score scaling factor for fully compatible instructions but with
2129 /// different number of external uses. Allows better selection of the
2130 /// instructions with less external uses.
2131 static const int ScoreScaleFactor = 10;
2132
2133 /// \Returns the look-ahead score, which tells us how much the sub-trees
2134 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2135 /// score. This helps break ties in an informed way when we cannot decide on
2136 /// the order of the operands by just considering the immediate
2137 /// predecessors.
2138 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2139 int Lane, unsigned OpIdx, unsigned Idx,
2140 bool &IsUsed, const SmallBitVector &UsedLanes) {
2141 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2142 LookAheadMaxDepth);
2143 // Keep track of the instruction stack as we recurse into the operands
2144 // during the look-ahead score exploration.
2145 int Score =
2146 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2147 /*CurrLevel=*/1, MainAltOps);
2148 if (Score) {
2149 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2150 if (Score <= -SplatScore) {
2151 // Failed score.
2152 Score = 0;
2153 } else {
2154 Score += SplatScore;
2155 // Scale score to see the difference between different operands
2156 // and similar operands but all vectorized/not all vectorized
2157 // uses. It does not affect actual selection of the best
2158 // compatible operand in general, just allows to select the
2159 // operand with all vectorized uses.
2160 Score *= ScoreScaleFactor;
2161 Score += getExternalUseScore(Lane, OpIdx, Idx);
2162 IsUsed = true;
2163 }
2164 }
2165 return Score;
2166 }
2167
2168 /// Best defined scores per lanes between the passes. Used to choose the
2169 /// best operand (with the highest score) between the passes.
2170 /// The key - {Operand Index, Lane}.
2171 /// The value - the best score between the passes for the lane and the
2172 /// operand.
2173 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2174 BestScoresPerLanes;
2175
2176 // Search all operands in Ops[*][Lane] for the one that matches best
2177 // Ops[OpIdx][LastLane] and return its operand index.
2178 // If no good match can be found, return std::nullopt.
2179 std::optional<unsigned>
2180 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2181 ArrayRef<ReorderingMode> ReorderingModes,
2182 ArrayRef<Value *> MainAltOps,
2183 const SmallBitVector &UsedLanes) {
2184 unsigned NumOperands = getNumOperands();
2185
2186 // The operand of the previous lane at OpIdx.
2187 Value *OpLastLane = getData(OpIdx, LastLane).V;
2188
2189 // Our strategy mode for OpIdx.
2190 ReorderingMode RMode = ReorderingModes[OpIdx];
2191 if (RMode == ReorderingMode::Failed)
2192 return std::nullopt;
2193
2194 // The linearized opcode of the operand at OpIdx, Lane.
2195 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2196
2197 // The best operand index and its score.
2198 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2199 // are using the score to differentiate between the two.
2200 struct BestOpData {
2201 std::optional<unsigned> Idx;
2202 unsigned Score = 0;
2203 } BestOp;
2204 BestOp.Score =
2205 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2206 .first->second;
2207
2208 // Track if the operand must be marked as used. If the operand is set to
2209 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2210 // we may want to re-estimate the operands again on the following iterations.
2211 bool IsUsed = RMode == ReorderingMode::Splat ||
2212 RMode == ReorderingMode::Constant ||
2213 RMode == ReorderingMode::Load;
2214 // Iterate through all unused operands and look for the best.
2215 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2216 // Get the operand at Idx and Lane.
2217 OperandData &OpData = getData(Idx, Lane);
2218 Value *Op = OpData.V;
2219 bool OpAPO = OpData.APO;
2220
2221 // Skip already selected operands.
2222 if (OpData.IsUsed)
2223 continue;
2224
2225 // Skip if we are trying to move the operand to a position with a
2226 // different opcode in the linearized tree form. This would break the
2227 // semantics.
2228 if (OpAPO != OpIdxAPO)
2229 continue;
2230
2231 // Look for an operand that matches the current mode.
2232 switch (RMode) {
2233 case ReorderingMode::Load:
2234 case ReorderingMode::Opcode: {
2235 bool LeftToRight = Lane > LastLane;
2236 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2237 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2238 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2239 OpIdx, Idx, IsUsed, UsedLanes);
2240 if (Score > static_cast<int>(BestOp.Score) ||
2241 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2242 Idx == OpIdx)) {
2243 BestOp.Idx = Idx;
2244 BestOp.Score = Score;
2245 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2246 }
2247 break;
2248 }
2249 case ReorderingMode::Constant:
2250 if (isa<Constant>(Op) ||
2251 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2252 BestOp.Idx = Idx;
2253 if (isa<Constant>(Op)) {
2254 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2255 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2256 LookAheadHeuristics::ScoreConstants;
2257 }
2258 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2259 IsUsed = false;
2260 }
2261 break;
2262 case ReorderingMode::Splat:
2263 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2264 IsUsed = Op == OpLastLane;
2265 if (Op == OpLastLane) {
2266 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2267 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2268 LookAheadHeuristics::ScoreSplat;
2269 }
2270 BestOp.Idx = Idx;
2271 }
2272 break;
2273 case ReorderingMode::Failed:
2274 llvm_unreachable("Not expected Failed reordering mode.");
2275 }
2276 }
2277
2278 if (BestOp.Idx) {
2279 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2280 return BestOp.Idx;
2281 }
2282 // If we could not find a good match return std::nullopt.
2283 return std::nullopt;
2284 }
2285
2286 /// Helper for reorder().
2287 /// \returns the lane that we should start reordering from. This is the one
2288 /// which has the least number of operands that can freely move about, or is
2289 /// the least profitable because it already has the most optimal set of operands.
2290 unsigned getBestLaneToStartReordering() const {
2291 unsigned Min = UINT_MAX;
2292 unsigned SameOpNumber = 0;
2293 // std::pair<unsigned, unsigned> is used to implement a simple voting
2294 // algorithm and choose the lane with the least number of operands that
2295 // can freely move about, or is less profitable because it already has the
2296 // most optimal set of operands. The first unsigned is a counter for
2297 // voting, the second unsigned is the counter of lanes with instructions
2298 // with same/alternate opcodes and same parent basic block.
2299 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2300 // Try to be closer to the original results, if we have multiple lanes
2301 // with same cost. If 2 lanes have the same cost, use the one with the
2302 // highest index.
2303 for (int I = getNumLanes(); I > 0; --I) {
2304 unsigned Lane = I - 1;
2305 OperandsOrderData NumFreeOpsHash =
2306 getMaxNumOperandsThatCanBeReordered(Lane);
2307 // Compare the number of operands that can move and choose the one with
2308 // the least number.
2309 if (NumFreeOpsHash.NumOfAPOs < Min) {
2310 Min = NumFreeOpsHash.NumOfAPOs;
2311 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2312 HashMap.clear();
2313 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2314 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2315 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2316 // Select the most optimal lane in terms of number of operands that
2317 // should be moved around.
2318 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2319 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2320 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2321 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2322 auto [It, Inserted] =
2323 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2324 if (!Inserted)
2325 ++It->second.first;
2326 }
2327 }
2328 // Select the lane with the minimum counter.
2329 unsigned BestLane = 0;
2330 unsigned CntMin = UINT_MAX;
2331 for (const auto &Data : reverse(HashMap)) {
2332 if (Data.second.first < CntMin) {
2333 CntMin = Data.second.first;
2334 BestLane = Data.second.second;
2335 }
2336 }
2337 return BestLane;
2338 }
2339
2340 /// Data structure that helps to reorder operands.
2341 struct OperandsOrderData {
2342 /// The best number of operands with the same APOs, which can be
2343 /// reordered.
2344 unsigned NumOfAPOs = UINT_MAX;
2345 /// Number of operands with the same/alternate instruction opcode and
2346 /// parent.
2347 unsigned NumOpsWithSameOpcodeParent = 0;
2348 /// Hash for the actual operands ordering.
2349 /// Used to count operands, actually their position id and opcode
2350 /// value. It is used in the voting mechanism to find the lane with the
2351 /// least number of operands that can freely move about, or is less profitable
2352 /// because it already has the most optimal set of operands. Can be
2353 /// replaced with SmallVector<unsigned> instead but hash code is faster
2354 /// and requires less memory.
2355 unsigned Hash = 0;
2356 };
2357 /// \returns the maximum number of operands that are allowed to be reordered
2358 /// for \p Lane and the number of compatible instructions (with the same
2359 /// parent/opcode). This is used as a heuristic for selecting the first lane
2360 /// to start operand reordering.
2361 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2362 unsigned CntTrue = 0;
2363 unsigned NumOperands = getNumOperands();
2364 // Operands with the same APO can be reordered. We therefore need to count
2365 // how many of them we have for each APO, like this: Cnt[APO] = x.
2366 // Since we only have two APOs, namely true and false, we can avoid using
2367 // a map. Instead we can simply count the number of operands that
2368 // correspond to one of them (in this case the 'true' APO), and calculate
2369 // the other by subtracting it from the total number of operands.
2370 // Operands with the same instruction opcode and parent are more
2371 // profitable since we don't need to move them in many cases, with a high
2372 // probability such lane already can be vectorized effectively.
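 // For example (illustrative): a lane holding 'sub %a, %b' has operand APOs
 // {false, true}, so NumOfAPOs = max(1, 1) = 1, while a lane holding
 // 'add %a, %b' has APOs {false, false} and NumOfAPOs = max(0, 2) = 2; the
 // sub lane therefore wins the "fewest freely movable operands" vote and is
 // used as the starting lane.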
2373 bool AllUndefs = true;
2374 unsigned NumOpsWithSameOpcodeParent = 0;
2375 Instruction *OpcodeI = nullptr;
2376 BasicBlock *Parent = nullptr;
2377 unsigned Hash = 0;
2378 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2379 const OperandData &OpData = getData(OpIdx, Lane);
2380 if (OpData.APO)
2381 ++CntTrue;
2382 // Use Boyer-Moore majority voting for finding the majority opcode and
2383 // the number of times it occurs.
2384 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2385 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2386 I->getParent() != Parent) {
2387 if (NumOpsWithSameOpcodeParent == 0) {
2388 NumOpsWithSameOpcodeParent = 1;
2389 OpcodeI = I;
2390 Parent = I->getParent();
2391 } else {
2392 --NumOpsWithSameOpcodeParent;
2393 }
2394 } else {
2395 ++NumOpsWithSameOpcodeParent;
2396 }
2397 }
2398 Hash = hash_combine(
2399 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2400 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2401 }
2402 if (AllUndefs)
2403 return {};
2404 OperandsOrderData Data;
2405 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2406 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2407 Data.Hash = Hash;
2408 return Data;
2409 }
2410
2411 /// Go through the instructions in VL and append their operands.
2412 void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
2413 assert(!VL.empty() && "Bad VL");
2414 assert((empty() || VL.size() == getNumLanes()) &&
2415 "Expected same number of lanes");
2416 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2417 // arguments to the intrinsic produces the same result.
2418 constexpr unsigned IntrinsicNumOperands = 2;
2419 unsigned NumOperands = VL0->getNumOperands();
2420 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2421 OpsVec.resize(NumOperands);
2422 unsigned NumLanes = VL.size();
2423 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2424 OpsVec[OpIdx].resize(NumLanes);
2425 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2426 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2427 "Expected instruction or poison value");
2428 // Our tree has just 3 nodes: the root and two operands.
2429 // It is therefore trivial to get the APO. We only need to check the
2430 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2431 // RHS operand. The LHS operand of both add and sub is never attached
2432 // to an inverse operation in the linearized form, therefore its APO
2433 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2434
2435 // Since operand reordering is performed on groups of commutative
2436 // operations or alternating sequences (e.g., +, -), we can safely
2437 // tell the inverse operations by checking commutativity.
2438 if (isa<PoisonValue>(VL[Lane])) {
2439 OpsVec[OpIdx][Lane] = {
2440 PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
2441 false};
2442 continue;
2443 }
2444 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2445 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2446 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2447 APO, false};
2448 }
2449 }
2450 }
2451
2452 /// \returns the number of operands.
2453 unsigned getNumOperands() const { return ArgSize; }
2454
2455 /// \returns the number of lanes.
2456 unsigned getNumLanes() const { return OpsVec[0].size(); }
2457
2458 /// \returns the operand value at \p OpIdx and \p Lane.
2459 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2460 return getData(OpIdx, Lane).V;
2461 }
2462
2463 /// \returns true if the data structure is empty.
2464 bool empty() const { return OpsVec.empty(); }
2465
2466 /// Clears the data.
2467 void clear() { OpsVec.clear(); }
2468
2469 /// \Returns true if there are enough operands identical to \p Op to fill
2470 /// the whole vector (possibly mixed with constants or loop-invariant values).
2471 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
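 /// For instance, for the hypothetical operand column {%x, 7, %x, 5} across
 /// four lanes, %x is found again in another lane and the remaining lanes hold
 /// constants that can be blended in, so broadcasting %x is considered
 /// worthwhile.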
2472 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2473 assert(Op == getValue(OpIdx, Lane) &&
2474 "Op is expected to be getValue(OpIdx, Lane).");
2475 // Small number of loads - try load matching.
2476 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2477 return false;
2478 bool OpAPO = getData(OpIdx, Lane).APO;
2479 bool IsInvariant = L && L->isLoopInvariant(Op);
2480 unsigned Cnt = 0;
2481 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2482 if (Ln == Lane)
2483 continue;
2484 // This is set to true if we found a candidate for broadcast at Lane.
2485 bool FoundCandidate = false;
2486 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2487 OperandData &Data = getData(OpI, Ln);
2488 if (Data.APO != OpAPO || Data.IsUsed)
2489 continue;
2490 Value *OpILane = getValue(OpI, Lane);
2491 bool IsConstantOp = isa<Constant>(OpILane);
2492 // Consider the broadcast candidate if:
2493 // 1. Same value is found in one of the operands.
2494 if (Data.V == Op ||
2495 // 2. The operand in the given lane is not constant but there is a
2496 // constant operand in another lane (which can be moved to the
2497 // given lane). In this case we can represent it as a simple
2498 // permutation of constant and broadcast.
2499 (!IsConstantOp &&
2500 ((Lns > 2 && isa<Constant>(Data.V)) ||
2501 // 2.1. If we have only 2 lanes, need to check that value in the
2502 // next lane does not build same opcode sequence.
2503 (Lns == 2 &&
2504 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2505 .getOpcode() &&
2506 isa<Constant>(Data.V)))) ||
2507 // 3. The operand in the current lane is loop invariant (can be
2508 // hoisted out) and another operand is also a loop invariant
2509 // (though not a constant). In this case the whole vector can be
2510 // hoisted out.
2511 // FIXME: need to teach the cost model about this case for better
2512 // estimation.
2513 (IsInvariant && !isa<Constant>(Data.V) &&
2514 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2515 L->isLoopInvariant(Data.V))) {
2516 FoundCandidate = true;
2517 Data.IsUsed = Data.V == Op;
2518 if (Data.V == Op)
2519 ++Cnt;
2520 break;
2521 }
2522 }
2523 if (!FoundCandidate)
2524 return false;
2525 }
2526 return getNumLanes() == 2 || Cnt > 1;
2527 }
2528
2529 /// Checks if there is at least single compatible operand in lanes other
2530 /// than \p Lane, compatible with the operand \p Op.
2531 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2532 assert(Op == getValue(OpIdx, Lane) &&
2533 "Op is expected to be getValue(OpIdx, Lane).");
2534 bool OpAPO = getData(OpIdx, Lane).APO;
2535 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2536 if (Ln == Lane)
2537 continue;
2538 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2539 const OperandData &Data = getData(OpI, Ln);
2540 if (Data.APO != OpAPO || Data.IsUsed)
2541 return true;
2542 Value *OpILn = getValue(OpI, Ln);
2543 return (L && L->isLoopInvariant(OpILn)) ||
2544 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2545 allSameBlock({Op, OpILn}));
2546 }))
2547 return true;
2548 }
2549 return false;
2550 }
2551
2552 public:
2553 /// Initialize with all the operands of the instruction vector \p RootVL.
2554 VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
2555 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2556 L(R.LI->getLoopFor((VL0->getParent()))) {
2557 // Append all the operands of RootVL.
2558 appendOperandsOfVL(RootVL, VL0);
2559 }
2560
2561 /// \Returns a value vector with the operands across all lanes for the
2562 /// operand at \p OpIdx.
2563 ValueList getVL(unsigned OpIdx) const {
2564 ValueList OpVL(OpsVec[OpIdx].size());
2565 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2566 "Expected same num of lanes across all operands");
2567 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2568 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2569 return OpVL;
2570 }
2571
2572 // Performs operand reordering for 2 or more operands.
2573 // The original operands are in OpsVec[OpIdx][Lane].
2574 // The operands are reordered in place, lane by lane.
2575 void reorder() {
2576 unsigned NumOperands = getNumOperands();
2577 unsigned NumLanes = getNumLanes();
2578 // Each operand has its own mode. We are using this mode to help us select
2579 // the instructions for each lane, so that they match best with the ones
2580 // we have selected so far.
2581 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2582
2583 // This is a greedy single-pass algorithm. We are going over each lane
2584 // once and deciding on the best order right away with no back-tracking.
2585 // However, in order to increase its effectiveness, we start with the lane
2586 // that has operands that can move the least. For example, given the
2587 // following lanes:
2588 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2589 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2590 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2591 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2592 // we will start at Lane 1, since the operands of the subtraction cannot
2593 // be reordered. Then we will visit the rest of the lanes in a circular
2594 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
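 // For the example above this ideally converges to the operand vectors
 // op0 = {C[0], C[1], C[2], C[3]} and op1 = {B[0], B[1], B[2], B[3]} (the APO
 // flags remember which lanes subtract), so each operand column holds
 // matching values across all lanes.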
2595
2596 // Find the first lane that we will start our search from.
2597 unsigned FirstLane = getBestLaneToStartReordering();
2598
2599 // Initialize the modes.
2600 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2601 Value *OpLane0 = getValue(OpIdx, FirstLane);
2602 // Keep track if we have instructions with all the same opcode on one
2603 // side.
2604 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2605 // Check if OpLane0 should be broadcast.
2606 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2607 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2608 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2609 else if (isa<LoadInst>(OpILane0))
2610 ReorderingModes[OpIdx] = ReorderingMode::Load;
2611 else
2612 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2613 } else if (isa<Constant>(OpLane0)) {
2614 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2615 } else if (isa<Argument>(OpLane0)) {
2616 // Our best hope is a Splat. It may save some cost in some cases.
2617 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2618 } else {
2619 llvm_unreachable("Unexpected value kind.");
2620 }
2621 }
2622
2623 // Check that we don't have the same operands. There is no need to reorder
2624 // if the operands are just a perfect or shuffled diamond match. Do not skip
2625 // reordering for possible broadcasts or a non-power-of-2 number of scalars
2626 // (just for now).
2627 auto &&SkipReordering = [this]() {
2628 SmallPtrSet<Value *, 4> UniqueValues;
2629 ArrayRef<OperandData> Op0 = OpsVec.front();
2630 for (const OperandData &Data : Op0)
2631 UniqueValues.insert(Data.V);
2632 for (ArrayRef<OperandData> Op :
2633 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2634 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2635 return !UniqueValues.contains(Data.V);
2636 }))
2637 return false;
2638 }
2639 // TODO: Check if we can remove a check for non-power-2 number of
2640 // scalars after full support of non-power-2 vectorization.
2641 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2642 };
2643
2644 // If the initial strategy fails for any of the operand indexes, then we
2645 // perform reordering again in a second pass. This helps avoid assigning
2646 // high priority to the failed strategy, and should improve reordering for
2647 // the non-failed operand indexes.
2648 for (int Pass = 0; Pass != 2; ++Pass) {
2649 // Check if there is no need to reorder the operands since they are a
2650 // perfect or shuffled diamond match.
2651 // Need to do it to avoid extra external use cost counting for
2652 // shuffled matches, which may cause regressions.
2653 if (SkipReordering())
2654 break;
2655 // Skip the second pass if the first pass did not fail.
2656 bool StrategyFailed = false;
2657 // Mark all operand data as free to use.
2658 clearUsed();
2659 // We keep the original operand order for the FirstLane, so reorder the
2660 // rest of the lanes. We are visiting the nodes in a circular fashion,
2661 // using FirstLane as the center point and increasing the radius
2662 // distance.
2663 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2664 for (unsigned I = 0; I < NumOperands; ++I)
2665 MainAltOps[I].push_back(getData(I, FirstLane).V);
2666
2667 SmallBitVector UsedLanes(NumLanes);
2668 UsedLanes.set(FirstLane);
2669 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2670 // Visit the lane on the right and then the lane on the left.
2671 for (int Direction : {+1, -1}) {
2672 int Lane = FirstLane + Direction * Distance;
2673 if (Lane < 0 || Lane >= (int)NumLanes)
2674 continue;
2675 UsedLanes.set(Lane);
2676 int LastLane = Lane - Direction;
2677 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2678 "Out of bounds");
2679 // Look for a good match for each operand.
2680 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2681 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2682 std::optional<unsigned> BestIdx =
2683 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2684 MainAltOps[OpIdx], UsedLanes);
2685 // By not selecting a value, we allow the operands that follow to
2686 // select a better matching value. We will get a non-null value in
2687 // the next run of getBestOperand().
2688 if (BestIdx) {
2689 // Swap the current operand with the one returned by
2690 // getBestOperand().
2691 swap(OpIdx, *BestIdx, Lane);
2692 } else {
2693 // Enable the second pass.
2694 StrategyFailed = true;
2695 }
2696 // Try to get the alternate opcode and follow it during analysis.
2697 if (MainAltOps[OpIdx].size() != 2) {
2698 OperandData &AltOp = getData(OpIdx, Lane);
2699 InstructionsState OpS =
2700 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2701 if (OpS.getOpcode() && OpS.isAltShuffle())
2702 MainAltOps[OpIdx].push_back(AltOp.V);
2703 }
2704 }
2705 }
2706 }
2707 // Skip second pass if the strategy did not fail.
2708 if (!StrategyFailed)
2709 break;
2710 }
2711 }
2712
2713#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2714 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2715 switch (RMode) {
2716 case ReorderingMode::Load:
2717 return "Load";
2718 case ReorderingMode::Opcode:
2719 return "Opcode";
2720 case ReorderingMode::Constant:
2721 return "Constant";
2722 case ReorderingMode::Splat:
2723 return "Splat";
2724 case ReorderingMode::Failed:
2725 return "Failed";
2726 }
2727 llvm_unreachable("Unimplemented Reordering Type");
2728 }
2729
2730 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2731 raw_ostream &OS) {
2732 return OS << getModeStr(RMode);
2733 }
2734
2735 /// Debug print.
2736 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2737 printMode(RMode, dbgs());
2738 }
2739
2740 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2741 return printMode(RMode, OS);
2742 }
2743
2744 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2745 const unsigned Indent = 2;
2746 unsigned Cnt = 0;
2747 for (const OperandDataVec &OpDataVec : OpsVec) {
2748 OS << "Operand " << Cnt++ << "\n";
2749 for (const OperandData &OpData : OpDataVec) {
2750 OS.indent(Indent) << "{";
2751 if (Value *V = OpData.V)
2752 OS << *V;
2753 else
2754 OS << "null";
2755 OS << ", APO:" << OpData.APO << "}\n";
2756 }
2757 OS << "\n";
2758 }
2759 return OS;
2760 }
2761
2762 /// Debug print.
2763 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2764#endif
2765 };
2766
2767 /// Evaluate each pair in \p Candidates and return the index into
2768 /// \p Candidates of the pair with the highest score, deemed to have the best
2769 /// chance to form the root of a profitable tree to vectorize. Return
2770 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2771 /// \param Limit Lower limit of the cost, considered to be a good enough score.
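 /// For example (hypothetical candidates): given {(load A[0], load A[1]),
 /// (load A[0], some unrelated constant)}, index 0 is returned, since the
 /// first pair scores as consecutive loads while the second typically scores
 /// LookAheadHeuristics::ScoreFail.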
2772 std::optional<int>
2773 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2774 int Limit = LookAheadHeuristics::ScoreFail) const {
2775 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2776 RootLookAheadMaxDepth);
2777 int BestScore = Limit;
2778 std::optional<int> Index;
2779 for (int I : seq<int>(0, Candidates.size())) {
2780 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2781 Candidates[I].second,
2782 /*U1=*/nullptr, /*U2=*/nullptr,
2783 /*CurrLevel=*/1, {});
2784 if (Score > BestScore) {
2785 BestScore = Score;
2786 Index = I;
2787 }
2788 }
2789 return Index;
2790 }
2791
2792 /// Checks if the instruction is marked for deletion.
2793 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2794
2795 /// Removes an instruction from its block and eventually deletes it.
2796 /// It's like Instruction::eraseFromParent() except that the actual deletion
2797 /// is delayed until BoUpSLP is destructed.
2798 void eraseInstruction(Instruction *I) {
2799 DeletedInstructions.insert(I);
2800 }
2801
2802 /// Remove instructions from the parent function and clear the operands of \p
2803 /// DeadVals instructions, marking for deletion trivially dead operands.
2804 template <typename T>
2805 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2806 SmallVector<WeakTrackingVH> DeadInsts;
2807 for (T *V : DeadVals) {
2808 auto *I = cast<Instruction>(V);
2809 DeletedInstructions.insert(I);
2810 }
2811 DenseSet<Value *> Processed;
2812 for (T *V : DeadVals) {
2813 if (!V || !Processed.insert(V).second)
2814 continue;
2815 auto *I = cast<Instruction>(V);
2816 salvageDebugInfo(*I);
2817 SmallVector<const TreeEntry *> Entries;
2818 if (const TreeEntry *Entry = getTreeEntry(I)) {
2819 Entries.push_back(Entry);
2820 auto It = MultiNodeScalars.find(I);
2821 if (It != MultiNodeScalars.end())
2822 Entries.append(It->second.begin(), It->second.end());
2823 }
2824 for (Use &U : I->operands()) {
2825 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2826 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2827 wouldInstructionBeTriviallyDead(OpI, TLI) &&
2828 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2829 return Entry->VectorizedValue == OpI;
2830 })))
2831 DeadInsts.push_back(OpI);
2832 }
2833 I->dropAllReferences();
2834 }
2835 for (T *V : DeadVals) {
2836 auto *I = cast<Instruction>(V);
2837 if (!I->getParent())
2838 continue;
2839 assert((I->use_empty() || all_of(I->uses(),
2840 [&](Use &U) {
2841 return isDeleted(
2842 cast<Instruction>(U.getUser()));
2843 })) &&
2844 "trying to erase instruction with users.");
2845 I->removeFromParent();
2846 SE->forgetValue(I);
2847 }
2848 // Process the dead instruction list until empty.
2849 while (!DeadInsts.empty()) {
2850 Value *V = DeadInsts.pop_back_val();
2851 Instruction *VI = cast_or_null<Instruction>(V);
2852 if (!VI || !VI->getParent())
2853 continue;
2855 "Live instruction found in dead worklist!");
2856 assert(VI->use_empty() && "Instructions with uses are not dead.");
2857
2858 // Don't lose the debug info while deleting the instructions.
2859 salvageDebugInfo(*VI);
2860
2861 // Null out all of the instruction's operands to see if any operand
2862 // becomes dead as we go.
2863 for (Use &OpU : VI->operands()) {
2864 Value *OpV = OpU.get();
2865 if (!OpV)
2866 continue;
2867 OpU.set(nullptr);
2868
2869 if (!OpV->use_empty())
2870 continue;
2871
2872 // If the operand is an instruction that became dead as we nulled out
2873 // the operand, and if it is 'trivially' dead, delete it in a future
2874 // loop iteration.
2875 if (auto *OpI = dyn_cast<Instruction>(OpV))
2876 if (!DeletedInstructions.contains(OpI) &&
2877 isInstructionTriviallyDead(OpI, TLI))
2878 DeadInsts.push_back(OpI);
2879 }
2880
2881 VI->removeFromParent();
2882 DeletedInstructions.insert(VI);
2883 SE->forgetValue(VI);
2884 }
2885 }
2886
2887 /// Checks if the instruction was already analyzed for being possible
2888 /// reduction root.
2889 bool isAnalyzedReductionRoot(Instruction *I) const {
2890 return AnalyzedReductionsRoots.count(I);
2891 }
2892 /// Register given instruction as already analyzed for being possible
2893 /// reduction root.
2894 void analyzedReductionRoot(Instruction *I) {
2895 AnalyzedReductionsRoots.insert(I);
2896 }
2897 /// Checks if the provided list of reduced values was checked already for
2898 /// vectorization.
2899 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2900 return AnalyzedReductionVals.contains(hash_value(VL));
2901 }
2902 /// Adds the list of reduced values to list of already checked values for the
2903 /// vectorization.
2904 void analyzedReductionVals(ArrayRef<Value *> VL) {
2905 AnalyzedReductionVals.insert(hash_value(VL));
2906 }
2907 /// Clear the list of the analyzed reduction root instructions.
2908 void clearReductionData() {
2909 AnalyzedReductionsRoots.clear();
2910 AnalyzedReductionVals.clear();
2911 AnalyzedMinBWVals.clear();
2912 }
2913 /// Checks if the given value is gathered in one of the nodes.
2914 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2915 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2916 }
2917 /// Checks if the given value is gathered in one of the nodes.
2918 bool isGathered(const Value *V) const {
2919 return MustGather.contains(V);
2920 }
2921 /// Checks if the specified value was not scheduled.
2922 bool isNotScheduled(const Value *V) const {
2923 return NonScheduledFirst.contains(V);
2924 }
2925
2926 /// Check if the value is vectorized in the tree.
2927 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2928
2929 ~BoUpSLP();
2930
2931private:
2932 /// Determine if a node \p E can be demoted to a smaller type with a
2933 /// truncation. We collect the entries that will be demoted in ToDemote.
2934 /// \param E Node for analysis
2935 /// \param ToDemote indices of the nodes to be demoted.
2936 bool collectValuesToDemote(
2937 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2938 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
2939 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2940 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2941
2942 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2943 /// reordering (i.e. the operands can be reordered because they have only one
2944 /// user and are reorderable).
2945 /// \param ReorderableGathers List of all gather nodes that require reordering
2946 /// (e.g., gathers of extractelements or partially vectorizable loads).
2947 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2948 /// reordering, subset of \p NonVectorized.
2949 bool
2950 canReorderOperands(TreeEntry *UserTE,
2951 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2952 ArrayRef<TreeEntry *> ReorderableGathers,
2953 SmallVectorImpl<TreeEntry *> &GatherOps);
2954
2955 /// Checks if the given \p TE is a gather node with clustered reused scalars
2956 /// and reorders it per given \p Mask.
2957 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2958
2959 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2960 /// if any. If it is not vectorized (gather node), returns nullptr.
2961 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2962 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2963 TreeEntry *TE = nullptr;
2964 const auto *It = find_if(VL, [&](Value *V) {
2965 TE = getTreeEntry(V);
2966 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2967 return true;
2968 auto It = MultiNodeScalars.find(V);
2969 if (It != MultiNodeScalars.end()) {
2970 for (TreeEntry *E : It->second) {
2971 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2972 TE = E;
2973 return true;
2974 }
2975 }
2976 }
2977 return false;
2978 });
2979 if (It != VL.end()) {
2980 assert(TE->isSame(VL) && "Expected same scalars.");
2981 return TE;
2982 }
2983 return nullptr;
2984 }
2985
2986 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2987 /// if any. If it is not vectorized (gather node), returns nullptr.
2988 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2989 unsigned OpIdx) const {
2990 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2991 const_cast<TreeEntry *>(UserTE), OpIdx);
2992 }
2993
2994 /// Checks if all users of \p I are the part of the vectorization tree.
2995 bool areAllUsersVectorized(
2996 Instruction *I,
2997 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2998
2999 /// Return information about the vector formed for the specified index
3000 /// of a vector of (the same) instruction.
3001 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3002
3003 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3004 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3005
3006 /// Gets the root instruction for the given node. If the node is a strided
3007 /// load/store node with the reverse order, the root instruction is the last
3008 /// one.
3009 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3010
3011 /// \returns Cast context for the given graph node.
3012 TargetTransformInfo::CastContextHint
3013 getCastContextHint(const TreeEntry &TE) const;
3014
3015 /// \returns the cost of the vectorizable entry.
3016 InstructionCost getEntryCost(const TreeEntry *E,
3017 ArrayRef<Value *> VectorizedVals,
3018 SmallPtrSetImpl<Value *> &CheckedExtracts);
3019
3020 /// This is the recursive part of buildTree.
3021 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3022 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3023
3024 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3025 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3026 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3027 /// returns false, setting \p CurrentOrder to either an empty vector or a
3028 /// non-identity permutation that allows reusing extract instructions.
3029 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3030 /// extract order.
3031 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3032 SmallVectorImpl<unsigned> &CurrentOrder,
3033 bool ResizeAllowed = false) const;
3034
3035 /// Vectorize a single entry in the tree.
3036 /// \param PostponedPHIs true if the emission of phi nodes needs to be
3037 /// postponed to avoid issues with def-use order.
3038 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3039
3040 /// Returns vectorized operand node, that matches the order of the scalars
3041 /// operand number \p NodeIdx in entry \p E.
3042 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3043 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3044 unsigned NodeIdx) const {
3045 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3046 }
3047
3048 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3049 /// \p E.
3050 /// \param PostponedPHIs true if the emission of phi nodes needs to be
3051 /// postponed to avoid issues with def-use order.
3052 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3053
3054 /// Create a new vector from a list of scalar values. Produces a sequence
3055 /// which exploits values reused across lanes, and arranges the inserts
3056 /// for ease of later optimization.
3057 template <typename BVTy, typename ResTy, typename... Args>
3058 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3059
3060 /// Create a new vector from a list of scalar values. Produces a sequence
3061 /// which exploits values reused across lanes, and arranges the inserts
3062 /// for ease of later optimization.
3063 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3064 bool PostponedPHIs);
3065
3066 /// Returns the instruction in the bundle, which can be used as a base point
3067 /// for scheduling. Usually it is the last instruction in the bundle, except
3068 /// for the case when all operands are external (in this case, it is the first
3069 /// instruction in the list).
3070 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3071
3072 /// Tries to find extractelement instructions with constant indices from a fixed
3073 /// vector type and gathers such instructions into a group, which is highly
3074 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3075 /// was successful, the matched scalars are replaced by poison values in \p VL
3076 /// for future analysis.
3077 std::optional<TargetTransformInfo::ShuffleKind>
3078 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3079 SmallVectorImpl<int> &Mask) const;
3080
3081 /// Tries to find extractelement instructions with constant indices from a fixed
3082 /// vector type and gathers such instructions into a group, which is highly
3083 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3084 /// was successful, the matched scalars are replaced by poison values in \p VL
3085 /// for future analysis.
3086 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3087 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3088 SmallVectorImpl<int> &Mask,
3089 unsigned NumParts) const;
3090
3091 /// Checks if the gathered \p VL can be represented as a single register
3092 /// shuffle(s) of previous tree entries.
3093 /// \param TE Tree entry checked for permutation.
3094 /// \param VL List of scalars (a subset of the TE scalars), checked for
3095 /// permutations. Must form a single-register vector.
3096 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3097 /// commands to build the mask using the original vector value, without
3098 /// relying on the potential reordering.
3099 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3100 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3101 std::optional<TargetTransformInfo::ShuffleKind>
3102 isGatherShuffledSingleRegisterEntry(
3103 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3104 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3105 bool ForOrder);
3106
3107 /// Checks if the gathered \p VL can be represented as multi-register
3108 /// shuffle(s) of previous tree entries.
3109 /// \param TE Tree entry checked for permutation.
3110 /// \param VL List of scalars (a subset of the TE scalars), checked for
3111 /// permutations.
3112 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3113 /// commands to build the mask using the original vector value, without
3114 /// relying on the potential reordering.
3115 /// \returns per-register series of ShuffleKind, if gathered values can be
3116 /// represented as shuffles of previous tree entries. \p Mask is filled with
3117 /// the shuffle mask (also on per-register base).
3118 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3119 isGatherShuffledEntry(
3120 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3121 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3122 unsigned NumParts, bool ForOrder = false);
3123
3124 /// \returns the cost of gathering (inserting) the values in \p VL into a
3125 /// vector.
3126 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3127 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3128 Type *ScalarTy) const;
3129
3130 /// Set the Builder insert point to one after the last instruction in
3131 /// the bundle
3132 void setInsertPointAfterBundle(const TreeEntry *E);
3133
3134 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3135 /// specified, the starting vector value is poison.
3136 Value *
3137 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3138 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3139
3140 /// \returns whether the VectorizableTree is fully vectorizable and will
3141 /// be beneficial even if the tree height is tiny.
3142 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3143
3144 /// Run through the list of all gathered loads in the graph and try to find
3145 /// vector loads/masked gathers instead of regular gathers. Later these loads
3146 /// are reshuffled to build the final gathered nodes.
3147 void tryToVectorizeGatheredLoads(
3148 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3149 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3150 8> &GatheredLoads);
3151
3152 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3153 /// users of \p TE and collects the stores. It returns the map from the store
3154 /// pointers to the collected stores.
3155 DenseMap<Value *, SmallVector<StoreInst *>>
3156 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3157
3158 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3159 /// stores in \p StoresVec can form a vector instruction. If so it returns
3160 /// true and populates \p ReorderIndices with the shuffle indices of the
3161 /// stores when compared to the sorted vector.
3162 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3163 OrdersType &ReorderIndices) const;
3164
3165 /// Iterates through the users of \p TE, looking for scalar stores that can be
3166 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3167 /// their order and builds an order index vector for each store bundle. It
3168 /// returns all these order vectors found.
3169 /// We run this after the tree has formed, otherwise we may come across user
3170 /// instructions that are not yet in the tree.
3171 SmallVector<OrdersType, 1>
3172 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3173
3174 /// Tries to reorder the gathering node for better vectorization
3175 /// opportunities.
3176 void reorderGatherNode(TreeEntry &TE);
3177
3178 struct TreeEntry {
3179 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3180 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3181
3182 /// \returns Common mask for reorder indices and reused scalars.
3183 SmallVector<int> getCommonMask() const {
3184 SmallVector<int> Mask;
3185 inversePermutation(ReorderIndices, Mask);
3186 ::addMask(Mask, ReuseShuffleIndices);
3187 return Mask;
3188 }
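    // Editor's note: the following sketch is illustrative only and is not part
    // of SLPVectorizer.cpp. It mirrors the mask composition above with plain
    // std:: types, assuming inversePermutation() computes Mask[Order[I]] = I
    // and ::addMask() composes Composed[I] = Mask[Sub[I]] (poison entries are
    // skipped); the helper names invert()/compose() are made up.
    //
    //   #include <vector>
    //   std::vector<int> invert(const std::vector<int> &Order) {
    //     std::vector<int> Mask(Order.size(), -1); // -1 ~ PoisonMaskElem
    //     for (int I = 0, E = (int)Order.size(); I < E; ++I)
    //       Mask[Order[I]] = I;
    //     return Mask;
    //   }
    //   std::vector<int> compose(const std::vector<int> &Mask,
    //                            const std::vector<int> &Sub) {
    //     std::vector<int> Out(Sub.size(), -1);
    //     for (int I = 0, E = (int)Sub.size(); I < E; ++I)
    //       if (Sub[I] >= 0 && Mask[Sub[I]] >= 0)
    //         Out[I] = Mask[Sub[I]];
    //     return Out;
    //   }
    //   // invert({2, 0, 1})               == {1, 2, 0}
    //   // compose({1, 2, 0}, {0, 0, 1, 2}) == {1, 1, 2, 0}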
3189
3190 /// \returns true if the scalars in VL are equal to this entry.
3191 bool isSame(ArrayRef<Value *> VL) const {
3192 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3193 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3194 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3195 return VL.size() == Mask.size() &&
3196 std::equal(VL.begin(), VL.end(), Mask.begin(),
3197 [Scalars](Value *V, int Idx) {
3198 return (isa<UndefValue>(V) &&
3199 Idx == PoisonMaskElem) ||
3200 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3201 });
3202 };
3203 if (!ReorderIndices.empty()) {
3204 // TODO: implement matching if the nodes are just reordered, still can
3205 // treat the vector as the same if the list of scalars matches VL
3206 // directly, without reordering.
3207 SmallVector<int> Mask;
3208 inversePermutation(ReorderIndices, Mask);
3209 if (VL.size() == Scalars.size())
3210 return IsSame(Scalars, Mask);
3211 if (VL.size() == ReuseShuffleIndices.size()) {
3212 ::addMask(Mask, ReuseShuffleIndices);
3213 return IsSame(Scalars, Mask);
3214 }
3215 return false;
3216 }
3217 return IsSame(Scalars, ReuseShuffleIndices);
3218 }
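    // Editor's note: a worked example, not from the original source. With
    // hypothetical scalars A and B, an entry with Scalars = {A, B} and
    // ReuseShuffleIndices = {0, 1, 0, 1} reports isSame({A, B, A, B}) == true,
    // because each VL[I] equals Scalars[Mask[I]] (undef scalars may also match
    // poison mask elements). A simplified stand-alone check in plain C++:
    //
    //   #include <vector>
    //   bool sameUnderMask(const std::vector<const void *> &VL,
    //                      const std::vector<const void *> &Scalars,
    //                      const std::vector<int> &Mask) {
    //     if (VL.size() != Mask.size())
    //       return false;
    //     for (size_t I = 0; I < VL.size(); ++I)
    //       if (Mask[I] < 0 || VL[I] != Scalars[Mask[I]])
    //         return false;
    //     return true;
    //   }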
3219
3220 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3221 return isGather() && !UserTreeIndices.empty() &&
3222 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3223 UserTreeIndices.front().UserTE == UserEI.UserTE;
3224 }
3225
3226 /// \returns true if current entry has same operands as \p TE.
3227 bool hasEqualOperands(const TreeEntry &TE) const {
3228 if (TE.getNumOperands() != getNumOperands())
3229 return false;
3230 SmallBitVector Used(getNumOperands());
3231 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3232 unsigned PrevCount = Used.count();
3233 for (unsigned K = 0; K < E; ++K) {
3234 if (Used.test(K))
3235 continue;
3236 if (getOperand(K) == TE.getOperand(I)) {
3237 Used.set(K);
3238 break;
3239 }
3240 }
3241 // Check if we actually found the matching operand.
3242 if (PrevCount == Used.count())
3243 return false;
3244 }
3245 return true;
3246 }
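    // Editor's note (illustrative, not from the original source): because each
    // operand of \p TE only has to match some not-yet-used operand of this
    // entry, hasEqualOperands() is order-insensitive. For hypothetical operand
    // lists, an entry with operands {X, Y} and another with {Y, X} compare
    // equal, while {X, X} and {X, Y} do not.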
3247
3248 /// \return Final vectorization factor for the node. Defined by the total
3249 /// number of vectorized scalars, including those used several times in the
3250 /// entry and counted in the \a ReuseShuffleIndices, if any.
3251 unsigned getVectorFactor() const {
3252 if (!ReuseShuffleIndices.empty())
3253 return ReuseShuffleIndices.size();
3254 return Scalars.size();
3255 };
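    // Editor's note (illustrative, not from the original source): e.g., with 4
    // unique Scalars and ReuseShuffleIndices = {0, 1, 2, 3, 0, 1, 2, 3} the
    // vector factor is 8, even though only 4 distinct scalars are vectorized.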
3256
3257 /// Checks if the current node is a gather node.
3258 bool isGather() const { return State == NeedToGather; }
3259
3260 /// A vector of scalars.
3261 ValueList Scalars;
3262
3263 /// The Scalars are vectorized into this value. It is initialized to Null.
3264 WeakTrackingVH VectorizedValue = nullptr;
3265
3266 /// New vector phi instructions emitted for the vectorized phi nodes.
3267 PHINode *PHI = nullptr;
3268
3269 /// Do we need to gather this sequence or vectorize it
3270 /// (either with vector instruction or with scatter/gather
3271 /// intrinsics for store/load)?
3272 enum EntryState {
3273 Vectorize, ///< The node is regularly vectorized.
3274 ScatterVectorize, ///< Masked scatter/gather node.
3275 StridedVectorize, ///< Strided loads (and stores)
3276 NeedToGather, ///< Gather/buildvector node.
3277 CombinedVectorize, ///< Vectorized node, combined with its user into more
3278 ///< complex node like select/cmp to minmax, mul/add to
3279 ///< fma, etc. Must be used for the following nodes in
3280 ///< the pattern, not the very first one.
3281 };
3282 EntryState State;
3283
3284 /// List of combined opcodes supported by the vectorizer.
3285 enum CombinedOpcode {
3286 NotCombinedOp = -1,
3287 MinMax = Instruction::OtherOpsEnd + 1,
3288 };
3289 CombinedOpcode CombinedOp = NotCombinedOp;
3290
3291 /// Does this sequence require some shuffling?
3292 SmallVector<int, 4> ReuseShuffleIndices;
3293
3294 /// Does this entry require reordering?
3295 SmallVector<unsigned, 4> ReorderIndices;
3296
3297 /// Points back to the VectorizableTree.
3298 ///
3299 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3300 /// to be a pointer and needs to be able to initialize the child iterator.
3301 /// Thus we need a reference back to the container to translate the indices
3302 /// to entries.
3303 VecTreeTy &Container;
3304
3305 /// The TreeEntry index containing the user of this entry. We can actually
3306 /// have multiple users so the data structure is not truly a tree.
3307 SmallVector<EdgeInfo, 1> UserTreeIndices;
3308
3309 /// The index of this treeEntry in VectorizableTree.
3310 unsigned Idx = 0;
3311
3312 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3313 /// other nodes as a series of insertvector instructions.
3314 SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
3315
3316 private:
3317 /// The operands of each instruction in each lane: Operands[op_index][lane].
3318 /// Note: This helps avoid the replication of the code that performs the
3319 /// reordering of operands during buildTree_rec() and vectorizeTree().
3320 SmallVector<ValueList, 2> Operands;
3321
3322 /// The main/alternate instruction.
3323 Instruction *MainOp = nullptr;
3324 Instruction *AltOp = nullptr;
3325
3326 /// Interleaving factor for interleaved loads Vectorize nodes.
3327 unsigned InterleaveFactor = 0;
3328
3329 public:
3330 /// Returns interleave factor for interleave nodes.
3331 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3332 /// Sets interleaving factor for the interleaving nodes.
3333 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3334
3335 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3336 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3337 if (Operands.size() < OpIdx + 1)
3338 Operands.resize(OpIdx + 1);
3339 assert(Operands[OpIdx].empty() && "Already resized?");
3340 assert(OpVL.size() <= Scalars.size() &&
3341 "Number of operands is greater than the number of scalars.");
3342 Operands[OpIdx].resize(OpVL.size());
3343 copy(OpVL, Operands[OpIdx].begin());
3344 }
3345
3346 /// Set this bundle's operand from Scalars.
3347 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3348 VLOperands Ops(Scalars, MainOp, R);
3349 if (RequireReorder)
3350 Ops.reorder();
3351 for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
3352 setOperand(I, Ops.getVL(I));
3353 }
3354
3355 /// Reorders operands of the node to the given mask \p Mask.
3356 void reorderOperands(ArrayRef<int> Mask) {
3357 for (ValueList &Operand : Operands)
3358 reorderScalars(Operand, Mask);
3359 }
3360
3361 /// \returns the \p OpIdx operand of this TreeEntry.
3362 ValueList &getOperand(unsigned OpIdx) {
3363 assert(OpIdx < Operands.size() && "Off bounds");
3364 return Operands[OpIdx];
3365 }
3366
3367 /// \returns the \p OpIdx operand of this TreeEntry.
3368 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3369 assert(OpIdx < Operands.size() && "Off bounds");
3370 return Operands[OpIdx];
3371 }
3372
3373 /// \returns the number of operands.
3374 unsigned getNumOperands() const { return Operands.size(); }
3375
3376 /// \return the single \p OpIdx operand.
3377 Value *getSingleOperand(unsigned OpIdx) const {
3378 assert(OpIdx < Operands.size() && "Off bounds");
3379 assert(!Operands[OpIdx].empty() && "No operand available");
3380 return Operands[OpIdx][0];
3381 }
3382
3383 /// Some of the instructions in the list have alternate opcodes.
3384 bool isAltShuffle() const { return MainOp != AltOp; }
3385
3386 bool isOpcodeOrAlt(Instruction *I) const {
3387 unsigned CheckedOpcode = I->getOpcode();
3388 return (getOpcode() == CheckedOpcode ||
3389 getAltOpcode() == CheckedOpcode);
3390 }
3391
3392 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3393 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
3394 /// key is the main instruction (MainOp).
3395 Value *isOneOf(Value *Op) const {
3396 auto *I = dyn_cast<Instruction>(Op);
3397 if (I && isOpcodeOrAlt(I))
3398 return Op;
3399 return MainOp;
3400 }
3401
3402 void setOperations(const InstructionsState &S) {
3403 MainOp = S.getMainOp();
3404 AltOp = S.getAltOp();
3405 }
3406
3407 Instruction *getMainOp() const {
3408 return MainOp;
3409 }
3410
3411 Instruction *getAltOp() const {
3412 return AltOp;
3413 }
3414
3415 /// The main/alternate opcodes for the list of instructions.
3416 unsigned getOpcode() const {
3417 return MainOp ? MainOp->getOpcode() : 0;
3418 }
3419
3420 unsigned getAltOpcode() const {
3421 return AltOp ? AltOp->getOpcode() : 0;
3422 }
3423
3424 /// When ReuseShuffleIndices is empty it just returns the position of \p
3425 /// V within the vector of Scalars. Otherwise, try to remap it via its reuse index.
3426 int findLaneForValue(Value *V) const {
3427 unsigned FoundLane = getVectorFactor();
3428 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3429 std::advance(It, 1)) {
3430 if (*It != V)
3431 continue;
3432 FoundLane = std::distance(Scalars.begin(), It);
3433 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3434 if (!ReorderIndices.empty())
3435 FoundLane = ReorderIndices[FoundLane];
3436 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3437 if (ReuseShuffleIndices.empty())
3438 break;
3439 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3440 RIt != ReuseShuffleIndices.end()) {
3441 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3442 break;
3443 }
3444 }
3445 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3446 return FoundLane;
3447 }
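    // Editor's note: a worked example with hypothetical values, not from the
    // original source. For Scalars = {A, B, C, D} and V == C, FoundLane starts
    // at 2. With ReorderIndices = {3, 2, 1, 0} it is remapped to
    // ReorderIndices[2] == 1, and with ReuseShuffleIndices = {0, 1, 1, 2, 3, 0}
    // the result is the first position holding 1, i.e. lane 1.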
3448
3449 /// Build a shuffle mask for graph entry which represents a merge of main
3450 /// and alternate operations.
3451 void
3452 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3454 SmallVectorImpl<Value *> *OpScalars = nullptr,
3455 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3456
3457 /// Return true if this is a non-power-of-2 node.
3458 bool isNonPowOf2Vec() const {
3459 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3460 return IsNonPowerOf2;
3461 }
3462
3463 /// Return true if the number of elements in this node neither fills whole
3464 /// vector registers nor is a power of 2.
3465 bool
3466 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3467 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3468 TTI, getValueType(Scalars.front()), Scalars.size());
3469 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3470 "Reshuffling not supported with non-power-of-2 vectors yet.");
3471 return IsNonPowerOf2;
3472 }
3473
3474 Value *getOrdered(unsigned Idx) const {
3475 assert(isGather() && "Must be used only for buildvectors/gathers.");
3476 if (ReorderIndices.empty())
3477 return Scalars[Idx];
3478 SmallVector<int> Mask;
3479 inversePermutation(ReorderIndices, Mask);
3480 return Scalars[Mask[Idx]];
3481 }
3482
3483#ifndef NDEBUG
3484 /// Debug printer.
3485 LLVM_DUMP_METHOD void dump() const {
3486 dbgs() << Idx << ".\n";
3487 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3488 dbgs() << "Operand " << OpI << ":\n";
3489 for (const Value *V : Operands[OpI])
3490 dbgs().indent(2) << *V << "\n";
3491 }
3492 dbgs() << "Scalars: \n";
3493 for (Value *V : Scalars)
3494 dbgs().indent(2) << *V << "\n";
3495 dbgs() << "State: ";
3496 switch (State) {
3497 case Vectorize:
3498 if (InterleaveFactor > 0) {
3499 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3500 << "\n";
3501 } else {
3502 dbgs() << "Vectorize\n";
3503 }
3504 break;
3505 case ScatterVectorize:
3506 dbgs() << "ScatterVectorize\n";
3507 break;
3508 case StridedVectorize:
3509 dbgs() << "StridedVectorize\n";
3510 break;
3511 case NeedToGather:
3512 dbgs() << "NeedToGather\n";
3513 break;
3514 case CombinedVectorize:
3515 dbgs() << "CombinedVectorize\n";
3516 break;
3517 }
3518 dbgs() << "MainOp: ";
3519 if (MainOp)
3520 dbgs() << *MainOp << "\n";
3521 else
3522 dbgs() << "NULL\n";
3523 dbgs() << "AltOp: ";
3524 if (AltOp)
3525 dbgs() << *AltOp << "\n";
3526 else
3527 dbgs() << "NULL\n";
3528 dbgs() << "VectorizedValue: ";
3529 if (VectorizedValue)
3530 dbgs() << *VectorizedValue << "\n";
3531 else
3532 dbgs() << "NULL\n";
3533 dbgs() << "ReuseShuffleIndices: ";
3534 if (ReuseShuffleIndices.empty())
3535 dbgs() << "Empty";
3536 else
3537 for (int ReuseIdx : ReuseShuffleIndices)
3538 dbgs() << ReuseIdx << ", ";
3539 dbgs() << "\n";
3540 dbgs() << "ReorderIndices: ";
3541 for (unsigned ReorderIdx : ReorderIndices)
3542 dbgs() << ReorderIdx << ", ";
3543 dbgs() << "\n";
3544 dbgs() << "UserTreeIndices: ";
3545 for (const auto &EInfo : UserTreeIndices)
3546 dbgs() << EInfo << ", ";
3547 dbgs() << "\n";
3548 }
3549#endif
3550 };
3551
3552#ifndef NDEBUG
3553 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3554 InstructionCost VecCost, InstructionCost ScalarCost,
3555 StringRef Banner) const {
3556 dbgs() << "SLP: " << Banner << ":\n";
3557 E->dump();
3558 dbgs() << "SLP: Costs:\n";
3559 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3560 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3561 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3562 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3563 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3564 }
3565#endif
3566
3567 /// Create a new VectorizableTree entry.
3568 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3569 std::optional<ScheduleData *> Bundle,
3570 const InstructionsState &S,
3571 const EdgeInfo &UserTreeIdx,
3572 ArrayRef<int> ReuseShuffleIndices = {},
3573 ArrayRef<unsigned> ReorderIndices = {},
3574 unsigned InterleaveFactor = 0) {
3575 TreeEntry::EntryState EntryState =
3576 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3577 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3578 ReuseShuffleIndices, ReorderIndices);
3579 if (E && InterleaveFactor > 0)
3580 E->setInterleave(InterleaveFactor);
3581 return E;
3582 }
3583
3584 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3585 TreeEntry::EntryState EntryState,
3586 std::optional<ScheduleData *> Bundle,
3587 const InstructionsState &S,
3588 const EdgeInfo &UserTreeIdx,
3589 ArrayRef<int> ReuseShuffleIndices = {},
3590 ArrayRef<unsigned> ReorderIndices = {}) {
3591 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3592 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3593 "Need to vectorize gather entry?");
3594 // Gathered loads still gathered? Do not create entry, use the original one.
3595 if (GatheredLoadsEntriesFirst.has_value() &&
3596 EntryState == TreeEntry::NeedToGather &&
3597 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3598 !UserTreeIdx.UserTE)
3599 return nullptr;
3600 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3601 TreeEntry *Last = VectorizableTree.back().get();
3602 Last->Idx = VectorizableTree.size() - 1;
3603 Last->State = EntryState;
3604 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3605 // for non-power-of-two vectors.
3606 assert(
3607 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3608 ReuseShuffleIndices.empty()) &&
3609 "Reshuffling scalars not yet supported for nodes with padding");
3610 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3611 ReuseShuffleIndices.end());
3612 if (ReorderIndices.empty()) {
3613 Last->Scalars.assign(VL.begin(), VL.end());
3614 Last->setOperations(S);
3615 } else {
3616 // Reorder scalars and build final mask.
3617 Last->Scalars.assign(VL.size(), nullptr);
3618 transform(ReorderIndices, Last->Scalars.begin(),
3619 [VL](unsigned Idx) -> Value * {
3620 if (Idx >= VL.size())
3621 return UndefValue::get(VL.front()->getType());
3622 return VL[Idx];
3623 });
3624 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3625 Last->setOperations(S);
3626 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3627 }
3628 if (!Last->isGather()) {
3629 for (Value *V : VL) {
3630 const TreeEntry *TE = getTreeEntry(V);
3631 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3632 "Scalar already in tree!");
3633 if (TE) {
3634 if (TE != Last)
3635 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3636 continue;
3637 }
3638 ScalarToTreeEntry[V] = Last;
3639 }
3640 // Update the scheduler bundle to point to this TreeEntry.
3641 ScheduleData *BundleMember = *Bundle;
3642 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3643 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3644 doesNotNeedToSchedule(VL)) &&
3645 "Bundle and VL out of sync");
3646 if (BundleMember) {
3647 for (Value *V : VL) {
3648 if (doesNotNeedToBeScheduled(V))
3649 continue;
3650 if (!BundleMember)
3651 continue;
3652 BundleMember->TE = Last;
3653 BundleMember = BundleMember->NextInBundle;
3654 }
3655 }
3656 assert(!BundleMember && "Bundle and VL out of sync");
3657 } else {
3658 // Build a map for gathered scalars to the nodes where they are used.
3659 bool AllConstsOrCasts = true;
3660 for (Value *V : VL)
3661 if (!isConstant(V)) {
3662 auto *I = dyn_cast<CastInst>(V);
3663 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3664 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3665 !UserTreeIdx.UserTE->isGather())
3666 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3667 }
3668 if (AllConstsOrCasts)
3669 CastMaxMinBWSizes =
3670 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3671 MustGather.insert(VL.begin(), VL.end());
3672 }
3673
3674 if (UserTreeIdx.UserTE)
3675 Last->UserTreeIndices.push_back(UserTreeIdx);
3676 return Last;
3677 }
3678
3679 /// -- Vectorization State --
3680 /// Holds all of the tree entries.
3681 TreeEntry::VecTreeTy VectorizableTree;
3682
3683#ifndef NDEBUG
3684 /// Debug printer.
3685 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3686 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3687 VectorizableTree[Id]->dump();
3688 dbgs() << "\n";
3689 }
3690 }
3691#endif
3692
3693 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3694
3695 const TreeEntry *getTreeEntry(Value *V) const {
3696 return ScalarToTreeEntry.lookup(V);
3697 }
3698
3699 /// Check that the operand nodes of an alternate node do not generate a
3700 /// buildvector sequence. If they do, it is probably not worth building an
3701 /// alternate shuffle when the number of buildvector operands plus the
3702 /// alternate instruction exceeds the number of buildvector instructions.
3703 /// \param S the instructions state of the analyzed values.
3704 /// \param VL list of the instructions with alternate opcodes.
3705 bool areAltOperandsProfitable(const InstructionsState &S,
3706 ArrayRef<Value *> VL) const;
3707
3708 /// Checks if the specified list of the instructions/values can be vectorized
3709 /// and fills required data before actual scheduling of the instructions.
3710 TreeEntry::EntryState
3711 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3712 bool IsScatterVectorizeUserTE,
3713 OrdersType &CurrentOrder,
3714 SmallVectorImpl<Value *> &PointerOps);
3715
3716 /// Maps a specific scalar to its tree entry.
3717 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3718
3719 /// List of scalars used in several vectorize nodes, and the list of those
3720 /// nodes.
3721 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3722
3723 /// Maps a value to the proposed vectorizable size.
3724 SmallDenseMap<Value *, unsigned> InstrElementSize;
3725
3726 /// A list of scalars that we found that we need to keep as scalars.
3727 ValueSet MustGather;
3728
3729 /// A set of first non-schedulable values.
3730 ValueSet NonScheduledFirst;
3731
3732 /// A map between the vectorized entries and the last instructions in the
3733 /// bundles. The bundles are built in use order, not in the def order of the
3734 /// instructions, so we cannot rely on the last instruction in the bundle
3735 /// being the last instruction in program order during vectorization; the
3736 /// basic blocks are affected, so the last instructions need to be
3737 /// pre-gathered beforehand.
3738 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3739
3740 /// List of gather nodes, depending on other gather/vector nodes, which should
3741 /// be emitted after the vector instruction emission process to correctly
3742 /// handle order of the vector instructions and shuffles.
3743 SetVector<const TreeEntry *> PostponedGathers;
3744
3745 using ValueToGatherNodesMap =
3746 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3747 ValueToGatherNodesMap ValueToGatherNodes;
3748
3749 /// A list of the load entries (node indices) that can be vectorized using a
3750 /// strided or masked gather approach, but which were initially attempted to
3751 /// be represented as contiguous loads.
3752 SetVector<unsigned> LoadEntriesToVectorize;
3753
3754 /// true if graph nodes transforming mode is on.
3755 bool IsGraphTransformMode = false;
3756
3757 /// The index of the first gathered load entry in the VectorizeTree.
3758 std::optional<unsigned> GatheredLoadsEntriesFirst;
3759
3760 /// This POD struct describes one external user in the vectorized tree.
3761 struct ExternalUser {
3762 ExternalUser(Value *S, llvm::User *U, int L)
3763 : Scalar(S), User(U), Lane(L) {}
3764
3765 // Which scalar in our function.
3766 Value *Scalar;
3767
3768 // Which user uses the scalar.
3769 llvm::User *User;
3770
3771 // Which lane does the scalar belong to.
3772 int Lane;
3773 };
3774 using UserList = SmallVector<ExternalUser, 16>;
3775
3776 /// Checks if two instructions may access the same memory.
3777 ///
3778 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3779 /// is invariant in the calling loop.
3780 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3781 Instruction *Inst2) {
3782 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3783 return true;
3784 // First check if the result is already in the cache.
3785 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3786 auto It = AliasCache.find(Key);
3787 if (It != AliasCache.end())
3788 return It->second;
3789 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3790 // Store the result in the cache.
3791 AliasCache.try_emplace(Key, Aliased);
3792 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3793 return Aliased;
3794 }
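  // Editor's note: illustrative sketch only, not part of SLPVectorizer.cpp.
  // It shows the memoization pattern used above with plain std:: types: the
  // result is cached under both (Inst1, Inst2) and (Inst2, Inst1), so a later
  // query with the operands swapped is a cache hit. All names are made up.
  //
  //   #include <map>
  //   #include <utility>
  //   struct AliasMemo {
  //     std::map<std::pair<const void *, const void *>, bool> Cache;
  //     template <typename ComputeFn>
  //     bool query(const void *I1, const void *I2, ComputeFn Compute) {
  //       auto It = Cache.find({I1, I2});
  //       if (It != Cache.end())
  //         return It->second;
  //       bool Aliased = Compute(I1, I2);
  //       Cache[{I1, I2}] = Aliased;
  //       Cache[{I2, I1}] = Aliased;
  //       return Aliased;
  //     }
  //   };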
3795
3796 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3797
3798 /// Cache for alias results.
3799 /// TODO: consider moving this to the AliasAnalysis itself.
3801
3802 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3803 // globally through SLP because we don't perform any action which
3804 // invalidates capture results.
3805 BatchAAResults BatchAA;
3806
3807 /// Temporary store for deleted instructions. Instructions will be deleted
3808 /// eventually when the BoUpSLP is destructed. The deferral is required to
3809 /// ensure that there are no incorrect collisions in the AliasCache, which
3810 /// can happen if a new instruction is allocated at the same address as a
3811 /// previously deleted instruction.
3812 DenseSet<Instruction *> DeletedInstructions;
3813
3814 /// Set of the instructions already analyzed for reductions.
3815 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3816
3817 /// Set of hashes for the list of reduction values already being analyzed.
3818 DenseSet<size_t> AnalyzedReductionVals;
3819
3820 /// Values already analyzed for minimal bitwidth and found to be
3821 /// non-profitable.
3822 DenseSet<Value *> AnalyzedMinBWVals;
3823
3824 /// A list of values that need to be extracted out of the tree.
3825 /// This list holds pairs of (Internal Scalar : External User). External User
3826 /// can be nullptr, it means that this Internal Scalar will be used later,
3827 /// after vectorization.
3828 UserList ExternalUses;
3829
3830 /// A list of GEPs which can be replaced by scalar GEPs instead of
3831 /// extractelement instructions.
3832 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3833
3834 /// Values used only by @llvm.assume calls.
3836
3837 /// Holds all of the instructions that we gathered, shuffle instructions and
3838 /// extractelements.
3839 SetVector<Instruction *> GatherShuffleExtractSeq;
3840
3841 /// A list of blocks that we are going to CSE.
3842 DenseSet<BasicBlock *> CSEBlocks;
3843
3844 /// List of hashes of vectors of loads, which are known to be non-vectorizable.
3845 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3846
3847 /// Contains all scheduling relevant data for an instruction.
3848 /// A ScheduleData either represents a single instruction or a member of an
3849 /// instruction bundle (= a group of instructions which is combined into a
3850 /// vector instruction).
3851 struct ScheduleData {
3852 // The initial value for the dependency counters. It means that the
3853 // dependencies are not calculated yet.
3854 enum { InvalidDeps = -1 };
3855
3856 ScheduleData() = default;
3857
3858 void init(int BlockSchedulingRegionID, Instruction *I) {
3859 FirstInBundle = this;
3860 NextInBundle = nullptr;
3861 NextLoadStore = nullptr;
3862 IsScheduled = false;
3863 SchedulingRegionID = BlockSchedulingRegionID;
3864 clearDependencies();
3865 Inst = I;
3866 TE = nullptr;
3867 }
3868
3869 /// Verify basic self consistency properties
3870 void verify() {
3871 if (hasValidDependencies()) {
3872 assert(UnscheduledDeps <= Dependencies && "invariant");
3873 } else {
3874 assert(UnscheduledDeps == Dependencies && "invariant");
3875 }
3876
3877 if (IsScheduled) {
3878 assert(isSchedulingEntity() &&
3879 "unexpected scheduled state");
3880 for (const ScheduleData *BundleMember = this; BundleMember;
3881 BundleMember = BundleMember->NextInBundle) {
3882 assert(BundleMember->hasValidDependencies() &&
3883 BundleMember->UnscheduledDeps == 0 &&
3884 "unexpected scheduled state");
3885 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3886 "only bundle is marked scheduled");
3887 }
3888 }
3889
3890 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3891 "all bundle members must be in same basic block");
3892 }
3893
3894 /// Returns true if the dependency information has been calculated.
3895 /// Note that dependency validity can vary between instructions within
3896 /// a single bundle.
3897 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3898
3899 /// Returns true for single instructions and for bundle representatives
3900 /// (= the head of a bundle).
3901 bool isSchedulingEntity() const { return FirstInBundle == this; }
3902
3903 /// Returns true if it represents an instruction bundle and not only a
3904 /// single instruction.
3905 bool isPartOfBundle() const {
3906 return NextInBundle != nullptr || FirstInBundle != this || TE;
3907 }
3908
3909 /// Returns true if it is ready for scheduling, i.e. it has no more
3910 /// unscheduled depending instructions/bundles.
3911 bool isReady() const {
3912 assert(isSchedulingEntity() &&
3913 "can't consider non-scheduling entity for ready list");
3914 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3915 }
3916
3917 /// Modifies the number of unscheduled dependencies for this instruction,
3918 /// and returns the number of remaining dependencies for the containing
3919 /// bundle.
3920 int incrementUnscheduledDeps(int Incr) {
3921 assert(hasValidDependencies() &&
3922 "increment of unscheduled deps would be meaningless");
3923 UnscheduledDeps += Incr;
3924 return FirstInBundle->unscheduledDepsInBundle();
3925 }
3926
3927 /// Sets the number of unscheduled dependencies to the number of
3928 /// dependencies.
3929 void resetUnscheduledDeps() {
3930 UnscheduledDeps = Dependencies;
3931 }
3932
3933 /// Clears all dependency information.
3934 void clearDependencies() {
3935 Dependencies = InvalidDeps;
3936 resetUnscheduledDeps();
3937 MemoryDependencies.clear();
3938 ControlDependencies.clear();
3939 }
3940
3941 int unscheduledDepsInBundle() const {
3942 assert(isSchedulingEntity() && "only meaningful on the bundle");
3943 int Sum = 0;
3944 for (const ScheduleData *BundleMember = this; BundleMember;
3945 BundleMember = BundleMember->NextInBundle) {
3946 if (BundleMember->UnscheduledDeps == InvalidDeps)
3947 return InvalidDeps;
3948 Sum += BundleMember->UnscheduledDeps;
3949 }
3950 return Sum;
3951 }
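    // Editor's note (illustrative, not from the original source): with
    // hypothetical numbers, a two-member bundle where member 0 has
    // Dependencies = 2 and member 1 has Dependencies = 1 starts with
    // unscheduledDepsInBundle() == 3 once dependencies are calculated. Each
    // time one of those dependencies is scheduled, incrementUnscheduledDeps(-1)
    // is called on the corresponding member; the bundle becomes ready for
    // scheduling (see isReady()) once the sum reaches 0.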
3952
3953 void dump(raw_ostream &os) const {
3954 if (!isSchedulingEntity()) {
3955 os << "/ " << *Inst;
3956 } else if (NextInBundle) {
3957 os << '[' << *Inst;
3958 ScheduleData *SD = NextInBundle;
3959 while (SD) {
3960 os << ';' << *SD->Inst;
3961 SD = SD->NextInBundle;
3962 }
3963 os << ']';
3964 } else {
3965 os << *Inst;
3966 }
3967 }
3968
3969 Instruction *Inst = nullptr;
3970
3971 /// The TreeEntry that this instruction corresponds to.
3972 TreeEntry *TE = nullptr;
3973
3974 /// Points to the head in an instruction bundle (and always to this for
3975 /// single instructions).
3976 ScheduleData *FirstInBundle = nullptr;
3977
3978 /// Single linked list of all instructions in a bundle. Null if it is a
3979 /// single instruction.
3980 ScheduleData *NextInBundle = nullptr;
3981
3982 /// Single linked list of all memory instructions (e.g. load, store, call)
3983 /// in the block - until the end of the scheduling region.
3984 ScheduleData *NextLoadStore = nullptr;
3985
3986 /// The dependent memory instructions.
3987 /// This list is derived on demand in calculateDependencies().
3988 SmallVector<ScheduleData *, 4> MemoryDependencies;
3989
3990 /// List of instructions which this instruction could be control dependent
3991 /// on. Allowing such nodes to be scheduled below this one could introduce
3992 /// a runtime fault which didn't exist in the original program.
3993 /// ex: this is a load or udiv following a readonly call which inf loops
3994 SmallVector<ScheduleData *, 4> ControlDependencies;
3995
3996 /// This ScheduleData is in the current scheduling region if this matches
3997 /// the current SchedulingRegionID of BlockScheduling.
3998 int SchedulingRegionID = 0;
3999
4000 /// Used for getting a "good" final ordering of instructions.
4001 int SchedulingPriority = 0;
4002
4003 /// The number of dependencies. Consists of the number of users of the
4004 /// instruction plus the number of dependent memory instructions (if any).
4005 /// This value is calculated on demand.
4006 /// If InvalidDeps, the number of dependencies is not calculated yet.
4007 int Dependencies = InvalidDeps;
4008
4009 /// The number of dependencies minus the number of dependencies of scheduled
4010 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4011 /// for scheduling.
4012 /// Note that this is negative as long as Dependencies is not calculated.
4013 int UnscheduledDeps = InvalidDeps;
4014
4015 /// True if this instruction is scheduled (or considered as scheduled in the
4016 /// dry-run).
4017 bool IsScheduled = false;
4018 };
4019
4020#ifndef NDEBUG
4021 friend inline raw_ostream &operator<<(raw_ostream &os,
4022 const BoUpSLP::ScheduleData &SD) {
4023 SD.dump(os);
4024 return os;
4025 }
4026#endif
4027
4028 friend struct GraphTraits<BoUpSLP *>;
4029 friend struct DOTGraphTraits<BoUpSLP *>;
4030
4031 /// Contains all scheduling data for a basic block.
4032 /// It does not schedule instructions which are not memory read/write
4033 /// instructions and whose operands are either constants, or arguments, or
4034 /// phis, or instructions from other blocks, or whose users are phis or from
4035 /// other blocks. The resulting vector instructions can be placed at the
4036 /// beginning of the basic block without scheduling (if the operands do not
4037 /// need to be scheduled) or at the end of the block (if the users are
4038 /// outside of the block). This allows saving some compile time and memory
4039 /// used by the compiler.
4040 /// ScheduleData is assigned to each instruction in between the boundaries of
4041 /// the tree entry, even to those which are not part of the graph. It is
4042 /// required to correctly follow the dependencies between the instructions
4043 /// and to schedule them correctly. ScheduleData is not allocated for
4044 /// instructions which do not require scheduling, like phis, nodes with
4045 /// extractelements/insertelements only, or nodes whose instructions have
4046 /// uses/operands outside of the block.
4047 struct BlockScheduling {
4048 BlockScheduling(BasicBlock *BB)
4049 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4050
4051 void clear() {
4052 ReadyInsts.clear();
4053 ScheduleStart = nullptr;
4054 ScheduleEnd = nullptr;
4055 FirstLoadStoreInRegion = nullptr;
4056 LastLoadStoreInRegion = nullptr;
4057 RegionHasStackSave = false;
4058
4059 // Reduce the maximum schedule region size by the size of the
4060 // previous scheduling run.
4061 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4062 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4063 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4064 ScheduleRegionSize = 0;
4065
4066 // Make a new scheduling region, i.e. all existing ScheduleData is not
4067 // in the new region yet.
4068 ++SchedulingRegionID;
4069 }
4070
4071 ScheduleData *getScheduleData(Instruction *I) {
4072 if (BB != I->getParent())
4073 // Avoid lookup if can't possibly be in map.
4074 return nullptr;
4075 ScheduleData *SD = ScheduleDataMap.lookup(I);
4076 if (SD && isInSchedulingRegion(SD))
4077 return SD;
4078 return nullptr;
4079 }
4080
4081 ScheduleData *getScheduleData(Value *V) {
4082 if (auto *I = dyn_cast<Instruction>(V))
4083 return getScheduleData(I);
4084 return nullptr;
4085 }
4086
4087 bool isInSchedulingRegion(ScheduleData *SD) const {
4088 return SD->SchedulingRegionID == SchedulingRegionID;
4089 }
4090
4091 /// Marks an instruction as scheduled and puts all dependent ready
4092 /// instructions into the ready-list.
4093 template <typename ReadyListType>
4094 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4095 SD->IsScheduled = true;
4096 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4097
4098 for (ScheduleData *BundleMember = SD; BundleMember;
4099 BundleMember = BundleMember->NextInBundle) {
4100
4101 // Handle the def-use chain dependencies.
4102
4103 // Decrement the unscheduled counter and insert to ready list if ready.
4104 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4105 ScheduleData *OpDef = getScheduleData(I);
4106 if (OpDef && OpDef->hasValidDependencies() &&
4107 OpDef->incrementUnscheduledDeps(-1) == 0) {
4108 // There are no more unscheduled dependencies after
4109 // decrementing, so we can put the dependent instruction
4110 // into the ready list.
4111 ScheduleData *DepBundle = OpDef->FirstInBundle;
4112 assert(!DepBundle->IsScheduled &&
4113 "already scheduled bundle gets ready");
4114 ReadyList.insert(DepBundle);
4115 LLVM_DEBUG(dbgs()
4116 << "SLP: gets ready (def): " << *DepBundle << "\n");
4117 }
4118 };
4119
4120 // If BundleMember is a vector bundle, its operands may have been
4121 // reordered during buildTree(). We therefore need to get its operands
4122 // through the TreeEntry.
4123 if (TreeEntry *TE = BundleMember->TE) {
4124 // Need to search for the lane since the tree entry can be reordered.
4125 int Lane = std::distance(TE->Scalars.begin(),
4126 find(TE->Scalars, BundleMember->Inst));
4127 assert(Lane >= 0 && "Lane not set");
4128
4129 // Since vectorization tree is being built recursively this assertion
4130 // ensures that the tree entry has all operands set before reaching
4131 // this code. Couple of exceptions known at the moment are extracts
4132 // where their second (immediate) operand is not added. Since
4133 // immediates do not affect scheduler behavior this is considered
4134 // okay.
4135 auto *In = BundleMember->Inst;
4136 assert(
4137 In &&
4138 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4139 In->getNumOperands() == TE->getNumOperands()) &&
4140 "Missed TreeEntry operands?");
4141 (void)In; // fake use to avoid build failure when assertions disabled
4142
4143 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4144 OpIdx != NumOperands; ++OpIdx)
4145 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4146 DecrUnsched(I);
4147 } else {
4148 // If BundleMember is a stand-alone instruction, no operand reordering
4149 // has taken place, so we directly access its operands.
4150 for (Use &U : BundleMember->Inst->operands())
4151 if (auto *I = dyn_cast<Instruction>(U.get()))
4152 DecrUnsched(I);
4153 }
4154 // Handle the memory dependencies.
4155 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4156 if (MemoryDepSD->hasValidDependencies() &&
4157 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4158 // There are no more unscheduled dependencies after decrementing,
4159 // so we can put the dependent instruction into the ready list.
4160 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4161 assert(!DepBundle->IsScheduled &&
4162 "already scheduled bundle gets ready");
4163 ReadyList.insert(DepBundle);
4164 LLVM_DEBUG(dbgs()
4165 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4166 }
4167 }
4168 // Handle the control dependencies.
4169 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4170 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4171 // There are no more unscheduled dependencies after decrementing,
4172 // so we can put the dependent instruction into the ready list.
4173 ScheduleData *DepBundle = DepSD->FirstInBundle;
4174 assert(!DepBundle->IsScheduled &&
4175 "already scheduled bundle gets ready");
4176 ReadyList.insert(DepBundle);
4177 LLVM_DEBUG(dbgs()
4178 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4179 }
4180 }
4181 }
4182 }
4183
4184 /// Verify basic self consistency properties of the data structure.
4185 void verify() {
4186 if (!ScheduleStart)
4187 return;
4188
4189 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4190 ScheduleStart->comesBefore(ScheduleEnd) &&
4191 "Not a valid scheduling region?");
4192
4193 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4194 auto *SD = getScheduleData(I);
4195 if (!SD)
4196 continue;
4197 assert(isInSchedulingRegion(SD) &&
4198 "primary schedule data not in window?");
4199 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4200 "entire bundle in window!");
4201 SD->verify();
4202 }
4203
4204 for (auto *SD : ReadyInsts) {
4205 assert(SD->isSchedulingEntity() && SD->isReady() &&
4206 "item in ready list not ready?");
4207 (void)SD;
4208 }
4209 }
4210
4211 /// Put all instructions into the ReadyList which are ready for scheduling.
4212 template <typename ReadyListType>
4213 void initialFillReadyList(ReadyListType &ReadyList) {
4214 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4215 ScheduleData *SD = getScheduleData(I);
4216 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4217 SD->isReady()) {
4218 ReadyList.insert(SD);
4219 LLVM_DEBUG(dbgs()
4220 << "SLP: initially in ready list: " << *SD << "\n");
4221 }
4222 }
4223 }
4224
4225 /// Build a bundle from the ScheduleData nodes corresponding to the
4226 /// scalar instruction for each lane.
4227 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4228
4229 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4230 /// cyclic dependencies. This is only a dry-run, no instructions are
4231 /// actually moved at this stage.
4232 /// \returns the scheduling bundle. The returned Optional value is not
4233 /// std::nullopt if \p VL is allowed to be scheduled.
4234 std::optional<ScheduleData *>
4235 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4236 const InstructionsState &S);
4237
4238 /// Un-bundles a group of instructions.
4239 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4240
4241 /// Allocates schedule data chunk.
4242 ScheduleData *allocateScheduleDataChunks();
4243
4244 /// Extends the scheduling region so that V is inside the region.
4245 /// \returns true if the region size is within the limit.
4246 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4247
4248 /// Initialize the ScheduleData structures for new instructions in the
4249 /// scheduling region.
4250 void initScheduleData(Instruction *FromI, Instruction *ToI,
4251 ScheduleData *PrevLoadStore,
4252 ScheduleData *NextLoadStore);
4253
4254 /// Updates the dependency information of a bundle and of all instructions/
4255 /// bundles which depend on the original bundle.
4256 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4257 BoUpSLP *SLP);
4258
4259 /// Sets all instructions in the scheduling region to un-scheduled.
4260 void resetSchedule();
4261
4262 BasicBlock *BB;
4263
4264 /// Simple memory allocation for ScheduleData.
4265 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4266
4267 /// The size of a ScheduleData array in ScheduleDataChunks.
4268 int ChunkSize;
4269
4270 /// The allocator position in the current chunk, which is the last entry
4271 /// of ScheduleDataChunks.
4272 int ChunkPos;
4273
4274 /// Attaches ScheduleData to Instruction.
4275 /// Note that the mapping survives during all vectorization iterations, i.e.
4276 /// ScheduleData structures are recycled.
4277 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4278
4279 /// The ready-list for scheduling (only used for the dry-run).
4280 SetVector<ScheduleData *> ReadyInsts;
4281
4282 /// The first instruction of the scheduling region.
4283 Instruction *ScheduleStart = nullptr;
4284
4285 /// The first instruction _after_ the scheduling region.
4286 Instruction *ScheduleEnd = nullptr;
4287
4288 /// The first memory accessing instruction in the scheduling region
4289 /// (can be null).
4290 ScheduleData *FirstLoadStoreInRegion = nullptr;
4291
4292 /// The last memory accessing instruction in the scheduling region
4293 /// (can be null).
4294 ScheduleData *LastLoadStoreInRegion = nullptr;
4295
4296 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4297 /// region? Used to optimize the dependence calculation for the
4298 /// common case where there isn't.
4299 bool RegionHasStackSave = false;
4300
4301 /// The current size of the scheduling region.
4302 int ScheduleRegionSize = 0;
4303
4304 /// The maximum size allowed for the scheduling region.
4305 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4306
4307 /// The ID of the scheduling region. For a new vectorization iteration this
4308 /// is incremented which "removes" all ScheduleData from the region.
4309 /// Make sure that the initial SchedulingRegionID is greater than the
4310 /// initial SchedulingRegionID in ScheduleData (which is 0).
4311 int SchedulingRegionID = 1;
4312 };
4313
4314 /// Attaches the BlockScheduling structures to basic blocks.
4315 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4316
4317 /// Performs the "real" scheduling. Done before vectorization is actually
4318 /// performed in a basic block.
4319 void scheduleBlock(BlockScheduling *BS);
4320
4321 /// List of users to ignore during scheduling and that don't need extracting.
4322 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4323
4324 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4325 /// sorted SmallVectors of unsigned.
4326 struct OrdersTypeDenseMapInfo {
4327 static OrdersType getEmptyKey() {
4328 OrdersType V;
4329 V.push_back(~1U);
4330 return V;
4331 }
4332
4333 static OrdersType getTombstoneKey() {
4334 OrdersType V;
4335 V.push_back(~2U);
4336 return V;
4337 }
4338
4339 static unsigned getHashValue(const OrdersType &V) {
4340 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4341 }
4342
4343 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4344 return LHS == RHS;
4345 }
4346 };
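  // Editor's note: illustrative usage sketch, not part of SLPVectorizer.cpp.
  // A custom DenseMapInfo such as OrdersTypeDenseMapInfo is passed as the key
  // info template parameter so that OrdersType keys get the empty/tombstone
  // keys and hashing defined above; the variable names below are made up.
  //
  //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrderCounts;
  //   DenseSet<OrdersType, OrdersTypeDenseMapInfo> SeenOrders;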
4347
4348 // Analysis and block reference.
4349 Function *F;
4350 ScalarEvolution *SE;
4351 TargetTransformInfo *TTI;
4352 TargetLibraryInfo *TLI;
4353 LoopInfo *LI;
4354 DominatorTree *DT;
4355 AssumptionCache *AC;
4356 DemandedBits *DB;
4357 const DataLayout *DL;
4358 OptimizationRemarkEmitter *ORE;
4359
4360 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4361 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4362
4363 /// Instruction builder to construct the vectorized tree.
4365
4366 /// A map of scalar integer values to the smallest bit width with which they
4367 /// can legally be represented. The values map to (width, signed) pairs,
4368 /// where "width" indicates the minimum bit width and "signed" is True if the
4369 /// value must be signed-extended, rather than zero-extended, back to its
4370 /// original width.
4372
4373 /// Final size of the reduced vector, if the current graph represents the
4374 /// input for the reduction and it was possible to narrow the size of the
4375 /// reduction.
4376 unsigned ReductionBitWidth = 0;
4377
4378 /// Canonical graph size before the transformations.
4379 unsigned BaseGraphSize = 1;
4380
4381 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4382 /// type sizes, used in the tree.
4383 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4384
4385 /// Indices of the vectorized nodes, which are supposed to be the roots of the
4386 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4387 DenseSet<unsigned> ExtraBitWidthNodes;
4388};
4389
4390} // end namespace slpvectorizer
4391
4392template <> struct GraphTraits<BoUpSLP *> {
4393 using TreeEntry = BoUpSLP::TreeEntry;
4394
4395 /// NodeRef has to be a pointer per the GraphWriter.
4396 using NodeRef = TreeEntry *;
4397
4398 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4399
4400 /// Add the VectorizableTree to the index iterator to be able to return
4401 /// TreeEntry pointers.
4402 struct ChildIteratorType
4403 : public iterator_adaptor_base<
4404 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4405 ContainerTy &VectorizableTree;
4406
4407 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4408 ContainerTy &VT)
4409 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4410
4411 NodeRef operator*() { return I->UserTE; }
4412 };
4413
4414 static NodeRef getEntryNode(BoUpSLP &R) {
4415 return R.VectorizableTree[0].get();
4416 }
4417
4418 static ChildIteratorType child_begin(NodeRef N) {
4419 return {N->UserTreeIndices.begin(), N->Container};
4420 }
4421
4422 static ChildIteratorType child_end(NodeRef N) {
4423 return {N->UserTreeIndices.end(), N->Container};
4424 }
4425
4426 /// For the node iterator we just need to turn the TreeEntry iterator into a
4427 /// TreeEntry* iterator so that it dereferences to NodeRef.
4428 class nodes_iterator {
4429 using ItTy = ContainerTy::iterator;
4430 ItTy It;
4431
4432 public:
4433 nodes_iterator(const ItTy &It2) : It(It2) {}
4434 NodeRef operator*() { return It->get(); }
4435 nodes_iterator operator++() {
4436 ++It;
4437 return *this;
4438 }
4439 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4440 };
4441
4442 static nodes_iterator nodes_begin(BoUpSLP *R) {
4443 return nodes_iterator(R->VectorizableTree.begin());
4444 }
4445
4446 static nodes_iterator nodes_end(BoUpSLP *R) {
4447 return nodes_iterator(R->VectorizableTree.end());
4448 }
4449
4450 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4451};
4452
4453template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4454 using TreeEntry = BoUpSLP::TreeEntry;
4455
4456 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4457
4458 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4459 std::string Str;
4460 raw_string_ostream OS(Str);
4461 OS << Entry->Idx << ".\n";
4462 if (isSplat(Entry->Scalars))
4463 OS << "<splat> ";
4464 for (auto *V : Entry->Scalars) {
4465 OS << *V;
4466 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4467 return EU.Scalar == V;
4468 }))
4469 OS << " <extract>";
4470 OS << "\n";
4471 }
4472 return Str;
4473 }
4474
4475 static std::string getNodeAttributes(const TreeEntry *Entry,
4476 const BoUpSLP *) {
4477 if (Entry->isGather())
4478 return "color=red";
4479 if (Entry->State == TreeEntry::ScatterVectorize ||
4480 Entry->State == TreeEntry::StridedVectorize)
4481 return "color=blue";
4482 return "";
4483 }
4484};
4485
4486} // end namespace llvm
4487
4488 BoUpSLP::~BoUpSLP() {
4489 SmallVector<WeakTrackingVH> DeadInsts;
4490 for (auto *I : DeletedInstructions) {
4491 if (!I->getParent()) {
4492 // Temporarily insert instructions back to erase them from their parent
4493 // and from memory later.
4494 if (isa<PHINode>(I))
4495 // Phi nodes must be the very first instructions in the block.
4496 I->insertBefore(F->getEntryBlock(),
4497 F->getEntryBlock().getFirstNonPHIIt());
4498 else
4499 I->insertBefore(F->getEntryBlock().getTerminator());
4500 continue;
4501 }
4502 for (Use &U : I->operands()) {
4503 auto *Op = dyn_cast<Instruction>(U.get());
4504 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4505 wouldInstructionBeTriviallyDead(Op, TLI))
4506 DeadInsts.emplace_back(Op);
4507 }
4508 I->dropAllReferences();
4509 }
4510 for (auto *I : DeletedInstructions) {
4511 assert(I->use_empty() &&
4512 "trying to erase instruction with users.");
4513 I->eraseFromParent();
4514 }
4515
4516   // Cleanup any dead scalar code feeding the vectorized instructions.
4517   RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4518
4519#ifdef EXPENSIVE_CHECKS
4520 // If we could guarantee that this call is not extremely slow, we could
4521 // remove the ifdef limitation (see PR47712).
4522 assert(!verifyFunction(*F, &dbgs()));
4523#endif
4524}
4525
4526/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4527 /// contains the original mask for the scalars reused in the node. The
4528 /// procedure transforms this mask in accordance with the given \p Mask.
4529 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4530   assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4531 "Expected non-empty mask.");
4532 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4533 Prev.swap(Reuses);
4534 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4535 if (Mask[I] != PoisonMaskElem)
4536 Reuses[Mask[I]] = Prev[I];
4537}
4538
4539 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4540 /// the original order of the scalars. The procedure transforms the provided
4541 /// order in accordance with the given \p Mask. If the resulting \p Order is an
4542/// identity order, \p Order is cleared.
4543 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4544                          bool BottomOrder = false) {
4545 assert(!Mask.empty() && "Expected non-empty mask.");
4546 unsigned Sz = Mask.size();
4547 if (BottomOrder) {
4548 SmallVector<unsigned> PrevOrder;
4549 if (Order.empty()) {
4550 PrevOrder.resize(Sz);
4551 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4552 } else {
4553 PrevOrder.swap(Order);
4554 }
4555 Order.assign(Sz, Sz);
4556 for (unsigned I = 0; I < Sz; ++I)
4557 if (Mask[I] != PoisonMaskElem)
4558 Order[I] = PrevOrder[Mask[I]];
4559 if (all_of(enumerate(Order), [&](const auto &Data) {
4560 return Data.value() == Sz || Data.index() == Data.value();
4561 })) {
4562 Order.clear();
4563 return;
4564 }
4565 fixupOrderingIndices(Order);
4566 return;
4567 }
4568 SmallVector<int> MaskOrder;
4569 if (Order.empty()) {
4570 MaskOrder.resize(Sz);
4571 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4572 } else {
4573 inversePermutation(Order, MaskOrder);
4574 }
4575 reorderReuses(MaskOrder, Mask);
4576 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4577 Order.clear();
4578 return;
4579 }
4580 Order.assign(Sz, Sz);
4581 for (unsigned I = 0; I < Sz; ++I)
4582 if (MaskOrder[I] != PoisonMaskElem)
4583 Order[MaskOrder[I]] = I;
4584 fixupOrderingIndices(Order);
4585}
4586
4587std::optional<BoUpSLP::OrdersType>
4588BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4589 assert(TE.isGather() && "Expected gather node only.");
4590 // Try to find subvector extract/insert patterns and reorder only such
4591 // patterns.
4592 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4593 Type *ScalarTy = GatheredScalars.front()->getType();
4594 int NumScalars = GatheredScalars.size();
4595 if (!isValidElementType(ScalarTy))
4596 return std::nullopt;
4597 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4598 int NumParts = TTI->getNumberOfParts(VecTy);
4599 if (NumParts == 0 || NumParts >= NumScalars ||
4600 VecTy->getNumElements() % NumParts != 0 ||
4601 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4602 VecTy->getNumElements() / NumParts))
4603 NumParts = 1;
4604 SmallVector<int> ExtractMask;
4605 SmallVector<int> Mask;
4606   SmallVector<SmallVector<const TreeEntry *>> Entries;
4607   SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4608       tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4609   SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4610       isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4611 /*ForOrder=*/true);
4612 // No shuffled operands - ignore.
4613 if (GatherShuffles.empty() && ExtractShuffles.empty())
4614 return std::nullopt;
4615 OrdersType CurrentOrder(NumScalars, NumScalars);
4616 if (GatherShuffles.size() == 1 &&
4617 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4618 Entries.front().front()->isSame(TE.Scalars)) {
4619 // Perfect match in the graph, will reuse the previously vectorized
4620 // node. Cost is 0.
4621 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4622 return CurrentOrder;
4623 }
4624 auto IsSplatMask = [](ArrayRef<int> Mask) {
4625 int SingleElt = PoisonMaskElem;
4626 return all_of(Mask, [&](int I) {
4627 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4628 SingleElt = I;
4629 return I == PoisonMaskElem || I == SingleElt;
4630 });
4631 };
4632 // Exclusive broadcast mask - ignore.
4633 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4634 (Entries.size() != 1 ||
4635 Entries.front().front()->ReorderIndices.empty())) ||
4636 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4637 return std::nullopt;
4638 SmallBitVector ShuffledSubMasks(NumParts);
4639 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4640 ArrayRef<int> Mask, int PartSz, int NumParts,
4641 function_ref<unsigned(unsigned)> GetVF) {
4642 for (int I : seq<int>(0, NumParts)) {
4643 if (ShuffledSubMasks.test(I))
4644 continue;
4645 const int VF = GetVF(I);
4646 if (VF == 0)
4647 continue;
4648 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4649 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4650 // Shuffle of at least 2 vectors - ignore.
4651 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4652 std::fill(Slice.begin(), Slice.end(), NumScalars);
4653 ShuffledSubMasks.set(I);
4654 continue;
4655 }
4656       // Try to include as many elements from the mask as possible.
4657 int FirstMin = INT_MAX;
4658       bool SecondVecFound = false;
4659 for (int K : seq<int>(Limit)) {
4660 int Idx = Mask[I * PartSz + K];
4661 if (Idx == PoisonMaskElem) {
4662 Value *V = GatheredScalars[I * PartSz + K];
4663 if (isConstant(V) && !isa<PoisonValue>(V)) {
4664 SecondVecFound = true;
4665 break;
4666 }
4667 continue;
4668 }
4669 if (Idx < VF) {
4670 if (FirstMin > Idx)
4671 FirstMin = Idx;
4672 } else {
4673 SecondVecFound = true;
4674 break;
4675 }
4676 }
4677 FirstMin = (FirstMin / PartSz) * PartSz;
4678 // Shuffle of at least 2 vectors - ignore.
4679 if (SecondVecFound) {
4680 std::fill(Slice.begin(), Slice.end(), NumScalars);
4681 ShuffledSubMasks.set(I);
4682 continue;
4683 }
4684 for (int K : seq<int>(Limit)) {
4685 int Idx = Mask[I * PartSz + K];
4686 if (Idx == PoisonMaskElem)
4687 continue;
4688 Idx -= FirstMin;
4689 if (Idx >= PartSz) {
4690 SecondVecFound = true;
4691 break;
4692 }
4693 if (CurrentOrder[I * PartSz + Idx] >
4694 static_cast<unsigned>(I * PartSz + K) &&
4695 CurrentOrder[I * PartSz + Idx] !=
4696 static_cast<unsigned>(I * PartSz + Idx))
4697 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4698 }
4699 // Shuffle of at least 2 vectors - ignore.
4700 if (SecondVecFound) {
4701 std::fill(Slice.begin(), Slice.end(), NumScalars);
4702 ShuffledSubMasks.set(I);
4703 continue;
4704 }
4705 }
4706 };
4707 int PartSz = getPartNumElems(NumScalars, NumParts);
4708 if (!ExtractShuffles.empty())
4709 TransformMaskToOrder(
4710 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4711 if (!ExtractShuffles[I])
4712 return 0U;
4713 unsigned VF = 0;
4714 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4715 for (unsigned Idx : seq<unsigned>(Sz)) {
4716 int K = I * PartSz + Idx;
4717 if (ExtractMask[K] == PoisonMaskElem)
4718 continue;
4719 if (!TE.ReuseShuffleIndices.empty())
4720 K = TE.ReuseShuffleIndices[K];
4721 if (K == PoisonMaskElem)
4722 continue;
4723 if (!TE.ReorderIndices.empty())
4724 K = std::distance(TE.ReorderIndices.begin(),
4725 find(TE.ReorderIndices, K));
4726 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4727 if (!EI)
4728 continue;
4729 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4730 ->getElementCount()
4731 .getKnownMinValue());
4732 }
4733 return VF;
4734 });
4735 // Check special corner case - single shuffle of the same entry.
4736 if (GatherShuffles.size() == 1 && NumParts != 1) {
4737 if (ShuffledSubMasks.any())
4738 return std::nullopt;
4739 PartSz = NumScalars;
4740 NumParts = 1;
4741 }
4742 if (!Entries.empty())
4743 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4744 if (!GatherShuffles[I])
4745 return 0U;
4746 return std::max(Entries[I].front()->getVectorFactor(),
4747 Entries[I].back()->getVectorFactor());
4748 });
4749 int NumUndefs =
4750 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4751 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4752 return std::nullopt;
4753 return std::move(CurrentOrder);
4754}
4755
4756static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4757 const TargetLibraryInfo &TLI,
4758 bool CompareOpcodes = true) {
4759   if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
4760       getUnderlyingObject(Ptr2, RecursionMaxDepth))
4761     return false;
4762 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4763 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4764 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4765 (!GEP2 || GEP2->getNumOperands() == 2) &&
4766 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4767 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4768 !CompareOpcodes ||
4769 (GEP1 && GEP2 &&
4770 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4771 .getOpcode()));
4772}
4773
4774/// Calculates minimal alignment as a common alignment.
4775template <typename T>
4776 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4777   Align CommonAlignment = cast<T>(VL.front())->getAlign();
4778 for (Value *V : VL.drop_front())
4779 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4780 return CommonAlignment;
4781}
4782
4783/// Check if \p Order represents reverse order.
4784 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4785   assert(!Order.empty() &&
4786 "Order is empty. Please check it before using isReverseOrder.");
4787 unsigned Sz = Order.size();
4788 return all_of(enumerate(Order), [&](const auto &Pair) {
4789 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4790 });
4791}
4792
4793 /// Checks if the provided list of pointers \p PointerOps represents strided
4794 /// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4795 /// Otherwise, if \p Inst is not specified, a just-initialized optional value
4796 /// is returned to show that the pointers represent strided pointers. If
4797 /// \p Inst is specified, the runtime stride is materialized before it.
4798 /// \returns std::nullopt if the pointers do not have a runtime stride;
4799 /// otherwise nullptr or the actual stride value.
4800static std::optional<Value *>
4801 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4802                   const DataLayout &DL, ScalarEvolution &SE,
4803 SmallVectorImpl<unsigned> &SortedIndices,
4804 Instruction *Inst = nullptr) {
4805   SmallVector<const SCEV *> SCEVs;
4806   const SCEV *PtrSCEVLowest = nullptr;
4807 const SCEV *PtrSCEVHighest = nullptr;
4808 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4809 // addresses).
4810 for (Value *Ptr : PointerOps) {
4811 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4812 if (!PtrSCEV)
4813 return std::nullopt;
4814 SCEVs.push_back(PtrSCEV);
4815 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4816 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4817 continue;
4818 }
4819 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4820 if (isa<SCEVCouldNotCompute>(Diff))
4821 return std::nullopt;
4822 if (Diff->isNonConstantNegative()) {
4823 PtrSCEVLowest = PtrSCEV;
4824 continue;
4825 }
4826 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4827 if (isa<SCEVCouldNotCompute>(Diff1))
4828 return std::nullopt;
4829 if (Diff1->isNonConstantNegative()) {
4830 PtrSCEVHighest = PtrSCEV;
4831 continue;
4832 }
4833 }
4834 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4835 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4836 if (isa<SCEVCouldNotCompute>(Dist))
4837 return std::nullopt;
4838 int Size = DL.getTypeStoreSize(ElemTy);
4839 auto TryGetStride = [&](const SCEV *Dist,
4840 const SCEV *Multiplier) -> const SCEV * {
4841 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4842 if (M->getOperand(0) == Multiplier)
4843 return M->getOperand(1);
4844 if (M->getOperand(1) == Multiplier)
4845 return M->getOperand(0);
4846 return nullptr;
4847 }
4848 if (Multiplier == Dist)
4849 return SE.getConstant(Dist->getType(), 1);
4850 return SE.getUDivExactExpr(Dist, Multiplier);
4851 };
4852   // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4853 const SCEV *Stride = nullptr;
4854 if (Size != 1 || SCEVs.size() > 2) {
4855 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4856 Stride = TryGetStride(Dist, Sz);
4857 if (!Stride)
4858 return std::nullopt;
4859 }
4860 if (!Stride || isa<SCEVConstant>(Stride))
4861 return std::nullopt;
4862 // Iterate through all pointers and check if all distances are
4863   // unique multiples of Stride.
4864 using DistOrdPair = std::pair<int64_t, int>;
4865 auto Compare = llvm::less_first();
4866 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4867 int Cnt = 0;
4868 bool IsConsecutive = true;
4869 for (const SCEV *PtrSCEV : SCEVs) {
4870 unsigned Dist = 0;
4871 if (PtrSCEV != PtrSCEVLowest) {
4872 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4873 const SCEV *Coeff = TryGetStride(Diff, Stride);
4874 if (!Coeff)
4875 return std::nullopt;
4876 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4877 if (!SC || isa<SCEVCouldNotCompute>(SC))
4878 return std::nullopt;
4879 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4880 SE.getMulExpr(Stride, SC)))
4881 ->isZero())
4882 return std::nullopt;
4883 Dist = SC->getAPInt().getZExtValue();
4884 }
4885 // If the strides are not the same or repeated, we can't vectorize.
4886 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4887 return std::nullopt;
4888 auto Res = Offsets.emplace(Dist, Cnt);
4889 if (!Res.second)
4890 return std::nullopt;
4891 // Consecutive order if the inserted element is the last one.
4892 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4893 ++Cnt;
4894 }
4895 if (Offsets.size() != SCEVs.size())
4896 return std::nullopt;
4897 SortedIndices.clear();
4898 if (!IsConsecutive) {
4899 // Fill SortedIndices array only if it is non-consecutive.
4900 SortedIndices.resize(PointerOps.size());
4901 Cnt = 0;
4902 for (const std::pair<int64_t, int> &Pair : Offsets) {
4903 SortedIndices[Cnt] = Pair.second;
4904 ++Cnt;
4905 }
4906 }
4907 if (!Inst)
4908 return nullptr;
4909 SCEVExpander Expander(SE, DL, "strided-load-vec");
4910 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4911}
4912
4913static std::pair<InstructionCost, InstructionCost>
4914 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4915             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4916 Type *ScalarTy, VectorType *VecTy);
4917
4918/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4919 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
4920/// subvector pattern.
4921static InstructionCost
4922 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4923                VectorType *Tp, ArrayRef<int> Mask = {},
4924                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4925                int Index = 0, VectorType *SubTp = nullptr,
4926                ArrayRef<const Value *> Args = {}) {
4927   if (Kind != TTI::SK_PermuteTwoSrc)
4928 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4929 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4930 int NumSubElts;
4931   if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
4932           Mask, NumSrcElts, NumSubElts, Index)) {
4933 if (Index + NumSubElts > NumSrcElts &&
4934 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4935 return TTI.getShuffleCost(
4936         TTI::SK_InsertSubvector,
4937         getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4938         TTI::TCK_RecipThroughput, Index, Tp);
4939   }
4940 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4941}
4942
4943 BoUpSLP::LoadsState
4944 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
4945                            SmallVectorImpl<unsigned> &Order,
4946                            SmallVectorImpl<Value *> &PointerOps,
4947 unsigned *BestVF, bool TryRecursiveCheck) const {
4948 // Check that a vectorized load would load the same memory as a scalar
4949 // load. For example, we don't want to vectorize loads that are smaller
4950   // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4951 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4952 // from such a struct, we read/write packed bits disagreeing with the
4953 // unvectorized version.
4954 if (BestVF)
4955 *BestVF = 0;
4957 return LoadsState::Gather;
4958 Type *ScalarTy = VL0->getType();
4959
4960 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4961 return LoadsState::Gather;
4962
4963 // Make sure all loads in the bundle are simple - we can't vectorize
4964 // atomic or volatile loads.
4965 PointerOps.clear();
4966 const unsigned Sz = VL.size();
4967 PointerOps.resize(Sz);
4968 auto *POIter = PointerOps.begin();
4969 for (Value *V : VL) {
4970 auto *L = dyn_cast<LoadInst>(V);
4971 if (!L || !L->isSimple())
4972 return LoadsState::Gather;
4973 *POIter = L->getPointerOperand();
4974 ++POIter;
4975 }
4976
4977 Order.clear();
4978 // Check the order of pointer operands or that all pointers are the same.
4979 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4980
4981 auto *VecTy = getWidenedType(ScalarTy, Sz);
4982 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4983 if (!IsSorted) {
4984 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
4985 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4986 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4987         return LoadsState::StridedVectorize;
4988     }
4989
4990 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
4991 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
4992 return LoadsState::Gather;
4993
4994 if (!all_of(PointerOps, [&](Value *P) {
4995 return arePointersCompatible(P, PointerOps.front(), *TLI);
4996 }))
4997 return LoadsState::Gather;
4998
4999 } else {
5000 Value *Ptr0;
5001 Value *PtrN;
5002 if (Order.empty()) {
5003 Ptr0 = PointerOps.front();
5004 PtrN = PointerOps.back();