//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is less frequently used,
// hence the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}

/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
/// which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}
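// Worked example (illustrative, assuming a target whose legal vector register
// holds 128 bits): for Ty = i32 and Sz = 6, the widened type <6 x i32>
// legalizes into NumParts = 2 registers, so the function returns
// bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8 elements.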

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
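// Worked example (illustrative, same 128-bit register assumption): for
// Ty = i32 and Sz = 7, NumParts = 2 and RegVF = bit_ceil(divideCeil(7, 2)) = 4,
// so the function returns (7 / 4) * 4 = 4 elements.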

static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an "element".
  // But the element has a different meaning for SLP (scalar) and REVEC
  // (vector). We need to expand Mask into masks which shufflevector can use
  // directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
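// Illustrative example: with VecTyNumElements = 2 and Mask = {1, 0}, the
// expanded mask becomes {2, 3, 0, 1}, i.e. each scalar index is widened into a
// run of per-lane vector indices.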

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of each shufflevector is isExtractSubvectorMask.
/// 3. The masks of all shufflevectors together use all of the elements of the
///    source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 group
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns power-of-2 number of elements in a single register (part), given the
/// total number of elements \p Size and number of registers (parts) \p
/// NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}

/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
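// Worked example (illustrative): for Size = 6 split into NumParts = 2,
// getPartNumElems(6, 2) = min(6, bit_ceil(3)) = 4; the second part then holds
// getNumElems(6, 4, /*Part=*/1) = min(4, 6 - 4) = 2 remaining elements.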

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  Instruction *I0 = cast<Instruction>(*It);
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal integer/FP
  // constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}

template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
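// Illustrative example: for `insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2`
// the walk multiplies by each aggregate size and adds each index, giving
// (0 * 2 + 1) * 3 + 2 = 5 as the flattened insertion index.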

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
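// Illustrative example: with VF = 4, Mask = {0, 5, PoisonMaskElem, 2} and
// UseMask::FirstArg, bits 0 and 2 are cleared (those lanes of the first
// argument are consumed by the mask), leaving only bits 1 and 3 set.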

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

namespace {

/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

} // end anonymous namespace

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, i.e. the Opcode with which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if consider swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  // Skip MainOp.
  for (Value *V : iterator_range(It + 1, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    // TODO: do some smart analysis of the CallInsts to exclude divide-like
    // intrinsics/functions only.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltOp = I;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(MainOp, AltOp);
}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}

/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
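// Illustrative example: composing Mask = {2, 0, 1} with SubMask = {1, 2, 0}
// yields NewMask[I] = Mask[SubMask[I]], i.e. {0, 1, 2}; poison elements in the
// sub-mask stay poison.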

/// Order may have elements assigned a special value (size) which is out of
/// bounds. Such indices only appear in places which correspond to undef values
/// (see canReuseExtract for details) and are used in order to prevent undef
/// values from affecting the operands ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices for undef values positions.
/// As an example below Order has two undef positions and they have assigned
/// values 3 and 7 respectively:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}
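// Illustrative example: for VL = {add, sub, add, sub} with scalar (non-vector)
// elements and Opcode1 = Instruction::Sub, bits 1 and 3 end up set, i.e. the
// bitset marks the lanes that use the alternate opcode.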

namespace llvm {

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
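// Illustrative example: Indices = {2, 0, 1} produces Mask = {1, 2, 0}, since
// element I of the input moves to position Indices[I].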

/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
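// Illustrative example: Scalars = {a, b, c} with Mask = {2, 0, 1} becomes
// {b, c, a}: each original element Prev[I] is placed at position Mask[I].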

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V) {
  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
}

/// Checks if the specified array of instructions does not require scheduling.
/// It is so if either all instructions have operands that do not require
/// scheduling, or their users do not require scheduling since they are phis or
/// in other basic blocks.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if widened type of \p Ty elements with size \p Sz represents
/// full vector type, i.e. adding extra element results in extra parts upon type
/// legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (Sz <= 1)
    return false;
  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}
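// Worked example (illustrative, assuming 128-bit vector registers): for i32
// elements, Sz = 12 splits into NumParts = 3 whole <4 x i32> registers
// (12 / 3 = 4 is a power of two), so the function returns true; Sz = 6 with
// NumParts = 2 gives 6 / 2 = 3, which is not a power of two, so it returns
// false.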

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  class ShuffleCostEstimator;
  class ShuffleInstructionBuilder;

public:
  /// Tracks the state we can represent the loads in the given sequence.
  enum class LoadsState {
    Gather,
    Vectorize,
    ScatterVectorize,
    StridedVectorize
  };

  using ValueList = SmallVector<Value *, 8>;
  using InstrList = SmallVector<Instruction *, 16>;
  using ValueSet = SmallPtrSet<Value *, 16>;
  using StoreList = SmallVector<StoreInst *, 8>;
  using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
  using OrdersType = SmallVector<unsigned, 4>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  Value *
  vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                Instruction *ReductionRoot = nullptr);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  InstructionCost getSpillCost() const;

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 const SmallDenseSet<Value *> &UserIgnoreLst);

  /// Construct a vectorizable tree that starts at \p Roots.
  void buildTree(ArrayRef<Value *> Roots);

  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info for the root node in the graph without
  /// casting.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
                                             It->second.first),
                            It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns reduction type after minbitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }

  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users. \p
  /// ExternallyUsedValues contains additional list of external uses to handle
  /// vectorization of reductions.
  void
  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

  /// Transforms graph nodes to target specific representations, if profitable.
  void transformNodes();

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Returns the base graph size, before any transformations.
  unsigned getCanonicalGraphSize() const { return BaseGraphSize; }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// Does this non-empty order represent an identity order? Identity
  /// should be represented as an empty order, so this is used to
  /// decide if we can canonicalize a computed order. Undef elements
  /// (represented as size) are ignored.
  bool isIdentityOrder(ArrayRef<unsigned> Order) const {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }

  /// Checks if the specified gather tree entry \p TE can be represented as a
  /// shuffled vector entry + (possibly) permutation with other gathers. It
  /// implements the checks only for possibly ordered scalars (Loads,
  /// ExtractElement, ExtractValue), which can be part of the graph.
  std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);

  /// Sort loads into increasing pointers offsets to allow greater clustering.
  std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);

  /// Gets reordering data for the given tree entry. If the entry is vectorized
  /// - just return ReorderIndices, otherwise check if the scalars can be
  /// reordered and return the most optimal order.
  /// \return std::nullopt if ordering is not important, empty order, if
  /// identity order is important, or the actual order.
  /// \param TopToBottom If true, include the order of vectorized stores and
  /// insertelement nodes, otherwise skip them.
  std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
                                              bool TopToBottom);

  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of subgraph with smaller VF and they are reordered independently. We
  /// can do this because we still need to extend smaller nodes to the wider VF
  /// and we can merge reordering shuffles with the widening shuffles.
  void reorderTopToBottom();

  /// Reorders the current graph to the most profitable order starting from
  /// leaves to the root. It allows rotating small subgraphs and reducing the
  /// number of reshuffles if the leaf nodes use the same order. In this case we
  /// can merge the orders and just shuffle user node instead of shuffling its
  /// operands. Plus, even if the leaf nodes have different orders, it allows
  /// sinking reordering in the graph closer to the root node and merging it
  /// later during analysis.
  void reorderBottomToTop(bool IgnoreReorder = false);

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMinVF(unsigned Sz) const {
    return std::max(2U, getMinVecRegSize() / Sz);
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }
1573
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1578 ///
1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1580 unsigned canMapToVector(Type *T) const;
1581
1582 /// \returns True if the VectorizableTree is both tiny and not fully
1583 /// vectorizable. We do not vectorize such trees.
1584 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1585
1586 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1587 /// It may happen if all gather nodes are loads and they cannot be
1588 /// "clusterized". In this case even subgraphs cannot be vectorized more
1589 /// effectively than the base graph.
1590 bool isTreeNotExtendable() const;
1591
1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1593 /// can be load combined in the backend. Load combining may not be allowed in
1594 /// the IR optimizer, so we do not want to alter the pattern. For example,
1595 /// partially transforming a scalar bswap() pattern into vector code is
1596 /// effectively impossible for the backend to undo.
1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1598 /// may not be necessary.
1599 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1600
1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1602 /// can be load combined in the backend. Load combining may not be allowed in
1603 /// the IR optimizer, so we do not want to alter the pattern. For example,
1604 /// partially transforming a scalar bswap() pattern into vector code is
1605 /// effectively impossible for the backend to undo.
1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1607 /// may not be necessary.
1608 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1609
1610 /// Checks if the given array of loads can be represented as a vectorized load,
1611 /// a scatter-vectorized (masked gather) load, or just a simple gather.
1612 /// \param VL list of loads.
1613 /// \param VL0 main load value.
1614 /// \param Order returned order of load instructions.
1615 /// \param PointerOps returned list of pointer operands.
1616 /// \param BestVF return best vector factor, if recursive check found better
1617 /// vectorization sequences rather than masked gather.
1618 /// \param TryRecursiveCheck used to check if long masked gather can be
1619 /// represented as a series of loads/insert subvectors, if profitable.
1620 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1621 SmallVectorImpl<unsigned> &Order,
1622 SmallVectorImpl<Value *> &PointerOps,
1623 unsigned *BestVF = nullptr,
1624 bool TryRecursiveCheck = true) const;
1625
1626 /// Registers a non-vectorizable sequence of loads.
1627 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1628 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1629 }
1630
1631 /// Checks if the given sequence of loads is known to be non-vectorizable.
1632 template <typename T>
1633 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1634 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1635 }
1636
1638
1639 /// This structure holds any data we need about the edges being traversed
1640 /// during buildTree_rec(). We keep track of:
1641 /// (i) the user TreeEntry index, and
1642 /// (ii) the index of the edge.
1643 struct EdgeInfo {
1644 EdgeInfo() = default;
1645 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1646 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1647 /// The user TreeEntry.
1648 TreeEntry *UserTE = nullptr;
1649 /// The operand index of the use.
1650 unsigned EdgeIdx = UINT_MAX;
1651#ifndef NDEBUG
1652 friend inline raw_ostream &operator<<(raw_ostream &OS,
1653 const BoUpSLP::EdgeInfo &EI) {
1654 EI.dump(OS);
1655 return OS;
1656 }
1657 /// Debug print.
1658 void dump(raw_ostream &OS) const {
1659 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1660 << " EdgeIdx:" << EdgeIdx << "}";
1661 }
1662 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1663#endif
1664 bool operator == (const EdgeInfo &Other) const {
1665 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1666 }
1667 };
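 // An assumed example of what an EdgeInfo denotes while buildTree_rec()
 // walks the use-def chains:
 //   user entry (Idx == 3):  %a0 = add i32 %x0, %y0
 //                           %a1 = add i32 %x1, %y1
 //   the operand bundle {%y0, %y1} is reached through operand 1 of that
 //   entry, i.e. EdgeInfo{/*UserTE=*/Entry3, /*EdgeIdx=*/1}, which dumps
 //   as "{User:3 EdgeIdx:1}".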
1668
1669 /// A helper class used for scoring candidates for two consecutive lanes.
1670 class LookAheadHeuristics {
1671 const TargetLibraryInfo &TLI;
1672 const DataLayout &DL;
1673 ScalarEvolution &SE;
1674 const BoUpSLP &R;
1675 int NumLanes; // Total number of lanes (aka vectorization factor).
1676 int MaxLevel; // The maximum recursion depth for accumulating score.
1677
1678 public:
1679 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1680 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1681 int MaxLevel)
1682 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1684
1685 // The hard-coded scores listed here are not very important, though they
1686 // should be higher for better matches to improve the resulting cost. When
1687 // computing the scores of matching one sub-tree with another, we are
1688 // basically counting the number of values that are matching. So even if all
1689 // scores are set to 1, we would still get a decent matching result.
1690 // However, sometimes we have to break ties. For example we may have to
1691 // choose between matching loads vs matching opcodes. This is what these
1692 // scores are helping us with: they provide the order of preference. Also,
1693 // this is important if the scalar is externally used or used in another
1694 // tree entry node in a different lane.
1695
1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1697 static const int ScoreConsecutiveLoads = 4;
1698 /// The same load multiple times. This should have a better score than
1699 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1700 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1701 /// for a vector load plus 1.0 for a broadcast.
1702 static const int ScoreSplatLoads = 3;
1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1704 static const int ScoreReversedLoads = 3;
1705 /// A load candidate for masked gather.
1706 static const int ScoreMaskedGatherCandidate = 1;
1707 /// ExtractElementInst from same vector and consecutive indexes.
1708 static const int ScoreConsecutiveExtracts = 4;
1709 /// ExtractElementInst from same vector and reversed indices.
1710 static const int ScoreReversedExtracts = 3;
1711 /// Constants.
1712 static const int ScoreConstants = 2;
1713 /// Instructions with the same opcode.
1714 static const int ScoreSameOpcode = 2;
1715 /// Instructions with alt opcodes (e.g., add + sub).
1716 static const int ScoreAltOpcodes = 1;
1717 /// Identical instructions (a.k.a. splat or broadcast).
1718 static const int ScoreSplat = 1;
1719 /// Matching with an undef is preferable to failing.
1720 static const int ScoreUndef = 1;
1721 /// Score for failing to find a decent match.
1722 static const int ScoreFail = 0;
1723 /// Score if all users are vectorized.
1724 static const int ScoreAllUserVectorized = 1;
1725
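 // A few example pairs and the preference the constants above express
 // (the IR values are assumed; the mapping follows getShallowScore() below):
 //   load A[i]   vs. load A[i+1]  -> ScoreConsecutiveLoads (4)
 //   load A[i+1] vs. load A[i]    -> ScoreReversedLoads    (3)
 //   two constants                -> ScoreConstants        (2)
 //   add         vs. sub          -> ScoreAltOpcodes       (1)
 //   no decent match              -> ScoreFail             (0)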
1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1729 /// MainAltOps.
1730 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1731 ArrayRef<Value *> MainAltOps) const {
1732 if (!isValidElementType(V1->getType()) ||
1733 !isValidElementType(V2->getType()))
1734 return LookAheadHeuristics::ScoreFail;
1735
1736 if (V1 == V2) {
1737 if (isa<LoadInst>(V1)) {
1738 // Returns true if the users of V1 and V2 won't need to be extracted.
1739 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1740 // Bail out if we have too many uses to save compilation time.
1741 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1742 return false;
1743
1744 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1745 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1747 });
1748 };
1749 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1750 };
1751 // A broadcast of a load can be cheaper on some targets.
1752 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1753 ElementCount::getFixed(NumLanes)) &&
1754 ((int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1756 return LookAheadHeuristics::ScoreSplatLoads;
1757 }
1758 return LookAheadHeuristics::ScoreSplat;
1759 }
1760
1761 auto CheckSameEntryOrFail = [&]() {
1762 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1764 return LookAheadHeuristics::ScoreSplatLoads;
1765 return LookAheadHeuristics::ScoreFail;
1766 };
1767
1768 auto *LI1 = dyn_cast<LoadInst>(V1);
1769 auto *LI2 = dyn_cast<LoadInst>(V2);
1770 if (LI1 && LI2) {
1771 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1772 !LI2->isSimple())
1773 return CheckSameEntryOrFail();
1774
1775 std::optional<int> Dist = getPointersDiff(
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1778 if (!Dist || *Dist == 0) {
1779 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1780 getUnderlyingObject(LI2->getPointerOperand()) &&
1781 R.TTI->isLegalMaskedGather(
1782 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1783 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1784 return CheckSameEntryOrFail();
1785 }
1786 // The distance is too large - still may be profitable to use masked
1787 // loads/gathers.
1788 if (std::abs(*Dist) > NumLanes / 2)
1789 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1790 // This will still detect consecutive loads, but we might have "holes"
1791 // in some cases. It is ok for non-power-2 vectorization and may produce
1792 // better results. It should not affect current vectorization.
1793 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1794 : LookAheadHeuristics::ScoreReversedLoads;
1795 }
1796
1797 auto *C1 = dyn_cast<Constant>(V1);
1798 auto *C2 = dyn_cast<Constant>(V2);
1799 if (C1 && C2)
1800 return LookAheadHeuristics::ScoreConstants;
1801
1802 // Extracts from consecutive indexes of the same vector get a better score
1803 // as the extracts could be optimized away.
1804 Value *EV1;
1805 ConstantInt *Ex1Idx;
1806 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1807 // Undefs are always profitable for extractelements.
1808 // Compiler can easily combine poison and extractelement <non-poison> or
1809 // undef and extractelement <poison>. But combining undef +
1810 // extractelement <non-poison-but-may-produce-poison> requires some
1811 // extra operations.
1812 if (isa<UndefValue>(V2))
1813 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1814 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1815 : LookAheadHeuristics::ScoreSameOpcode;
1816 Value *EV2 = nullptr;
1817 ConstantInt *Ex2Idx = nullptr;
1818 if (match(V2,
1819 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1820 m_Undef())))) {
1821 // Undefs are always profitable for extractelements.
1822 if (!Ex2Idx)
1823 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1824 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1825 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1826 if (EV2 == EV1) {
1827 int Idx1 = Ex1Idx->getZExtValue();
1828 int Idx2 = Ex2Idx->getZExtValue();
1829 int Dist = Idx2 - Idx1;
1830 // The distance is too large - still may be profitable to use
1831 // shuffles.
1832 if (std::abs(Dist) == 0)
1833 return LookAheadHeuristics::ScoreSplat;
1834 if (std::abs(Dist) > NumLanes / 2)
1835 return LookAheadHeuristics::ScoreSameOpcode;
1836 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1837 : LookAheadHeuristics::ScoreReversedExtracts;
1838 }
1839 return LookAheadHeuristics::ScoreAltOpcodes;
1840 }
1841 return CheckSameEntryOrFail();
1842 }
1843
1844 auto *I1 = dyn_cast<Instruction>(V1);
1845 auto *I2 = dyn_cast<Instruction>(V2);
1846 if (I1 && I2) {
1847 if (I1->getParent() != I2->getParent())
1848 return CheckSameEntryOrFail();
1849 SmallVector<Value *, 4> Ops(MainAltOps);
1850 Ops.push_back(I1);
1851 Ops.push_back(I2);
1852 InstructionsState S = getSameOpcode(Ops, TLI);
1853 // Note: Only consider instructions with <= 2 operands to avoid
1854 // complexity explosion.
1855 if (S &&
1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1857 !S.isAltShuffle()) &&
1858 all_of(Ops, [&S](Value *V) {
1859 return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1862 }))
1863 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1864 : LookAheadHeuristics::ScoreSameOpcode;
1865 }
1866
1867 if (I1 && isa<PoisonValue>(V2))
1868 return LookAheadHeuristics::ScoreSameOpcode;
1869
1870 if (isa<UndefValue>(V2))
1871 return LookAheadHeuristics::ScoreUndef;
1872
1873 return CheckSameEntryOrFail();
1874 }
1875
1876 /// Go through the operands of \p LHS and \p RHS recursively until
1877 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1879 /// of \p U1 and \p U2), except at the beginning of the recursion where
1880 /// these are set to nullptr.
1881 ///
1882 /// For example:
1883 /// \verbatim
1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1885 /// \ / \ / \ / \ /
1886 /// + + + +
1887 /// G1 G2 G3 G4
1888 /// \endverbatim
1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1890 /// each level recursively, accumulating the score. It starts from matching
1891 /// the additions at level 0, then moves on to the loads (level 1). The
1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1895 /// Please note that the order of the operands does not matter, as we
1896 /// evaluate the score of all profitable combinations of operands. In
1897 /// other words the score of G1 and G4 is the same as G1 and G2. This
1898 /// heuristic is based on ideas described in:
1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1901 /// Luís F. W. Góes
1902 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1903 Instruction *U2, int CurrLevel,
1904 ArrayRef<Value *> MainAltOps) const {
1905
1906 // Get the shallow score of V1 and V2.
1907 int ShallowScoreAtThisLevel =
1908 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1909
1910 // If reached MaxLevel,
1911 // or if V1 and V2 are not instructions,
1912 // or if they are SPLAT,
1913 // or if they are not consecutive,
1914 // or if profitable to vectorize loads or extractelements, early return
1915 // the current cost.
1916 auto *I1 = dyn_cast<Instruction>(LHS);
1917 auto *I2 = dyn_cast<Instruction>(RHS);
1918 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1919 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924 return ShallowScoreAtThisLevel;
1925 assert(I1 && I2 && "Should have early exited.");
1926
1927 // Contains the I2 operand indexes that got matched with I1 operands.
1928 SmallSet<unsigned, 4> Op2Used;
1929
1930 // Recursion towards the operands of I1 and I2. We are trying all possible
1931 // operand pairs, and keeping track of the best score.
1932 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934 // Try to pair op1I with the best operand of I2.
1935 int MaxTmpScore = 0;
1936 unsigned MaxOpIdx2 = 0;
1937 bool FoundBest = false;
1938 // If I2 is commutative try all combinations.
1939 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1940 unsigned ToIdx = isCommutative(I2)
1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943 assert(FromIdx <= ToIdx && "Bad index");
1944 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945 // Skip operands already paired with OpIdx1.
1946 if (Op2Used.count(OpIdx2))
1947 continue;
1948 // Recursively calculate the cost at each level
1949 int TmpScore =
1950 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1951 I1, I2, CurrLevel + 1, {});
1952 // Look for the best score.
1953 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1956 MaxOpIdx2 = OpIdx2;
1957 FoundBest = true;
1958 }
1959 }
1960 if (FoundBest) {
1961 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1962 Op2Used.insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1964 }
1965 }
1966 return ShallowScoreAtThisLevel;
1967 }
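 // Tracing the G1/G2 example from the comment above through this routine,
 // assuming MaxLevel >= 2 and that all values sit in one basic block:
 //   getScoreAtLevelRec(G1, G2, nullptr, nullptr, /*CurrLevel=*/1, {})
 //     shallow score of the two adds     -> ScoreSameOpcode       (2)
 //     best pairing of A[0] with A[1]    -> ScoreConsecutiveLoads (+4)
 //     best pairing of B[0] with B[1]    -> ScoreConsecutiveLoads (+4)
 //     accumulated score                  = 10
 //   whereas G1 vs. G3 only gets the opcode match: 2 + 0 + 0 = 2.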
1968 };
1969 /// A helper data structure to hold the operands of a vector of instructions.
1970 /// This supports a fixed vector length for all operand vectors.
1971 class VLOperands {
1972 /// For each operand we need (i) the value, and (ii) the opcode that it
1973 /// would be attached to if the expression was in a left-linearized form.
1974 /// This is required to avoid illegal operand reordering.
1975 /// For example:
1976 /// \verbatim
1977 /// 0 Op1
1978 /// |/
1979 /// Op1 Op2 Linearized + Op2
1980 /// \ / ----------> |/
1981 /// - -
1982 ///
1983 /// Op1 - Op2 (0 + Op1) - Op2
1984 /// \endverbatim
1985 ///
1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1987 ///
1988 /// Another way to think of this is to track all the operations across the
1989 /// path from the operand all the way to the root of the tree and to
1990 /// calculate the operation that corresponds to this path. For example, the
1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1992 /// corresponding operation is a '-' (which matches the one in the
1993 /// linearized tree, as shown above).
1994 ///
1995 /// For lack of a better term, we refer to this operation as Accumulated
1996 /// Path Operation (APO).
1997 struct OperandData {
1998 OperandData() = default;
1999 OperandData(Value *V, bool APO, bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value.
2002 Value *V = nullptr;
2003 /// TreeEntries only allow a single opcode, or an alternate sequence of
2004 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2007 /// (e.g., Add/Mul)
2008 bool APO = false;
2009 /// Helper data for the reordering function.
2010 bool IsUsed = false;
2011 };
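 // A tiny assumed example of APO assignment under this linearization:
 //   %s = sub i32 %a, %b      ; left-linearized: (0 + %a) - %b
 //     operand 0 (%a): APO == false  (attached to '+')
 //     operand 1 (%b): APO == true   (attached to '-')
 //   %t = add i32 %c, %d
 //     operands 0 and 1: APO == false (add is commutative)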
2012
2013 /// During operand reordering, we are trying to select the operand at lane
2014 /// that matches best with the operand at the neighboring lane. Our
2015 /// selection is based on the type of value we are looking for. For example,
2016 /// if the neighboring lane has a load, we need to look for a load that is
2017 /// accessing a consecutive address. These strategies are summarized in the
2018 /// 'ReorderingMode' enumerator.
2019 enum class ReorderingMode {
2020 Load, ///< Matching loads to consecutive memory addresses
2021 Opcode, ///< Matching instructions based on opcode (same or alternate)
2022 Constant, ///< Matching constants
2023 Splat, ///< Matching the same instruction multiple times (broadcast)
2024 Failed, ///< We failed to create a vectorizable group
2025 };
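 // How a mode is typically chosen from the operand of the first lane
 // (this mirrors the initialization loop in reorder() further below):
 //   LoadInst                                      -> ReorderingMode::Load
 //   other Instruction, not a broadcast candidate  -> ReorderingMode::Opcode
 //   Constant                                      -> ReorderingMode::Constant
 //   Argument, or shouldBroadcast() returns true   -> ReorderingMode::Splat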
2026
2027 using OperandDataVec = SmallVector<OperandData, 2>;
2028
2029 /// A vector of operand vectors.
2030 SmallVector<OperandDataVec, 4> OpsVec;
2031 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2033 unsigned ArgSize = 0;
2034
2035 const TargetLibraryInfo &TLI;
2036 const DataLayout &DL;
2037 ScalarEvolution &SE;
2038 const BoUpSLP &R;
2039 const Loop *L = nullptr;
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane.
2042 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2043 return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2047 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2048 return OpsVec[OpIdx][Lane];
2049 }
2050
2051 /// Clears the used flag for all entries.
2052 void clearUsed() {
2053 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2056 ++Lane)
2057 OpsVec[OpIdx][Lane].IsUsed = false;
2058 }
2059
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2061 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2062 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2063 }
2064
2065 /// \param Lane lane of the operands under analysis.
2066 /// \param OpIdx operand index in \p Lane lane we're looking for the best
2067 /// candidate for.
2068 /// \param Idx operand index of the current candidate value.
2069 /// \returns The additional score due to possible broadcasting of the
2070 /// elements in the lane. It is more profitable to have a power-of-2 number
2071 /// of unique elements in the lane, since it will be vectorized with higher
2072 /// probability after removing duplicates. Currently the SLP vectorizer
2073 /// supports only vectorization of a power-of-2 number of unique scalars.
2074 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2075 const SmallBitVector &UsedLanes) const {
2076 Value *IdxLaneV = getData(Idx, Lane).V;
2077 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2079 return 0;
2080 SmallDenseMap<Value *, unsigned, 4> Uniques;
2081 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2082 if (Ln == Lane)
2083 continue;
2084 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085 if (!isa<Instruction>(OpIdxLnV))
2086 return 0;
2087 Uniques.try_emplace(OpIdxLnV, Ln);
2088 }
2089 unsigned UniquesCount = Uniques.size();
2090 auto IdxIt = Uniques.find(IdxLaneV);
2091 unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2093 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2095 unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2097 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2098 return 0;
2099 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2102 bit_floor(UniquesCntWithOpIdxLaneV)) -
2103 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2105 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2106 }
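 // A worked instance of the arithmetic above (lane contents are assumed):
 //   4 lanes; operand OpIdx in the other three lanes holds {X, X, Y}, so
 //   Uniques = {X, Y}. Candidate IdxLaneV == X (already present, its lane
 //   not yet used); current OpIdxLaneV == Z (new).
 //     UniquesCntWithIdxLaneV   = 2  (stays a power of 2)
 //     UniquesCntWithOpIdxLaneV = 3  (breaks the power of 2)
 //     score = min(bit_ceil(3) - 3, 3 - bit_floor(3)) - (bit_ceil(2) - 2)
 //           = min(1, 1) - 0 = +1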
2107
2108 /// \param Lane lane of the operands under analysis.
2109 /// \param OpIdx operand index in \p Lane lane we're looking for the best
2110 /// candidate for.
2111 /// \param Idx operand index of the current candidate value.
2112 /// \returns The additional score for the scalar whose users are all
2113 /// vectorized.
2114 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2115 Value *IdxLaneV = getData(Idx, Lane).V;
2116 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117 // Do not care about number of uses for vector-like instructions
2118 // (extractelement/extractvalue with constant indices), they are extracts
2119 // themselves and already externally used. Vectorization of such
2120 // instructions does not add extra extractelement instruction, just may
2121 // remove it.
2122 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2123 isVectorLikeInstWithConstOps(OpIdxLaneV))
2124 return LookAheadHeuristics::ScoreAllUserVectorized;
2125 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2127 return 0;
2128 return R.areAllUsersVectorized(IdxLaneI)
2129 ? LookAheadHeuristics::ScoreAllUserVectorized
2130 : 0;
2131 }
2132
2133 /// Score scaling factor for fully compatible instructions but with
2134 /// different number of external uses. Allows better selection of the
2135 /// instructions with less external uses.
2136 static const int ScoreScaleFactor = 10;
2137
2138 /// \Returns the look-ahead score, which tells us how much the sub-trees
2139 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2140 /// score. This helps break ties in an informed way when we cannot decide on
2141 /// the order of the operands by just considering the immediate
2142 /// predecessors.
2143 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2144 int Lane, unsigned OpIdx, unsigned Idx,
2145 bool &IsUsed, const SmallBitVector &UsedLanes) {
2146 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2147 LookAheadMaxDepth);
2148 // Keep track of the instruction stack as we recurse into the operands
2149 // during the look-ahead score exploration.
2150 int Score =
2151 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2152 /*CurrLevel=*/1, MainAltOps);
2153 if (Score) {
2154 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2155 if (Score <= -SplatScore) {
2156 // Failed score.
2157 Score = 0;
2158 } else {
2159 Score += SplatScore;
2160 // Scale score to see the difference between different operands
2161 // and similar operands but all vectorized/not all vectorized
2162 // uses. It does not affect actual selection of the best
2163 // compatible operand in general, just allows to select the
2164 // operand with all vectorized uses.
2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx, Idx);
2167 IsUsed = true;
2168 }
2169 }
2170 return Score;
2171 }
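 // A small numeric trace of the combination above (inputs are assumed):
 //   getScoreAtLevelRec(...) -> 10 (opcode match plus two consecutive-load
 //   pairings), getSplatScore(...) -> 0:
 //     Score = (10 + 0) * ScoreScaleFactor = 100
 //     Score += getExternalUseScore(...)   // +1 if all users of the
 //                                         // candidate are vectorized
 //   so fully-internal candidates win ties at 101 vs. 100.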
2172
2173 /// Best defined scores per lane between the passes. Used to choose the
2174 /// best operand (with the highest score) between the passes.
2175 /// The key - {Operand Index, Lane}.
2176 /// The value - the best score between the passes for the lane and the
2177 /// operand.
2178 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2179 BestScoresPerLanes;
2180
2181 // Search all operands in Ops[*][Lane] for the one that matches best
2182 // Ops[OpIdx][LastLane] and return its operand index.
2183 // If no good match can be found, return std::nullopt.
2184 std::optional<unsigned>
2185 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2186 ArrayRef<ReorderingMode> ReorderingModes,
2187 ArrayRef<Value *> MainAltOps,
2188 const SmallBitVector &UsedLanes) {
2189 unsigned NumOperands = getNumOperands();
2190
2191 // The operand of the previous lane at OpIdx.
2192 Value *OpLastLane = getData(OpIdx, LastLane).V;
2193
2194 // Our strategy mode for OpIdx.
2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196 if (RMode == ReorderingMode::Failed)
2197 return std::nullopt;
2198
2199 // The linearized opcode of the operand at OpIdx, Lane.
2200 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2201
2202 // The best operand index and its score.
2203 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2204 // are using the score to differentiate between the two.
2205 struct BestOpData {
2206 std::optional<unsigned> Idx;
2207 unsigned Score = 0;
2208 } BestOp;
2209 BestOp.Score =
2210 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2211 .first->second;
2212
2213 // Track if the operand must be marked as used. If the operand is set to
2214 // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
2215 // want to reestimate the operands again on the following iterations.
2216 bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219 // Iterate through all unused operands and look for the best.
2220 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2221 // Get the operand at Idx and Lane.
2222 OperandData &OpData = getData(Idx, Lane);
2223 Value *Op = OpData.V;
2224 bool OpAPO = OpData.APO;
2225
2226 // Skip already selected operands.
2227 if (OpData.IsUsed)
2228 continue;
2229
2230 // Skip if we are trying to move the operand to a position with a
2231 // different opcode in the linearized tree form. This would break the
2232 // semantics.
2233 if (OpAPO != OpIdxAPO)
2234 continue;
2235
2236 // Look for an operand that matches the current mode.
2237 switch (RMode) {
2238 case ReorderingMode::Load:
2239 case ReorderingMode::Opcode: {
2240 bool LeftToRight = Lane > LastLane;
2241 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2242 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2243 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx, Idx, IsUsed, UsedLanes);
2245 if (Score > static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2247 Idx == OpIdx)) {
2248 BestOp.Idx = Idx;
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2251 }
2252 break;
2253 }
2254 case ReorderingMode::Constant:
2255 if (isa<Constant>(Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2257 BestOp.Idx = Idx;
2258 if (isa<Constant>(Op)) {
2259 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2260 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2261 LookAheadHeuristics::ScoreConstants;
2262 }
2263 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2264 IsUsed = false;
2265 }
2266 break;
2267 case ReorderingMode::Splat:
2268 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2269 IsUsed = Op == OpLastLane;
2270 if (Op == OpLastLane) {
2271 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2273 LookAheadHeuristics::ScoreSplat;
2274 }
2275 BestOp.Idx = Idx;
2276 }
2277 break;
2278 case ReorderingMode::Failed:
2279 llvm_unreachable("Not expected Failed reordering mode.");
2280 }
2281 }
2282
2283 if (BestOp.Idx) {
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2285 return BestOp.Idx;
2286 }
2287 // If we could not find a good match return std::nullopt.
2288 return std::nullopt;
2289 }
2290
2291 /// Helper for reorderOperandVecs.
2292 /// \returns the lane that we should start reordering from. This is the one
2293 /// which has the least number of operands that can freely move about, or is
2294 /// less profitable because it already has the most optimal set of operands.
2295 unsigned getBestLaneToStartReordering() const {
2296 unsigned Min = UINT_MAX;
2297 unsigned SameOpNumber = 0;
2298 // std::pair<unsigned, unsigned> is used to implement a simple voting
2299 // algorithm and choose the lane with the least number of operands that
2300 // can freely move about or less profitable because it already has the
2301 // most optimal set of operands. The first unsigned is a counter for
2302 // voting, the second unsigned is the counter of lanes with instructions
2303 // with same/alternate opcodes and same parent basic block.
2304 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
2305 // Try to be closer to the original results, if we have multiple lanes
2306 // with same cost. If 2 lanes have the same cost, use the one with the
2307 // highest index.
2308 for (int I = getNumLanes(); I > 0; --I) {
2309 unsigned Lane = I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312 // Compare the number of operands that can move and choose the one with
2313 // the least number.
2314 if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2317 HashMap.clear();
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321 // Select the most optimal lane in terms of number of operands that
2322 // should be moved around.
2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327 auto [It, Inserted] =
2328 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2329 if (!Inserted)
2330 ++It->second.first;
2331 }
2332 }
2333 // Select the lane with the minimum counter.
2334 unsigned BestLane = 0;
2335 unsigned CntMin = UINT_MAX;
2336 for (const auto &Data : reverse(HashMap)) {
2337 if (Data.second.first < CntMin) {
2338 CntMin = Data.second.first;
2339 BestLane = Data.second.second;
2340 }
2341 }
2342 return BestLane;
2343 }
2344
2345 /// Data structure that helps to reorder operands.
2346 struct OperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be
2348 /// reordered.
2349 unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and
2351 /// parent.
2352 unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering.
2354 /// Used to count operands, actually their position id and opcode
2355 /// value. It is used in the voting mechanism to find the lane with the
2356 /// least number of operands that can freely move about or less profitable
2357 /// because it already has the most optimal set of operands. Can be
2358 /// replaced with SmallVector<unsigned> instead but hash code is faster
2359 /// and requires less memory.
2360 unsigned Hash = 0;
2361 };
2362 /// \returns the maximum number of operands that are allowed to be reordered
2363 /// for \p Lane and the number of compatible instructions (with the same
2364 /// parent/opcode). This is used as a heuristic for selecting the first lane
2365 /// to start operand reordering.
2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2367 unsigned CntTrue = 0;
2368 unsigned NumOperands = getNumOperands();
2369 // Operands with the same APO can be reordered. We therefore need to count
2370 // how many of them we have for each APO, like this: Cnt[APO] = x.
2371 // Since we only have two APOs, namely true and false, we can avoid using
2372 // a map. Instead we can simply count the number of operands that
2373 // correspond to one of them (in this case the 'true' APO), and calculate
2374 // the other by subtracting it from the total number of operands.
2375 // Operands with the same instruction opcode and parent are more
2376 // profitable since we don't need to move them in many cases, with a high
2377 // probability such lane already can be vectorized effectively.
2378 bool AllUndefs = true;
2379 unsigned NumOpsWithSameOpcodeParent = 0;
2380 Instruction *OpcodeI = nullptr;
2381 BasicBlock *Parent = nullptr;
2382 unsigned Hash = 0;
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 const OperandData &OpData = getData(OpIdx, Lane);
2385 if (OpData.APO)
2386 ++CntTrue;
2387 // Use Boyer-Moore majority voting for finding the majority opcode and
2388 // the number of times it occurs.
2389 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2390 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2391 I->getParent() != Parent) {
2392 if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2394 OpcodeI = I;
2395 Parent = I->getParent();
2396 } else {
2397 --NumOpsWithSameOpcodeParent;
2398 }
2399 } else {
2400 ++NumOpsWithSameOpcodeParent;
2401 }
2402 }
2403 Hash = hash_combine(
2404 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2406 }
2407 if (AllUndefs)
2408 return {};
2409 OperandsOrderData Data;
2410 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2412 Data.Hash = Hash;
2413 return Data;
2414 }
2415
2416 /// Go through the instructions in VL and append their operands.
2417 void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
2418 assert(!VL.empty() && "Bad VL");
2419 assert((empty() || VL.size() == getNumLanes()) &&
2420 "Expected same number of lanes");
2421 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2422 // arguments to the intrinsic produces the same result.
2423 constexpr unsigned IntrinsicNumOperands = 2;
2424 unsigned NumOperands = VL0->getNumOperands();
2425 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2426 OpsVec.resize(NumOperands);
2427 unsigned NumLanes = VL.size();
2428 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2429 OpsVec[OpIdx].resize(NumLanes);
2430 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2431 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2432 "Expected instruction or poison value");
2433 // Our tree has just 3 nodes: the root and two operands.
2434 // It is therefore trivial to get the APO. We only need to check the
2435 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2436 // RHS operand. The LHS operand of both add and sub is never attached
2437 // to an inverse operation in the linearized form, therefore its APO
2438 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2439
2440 // Since operand reordering is performed on groups of commutative
2441 // operations or alternating sequences (e.g., +, -), we can safely
2442 // tell the inverse operations by checking commutativity.
2443 if (isa<PoisonValue>(VL[Lane])) {
2444 if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) {
2445 if (OpIdx == 0) {
2446 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
2447 continue;
2448 }
2449 } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) {
2450 if (OpIdx == 0) {
2451 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
2452 continue;
2453 }
2454 }
2455 OpsVec[OpIdx][Lane] = {
2456 PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
2457 false};
2458 continue;
2459 }
2460 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2461 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2462 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2463 APO, false};
2464 }
2465 }
2466 }
2467
2468 /// \returns the number of operands.
2469 unsigned getNumOperands() const { return ArgSize; }
2470
2471 /// \returns the number of lanes.
2472 unsigned getNumLanes() const { return OpsVec[0].size(); }
2473
2474 /// \returns the operand value at \p OpIdx and \p Lane.
2475 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2476 return getData(OpIdx, Lane).V;
2477 }
2478
2479 /// \returns true if the data structure is empty.
2480 bool empty() const { return OpsVec.empty(); }
2481
2482 /// Clears the data.
2483 void clear() { OpsVec.clear(); }
2484
2485 /// \Returns true if there are enough operands identical to \p Op to fill
2486 /// the whole vector (possibly mixed with constants or loop-invariant values).
2487 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2488 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2489 assert(Op == getValue(OpIdx, Lane) &&
2490 "Op is expected to be getValue(OpIdx, Lane).");
2491 // Small number of loads - try load matching.
2492 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2493 return false;
2494 bool OpAPO = getData(OpIdx, Lane).APO;
2495 bool IsInvariant = L && L->isLoopInvariant(Op);
2496 unsigned Cnt = 0;
2497 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2498 if (Ln == Lane)
2499 continue;
2500 // This is set to true if we found a candidate for broadcast at Lane.
2501 bool FoundCandidate = false;
2502 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2503 OperandData &Data = getData(OpI, Ln);
2504 if (Data.APO != OpAPO || Data.IsUsed)
2505 continue;
2506 Value *OpILane = getValue(OpI, Lane);
2507 bool IsConstantOp = isa<Constant>(OpILane);
2508 // Consider the broadcast candidate if:
2509 // 1. Same value is found in one of the operands.
2510 if (Data.V == Op ||
2511 // 2. The operand in the given lane is not constant but there is a
2512 // constant operand in another lane (which can be moved to the
2513 // given lane). In this case we can represent it as a simple
2514 // permutation of constant and broadcast.
2515 (!IsConstantOp &&
2516 ((Lns > 2 && isa<Constant>(Data.V)) ||
2517 // 2.1. If we have only 2 lanes, we need to check that the value in
2518 // the next lane does not build the same opcode sequence.
2519 (Lns == 2 &&
2520 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2521 isa<Constant>(Data.V)))) ||
2522 // 3. The operand in the current lane is loop invariant (can be
2523 // hoisted out) and another operand is also a loop invariant
2524 // (though not a constant). In this case the whole vector can be
2525 // hoisted out.
2526 // FIXME: need to teach the cost model about this case for better
2527 // estimation.
2528 (IsInvariant && !isa<Constant>(Data.V) &&
2529 !getSameOpcode({Op, Data.V}, TLI) &&
2530 L->isLoopInvariant(Data.V))) {
2531 FoundCandidate = true;
2532 Data.IsUsed = Data.V == Op;
2533 if (Data.V == Op)
2534 ++Cnt;
2535 break;
2536 }
2537 }
2538 if (!FoundCandidate)
2539 return false;
2540 }
2541 return getNumLanes() == 2 || Cnt > 1;
2542 }
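 // An assumed example of the broadcast test above (4 lanes, OpIdx == 0):
 //   Lane 0: %x ...   Lane 1: %x ...   Lane 2: %x ...   Lane 3: %x ...
 //   shouldBroadcast(%x, /*OpIdx=*/0, /*Lane=*/0) finds %x among the
 //   operands of every other lane (Cnt == 3 > 1), so %x is considered a
 //   broadcast candidate and the whole operand can become a splat.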
2543
2544 /// Checks if there is at least one operand in a lane other than \p Lane
2545 /// that is compatible with the operand \p Op.
2546 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2547 assert(Op == getValue(OpIdx, Lane) &&
2548 "Op is expected to be getValue(OpIdx, Lane).");
2549 bool OpAPO = getData(OpIdx, Lane).APO;
2550 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2551 if (Ln == Lane)
2552 continue;
2553 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2554 const OperandData &Data = getData(OpI, Ln);
2555 if (Data.APO != OpAPO || Data.IsUsed)
2556 return true;
2557 Value *OpILn = getValue(OpI, Ln);
2558 return (L && L->isLoopInvariant(OpILn)) ||
2559 (getSameOpcode({Op, OpILn}, TLI) &&
2560 allSameBlock({Op, OpILn}));
2561 }))
2562 return true;
2563 }
2564 return false;
2565 }
2566
2567 public:
2568 /// Initialize with all the operands of the instruction vector \p RootVL.
2569 VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
2570 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2571 L(R.LI->getLoopFor((VL0->getParent()))) {
2572 // Append all the operands of RootVL.
2573 appendOperandsOfVL(RootVL, VL0);
2574 }
2575
2576 /// \Returns a value vector with the operands across all lanes for the
2577 /// operand at \p OpIdx.
2578 ValueList getVL(unsigned OpIdx) const {
2579 ValueList OpVL(OpsVec[OpIdx].size());
2580 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2581 "Expected same num of lanes across all operands");
2582 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2583 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2584 return OpVL;
2585 }
2586
2587 // Performs operand reordering for 2 or more operands.
2588 // The original operands are in OrigOps[OpIdx][Lane].
2589 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2590 void reorder() {
2591 unsigned NumOperands = getNumOperands();
2592 unsigned NumLanes = getNumLanes();
2593 // Each operand has its own mode. We are using this mode to help us select
2594 // the instructions for each lane, so that they match best with the ones
2595 // we have selected so far.
2596 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2597
2598 // This is a greedy single-pass algorithm. We are going over each lane
2599 // once and deciding on the best order right away with no back-tracking.
2600 // However, in order to increase its effectiveness, we start with the lane
2601 // that has operands that can move the least. For example, given the
2602 // following lanes:
2603 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2604 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2605 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2606 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2607 // we will start at Lane 1, since the operands of the subtraction cannot
2608 // be reordered. Then we will visit the rest of the lanes in a circular
2609 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2610
2611 // Find the first lane that we will start our search from.
2612 unsigned FirstLane = getBestLaneToStartReordering();
2613
2614 // Initialize the modes.
2615 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2616 Value *OpLane0 = getValue(OpIdx, FirstLane);
2617 // Keep track if we have instructions with all the same opcode on one
2618 // side.
2619 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2620 // Check if OpLane0 should be broadcast.
2621 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2622 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2623 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2624 else if (isa<LoadInst>(OpILane0))
2625 ReorderingModes[OpIdx] = ReorderingMode::Load;
2626 else
2627 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2628 } else if (isa<Constant>(OpLane0)) {
2629 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2630 } else if (isa<Argument>(OpLane0)) {
2631 // Our best hope is a Splat. It may save some cost in some cases.
2632 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2633 } else {
2634 llvm_unreachable("Unexpected value kind.");
2635 }
2636 }
2637
2638 // Check that we don't have the same operands. There is no need to reorder
2639 // if the operands are just a perfect diamond or shuffled diamond match. Do
2640 // not do it only for possible broadcasts or a non-power-of-2 number of
2641 // scalars (just for now).
2642 auto &&SkipReordering = [this]() {
2643 SmallPtrSet<Value *, 4> UniqueValues;
2644 ArrayRef<OperandData> Op0 = OpsVec.front();
2645 for (const OperandData &Data : Op0)
2646 UniqueValues.insert(Data.V);
2647 for (ArrayRef<OperandData> Op :
2648 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2649 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2650 return !UniqueValues.contains(Data.V);
2651 }))
2652 return false;
2653 }
2654 // TODO: Check if we can remove a check for non-power-2 number of
2655 // scalars after full support of non-power-2 vectorization.
2656 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2657 };
2658
2659 // If the initial strategy fails for any of the operand indexes, then we
2660 // perform reordering again in a second pass. This helps avoid assigning
2661 // high priority to the failed strategy, and should improve reordering for
2662 // the non-failed operand indexes.
2663 for (int Pass = 0; Pass != 2; ++Pass) {
2664 // Check if there is no need to reorder operands since they are a
2665 // perfect or shuffled diamond match.
2666 // Need to do it to avoid extra external use cost counting for
2667 // shuffled matches, which may cause regressions.
2668 if (SkipReordering())
2669 break;
2670 // Skip the second pass if the first pass did not fail.
2671 bool StrategyFailed = false;
2672 // Mark all operand data as free to use.
2673 clearUsed();
2674 // We keep the original operand order for the FirstLane, so reorder the
2675 // rest of the lanes. We are visiting the nodes in a circular fashion,
2676 // using FirstLane as the center point and increasing the radius
2677 // distance.
2678 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2679 for (unsigned I = 0; I < NumOperands; ++I)
2680 MainAltOps[I].push_back(getData(I, FirstLane).V);
2681
2682 SmallBitVector UsedLanes(NumLanes);
2683 UsedLanes.set(FirstLane);
2684 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2685 // Visit the lane on the right and then the lane on the left.
2686 for (int Direction : {+1, -1}) {
2687 int Lane = FirstLane + Direction * Distance;
2688 if (Lane < 0 || Lane >= (int)NumLanes)
2689 continue;
2690 UsedLanes.set(Lane);
2691 int LastLane = Lane - Direction;
2692 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2693 "Out of bounds");
2694 // Look for a good match for each operand.
2695 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2696 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2697 std::optional<unsigned> BestIdx =
2698 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2699 MainAltOps[OpIdx], UsedLanes);
2700 // By not selecting a value, we allow the operands that follow to
2701 // select a better matching value. We will get a non-null value in
2702 // the next run of getBestOperand().
2703 if (BestIdx) {
2704 // Swap the current operand with the one returned by
2705 // getBestOperand().
2706 swap(OpIdx, *BestIdx, Lane);
2707 } else {
2708 // Enable the second pass.
2709 StrategyFailed = true;
2710 }
2711 // Try to get the alternate opcode and follow it during analysis.
2712 if (MainAltOps[OpIdx].size() != 2) {
2713 OperandData &AltOp = getData(OpIdx, Lane);
2714 InstructionsState OpS =
2715 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2716 if (OpS && OpS.isAltShuffle())
2717 MainAltOps[OpIdx].push_back(AltOp.V);
2718 }
2719 }
2720 }
2721 }
2722 // Skip second pass if the strategy did not fail.
2723 if (!StrategyFailed)
2724 break;
2725 }
2726 }
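 // A minimal usage sketch of this helper, mirroring how operand reordering
 // is typically driven (variable names are assumed, not code from this file):
 //   VLOperands Ops(VL, VL0, R);     // VL: scalar bundle, VL0: main instr.
 //   Ops.reorder();                  // greedy per-lane operand matching
 //   ValueList Left = Ops.getVL(0);  // operand 0 across all lanes
 //   ValueList Right = Ops.getVL(1); // operand 1 across all lanes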
2727
2728#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2729 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2730 switch (RMode) {
2731 case ReorderingMode::Load:
2732 return "Load";
2733 case ReorderingMode::Opcode:
2734 return "Opcode";
2735 case ReorderingMode::Constant:
2736 return "Constant";
2737 case ReorderingMode::Splat:
2738 return "Splat";
2739 case ReorderingMode::Failed:
2740 return "Failed";
2741 }
2742 llvm_unreachable("Unimplemented Reordering Type");
2743 }
2744
2745 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2746 raw_ostream &OS) {
2747 return OS << getModeStr(RMode);
2748 }
2749
2750 /// Debug print.
2751 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2752 printMode(RMode, dbgs());
2753 }
2754
2755 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2756 return printMode(RMode, OS);
2757 }
2758
2759 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2760 const unsigned Indent = 2;
2761 unsigned Cnt = 0;
2762 for (const OperandDataVec &OpDataVec : OpsVec) {
2763 OS << "Operand " << Cnt++ << "\n";
2764 for (const OperandData &OpData : OpDataVec) {
2765 OS.indent(Indent) << "{";
2766 if (Value *V = OpData.V)
2767 OS << *V;
2768 else
2769 OS << "null";
2770 OS << ", APO:" << OpData.APO << "}\n";
2771 }
2772 OS << "\n";
2773 }
2774 return OS;
2775 }
2776
2777 /// Debug print.
2778 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2779#endif
2780 };
2781
2782 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2783 /// of the pair with the highest score, deemed to have the best chance to form
2784 /// the root of a profitable tree to vectorize. Return std::nullopt if no
2785 /// candidate scored above LookAheadHeuristics::ScoreFail.
2786 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2787 std::optional<int>
2788 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2789 int Limit = LookAheadHeuristics::ScoreFail) const {
2790 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2791 RootLookAheadMaxDepth);
2792 int BestScore = Limit;
2793 std::optional<int> Index;
2794 for (int I : seq<int>(0, Candidates.size())) {
2795 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2796 Candidates[I].second,
2797 /*U1=*/nullptr, /*U2=*/nullptr,
2798 /*CurrLevel=*/1, {});
2799 if (Score > BestScore) {
2800 BestScore = Score;
2801 Index = I;
2802 }
2803 }
2804 return Index;
2805 }
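 // An assumed usage sketch: score a few candidate root pairs and pick the
 // most promising one (the follow-up call is hypothetical):
 //   SmallVector<std::pair<Value *, Value *>> Candidates;
 //   Candidates.emplace_back(V1, V2a);
 //   Candidates.emplace_back(V1, V2b);
 //   if (std::optional<int> Best = R.findBestRootPair(Candidates))
 //     tryToVectorizePair(Candidates[*Best]);  // hypothetical follow-up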
2806
2807 /// Checks if the instruction is marked for deletion.
2808 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2809
2810 /// Removes an instruction from its block and eventually deletes it.
2811 /// It's like Instruction::eraseFromParent() except that the actual deletion
2812 /// is delayed until BoUpSLP is destructed.
2813 void eraseInstruction(Instruction *I) {
2814 DeletedInstructions.insert(I);
2815 }
2816
2817 /// Remove instructions from the parent function and clear the operands of \p
2818 /// DeadVals instructions, marking for deletion trivially dead operands.
2819 template <typename T>
2820 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2821 SmallVector<WeakTrackingVH> DeadInsts;
2822 for (T *V : DeadVals) {
2823 auto *I = cast<Instruction>(V);
2824 DeletedInstructions.insert(I);
2825 }
2826 DenseSet<Value *> Processed;
2827 for (T *V : DeadVals) {
2828 if (!V || !Processed.insert(V).second)
2829 continue;
2830 auto *I = cast<Instruction>(V);
2831 salvageDebugInfo(*I);
2832 SmallVector<const TreeEntry *> Entries;
2833 if (const TreeEntry *Entry = getTreeEntry(I)) {
2834 Entries.push_back(Entry);
2835 auto It = MultiNodeScalars.find(I);
2836 if (It != MultiNodeScalars.end())
2837 Entries.append(It->second.begin(), It->second.end());
2838 }
2839 for (Use &U : I->operands()) {
2840 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2841 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2842 wouldInstructionBeTriviallyDead(OpI, TLI) &&
2843 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2844 return Entry->VectorizedValue == OpI;
2845 })))
2846 DeadInsts.push_back(OpI);
2847 }
2848 I->dropAllReferences();
2849 }
2850 for (T *V : DeadVals) {
2851 auto *I = cast<Instruction>(V);
2852 if (!I->getParent())
2853 continue;
2854 assert((I->use_empty() || all_of(I->uses(),
2855 [&](Use &U) {
2856 return isDeleted(
2857 cast<Instruction>(U.getUser()));
2858 })) &&
2859 "trying to erase instruction with users.");
2860 I->removeFromParent();
2861 SE->forgetValue(I);
2862 }
2863 // Process the dead instruction list until empty.
2864 while (!DeadInsts.empty()) {
2865 Value *V = DeadInsts.pop_back_val();
2866 Instruction *VI = cast_or_null<Instruction>(V);
2867 if (!VI || !VI->getParent())
2868 continue;
2870 "Live instruction found in dead worklist!");
2871 assert(VI->use_empty() && "Instructions with uses are not dead.");
2872
2873 // Don't lose the debug info while deleting the instructions.
2874 salvageDebugInfo(*VI);
2875
2876 // Null out all of the instruction's operands to see if any operand
2877 // becomes dead as we go.
2878 for (Use &OpU : VI->operands()) {
2879 Value *OpV = OpU.get();
2880 if (!OpV)
2881 continue;
2882 OpU.set(nullptr);
2883
2884 if (!OpV->use_empty())
2885 continue;
2886
2887 // If the operand is an instruction that became dead as we nulled out
2888 // the operand, and if it is 'trivially' dead, delete it in a future
2889 // loop iteration.
2890 if (auto *OpI = dyn_cast<Instruction>(OpV))
2891 if (!DeletedInstructions.contains(OpI) &&
2892 isInstructionTriviallyDead(OpI, TLI))
2893 DeadInsts.push_back(OpI);
2894 }
2895
2896 VI->removeFromParent();
2897 DeletedInstructions.insert(VI);
2898 SE->forgetValue(VI);
2899 }
2900 }
2901
2902 /// Checks if the instruction was already analyzed for being possible
2903 /// reduction root.
2904 bool isAnalyzedReductionRoot(Instruction *I) const {
2905 return AnalyzedReductionsRoots.count(I);
2906 }
2907 /// Register given instruction as already analyzed for being possible
2908 /// reduction root.
2909 void analyzedReductionRoot(Instruction *I) {
2910 AnalyzedReductionsRoots.insert(I);
2911 }
2912 /// Checks if the provided list of reduced values was checked already for
2913 /// vectorization.
2914 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2915 return AnalyzedReductionVals.contains(hash_value(VL));
2916 }
2917 /// Adds the list of reduced values to list of already checked values for the
2918 /// vectorization.
2919 void analyzedReductionVals(ArrayRef<Value *> VL) {
2920 AnalyzedReductionVals.insert(hash_value(VL));
2921 }
2922 /// Clear the list of the analyzed reduction root instructions.
2923 void clearReductionData() {
2924 AnalyzedReductionsRoots.clear();
2925 AnalyzedReductionVals.clear();
2926 AnalyzedMinBWVals.clear();
2927 }
2928 /// Checks if the given value is gathered in one of the nodes.
2929 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2930 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2931 }
2932 /// Checks if the given value is gathered in one of the nodes.
2933 bool isGathered(const Value *V) const {
2934 return MustGather.contains(V);
2935 }
2936 /// Checks if the specified value was not schedule.
2937 bool isNotScheduled(const Value *V) const {
2938 return NonScheduledFirst.contains(V);
2939 }
2940
2941 /// Check if the value is vectorized in the tree.
2942 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2943
2944 ~BoUpSLP();
2945
2946private:
2947 /// Determine if a node \p E can be demoted to a smaller type with a
2948 /// truncation. We collect the entries that will be demoted in ToDemote.
2949 /// \param E Node for analysis
2950 /// \param ToDemote indices of the nodes to be demoted.
2951 bool collectValuesToDemote(
2952 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2954 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2955 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2956
2957 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2958 /// reordering (i.e. the operands can be reordered because they have only one
2959 /// user and are reorderable).
2960 /// \param ReorderableGathers List of all gather nodes that require reordering
2961 /// (e.g., gather of extractelements or partially vectorizable loads).
2962 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2963 /// reordering, subset of \p NonVectorized.
2964 bool
2965 canReorderOperands(TreeEntry *UserTE,
2966 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2967 ArrayRef<TreeEntry *> ReorderableGathers,
2968 SmallVectorImpl<TreeEntry *> &GatherOps);
2969
2970 /// Checks if the given \p TE is a gather node with clustered reused scalars
2971 /// and reorders it per given \p Mask.
2972 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2973
2974 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2975 /// if any. If it is not vectorized (gather node), returns nullptr.
2976 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2977 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2978 TreeEntry *TE = nullptr;
2979 const auto *It = find_if(VL, [&](Value *V) {
2980 TE = getTreeEntry(V);
2981 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2982 return true;
2983 auto It = MultiNodeScalars.find(V);
2984 if (It != MultiNodeScalars.end()) {
2985 for (TreeEntry *E : It->second) {
2986 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2987 TE = E;
2988 return true;
2989 }
2990 }
2991 }
2992 return false;
2993 });
2994 if (It != VL.end()) {
2995 assert(TE->isSame(VL) && "Expected same scalars.");
2996 return TE;
2997 }
2998 return nullptr;
2999 }
3000
3001 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3002 /// if any. If it is not vectorized (gather node), returns nullptr.
3003 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3004 unsigned OpIdx) const {
3005 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
3006 const_cast<TreeEntry *>(UserTE), OpIdx);
3007 }
3008
3009 /// Checks if all users of \p I are the part of the vectorization tree.
3010 bool areAllUsersVectorized(
3011 Instruction *I,
3012 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3013
3014 /// Return information about the vector formed for the specified index
3015 /// of a vector of (the same) instruction.
3016 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3017
3018 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3019 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3020
3021 /// Gets the root instruction for the given node. If the node is a strided
3022 /// load/store node with the reverse order, the root instruction is the last
3023 /// one.
3024 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3025
3026 /// \returns Cast context for the given graph node.
3028 getCastContextHint(const TreeEntry &TE) const;
3029
3030 /// \returns the cost of the vectorizable entry.
3031 InstructionCost getEntryCost(const TreeEntry *E,
3032 ArrayRef<Value *> VectorizedVals,
3033 SmallPtrSetImpl<Value *> &CheckedExtracts);
3034
3035 /// This is the recursive part of buildTree.
3036 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3037 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3038
3039 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3040 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3041 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3042 /// returns false, setting \p CurrentOrder to either an empty vector or a
3043 /// non-identity permutation that allows reusing the extract instructions.
3044 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3045 /// extract order.
3046 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3047 SmallVectorImpl<unsigned> &CurrentOrder,
3048 bool ResizeAllowed = false) const;
3049
3050 /// Vectorize a single entry in the tree.
3051 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed to
3052 /// avoid issues with def-use order.
3053 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3054
3055 /// Returns the vectorized operand node that matches the order of the scalars
3056 /// of operand number \p NodeIdx in entry \p E.
3057 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3058 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3059 unsigned NodeIdx) const {
3060 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3061 }
3062
3063 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3064 /// \p E.
3065 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed to
3066 /// avoid issues with def-use order.
3067 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3068
3069 /// Create a new vector from a list of scalar values. Produces a sequence
3070 /// which exploits values reused across lanes, and arranges the inserts
3071 /// for ease of later optimization.
3072 template <typename BVTy, typename ResTy, typename... Args>
3073 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3074
3075 /// Create a new vector from a list of scalar values. Produces a sequence
3076 /// which exploits values reused across lanes, and arranges the inserts
3077 /// for ease of later optimization.
3078 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3079 bool PostponedPHIs);
3080
3081 /// Returns the instruction in the bundle, which can be used as a base point
3082 /// for scheduling. Usually it is the last instruction in the bundle, except
3083 /// for the case when all operands are external (in this case, it is the first
3084 /// instruction in the list).
3085 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3086
3087 /// Tries to find extractelement instructions with constant indices from fixed
3088 /// vector type and gather such instructions into a bunch, which highly likely
3089 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3090 /// was successful, the matched scalars are replaced by poison values in \p VL
3091 /// for future analysis.
3092 std::optional<TargetTransformInfo::ShuffleKind>
3093 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3094 SmallVectorImpl<int> &Mask) const;
3095
3096 /// Tries to find extractelement instructions with constant indices from fixed
3097 /// vector type and gather such instructions into a bunch, which is highly
3098 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3099 /// was successful, the matched scalars are replaced by poison values in \p VL
3100 /// for future analysis.
3102 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3104 unsigned NumParts) const;
3105
3106 /// Checks if the gathered \p VL can be represented as a single register
3107 /// shuffle(s) of previous tree entries.
3108 /// \param TE Tree entry checked for permutation.
3109 /// \param VL List of scalars (a subset of the TE scalars), checked for
3110 /// permutations. Must form a single-register vector.
3111 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3112 /// directs building the mask using the original vector values, without
3113 /// relying on the potential reordering.
3114 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3115 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3116 std::optional<TargetTransformInfo::ShuffleKind>
3117 isGatherShuffledSingleRegisterEntry(
3118 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3119 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3120 bool ForOrder);
3121
3122 /// Checks if the gathered \p VL can be represented as multi-register
3123 /// shuffle(s) of previous tree entries.
3124 /// \param TE Tree entry checked for permutation.
3125 /// \param VL List of scalars (a subset of the TE scalars), checked for
3126 /// permutations.
3127 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3128 /// directs building the mask using the original vector values, without
3129 /// relying on the potential reordering.
3130 /// \returns per-register series of ShuffleKind, if gathered values can be
3131 /// represented as shuffles of previous tree entries. \p Mask is filled with
3132 /// the shuffle mask (also on per-register base).
3134 isGatherShuffledEntry(
3135 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3137 unsigned NumParts, bool ForOrder = false);
3138
3139 /// \returns the cost of gathering (inserting) the values in \p VL into a
3140 /// vector.
3141 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3142 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3143 Type *ScalarTy) const;
3144
3145 /// Set the Builder insert point to one after the last instruction in
3146 /// the bundle
3147 void setInsertPointAfterBundle(const TreeEntry *E);
3148
3149 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3150 /// specified, the starting vector value is poison.
3151 Value *
3152 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3153 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3154
3155 /// \returns whether the VectorizableTree is fully vectorizable and will
3156 /// be beneficial even if the tree height is tiny.
3157 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3158
3159 /// Run through the list of all gathered loads in the graph and try to find
3160 /// vector loads/masked gathers instead of regular gathers. Later these loads
3161 /// are reshuffled to build the final gathered nodes.
3162 void tryToVectorizeGatheredLoads(
3163 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3164 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3165 8> &GatheredLoads);
3166
3167 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3168 /// users of \p TE and collects the stores. It returns the map from the store
3169 /// pointers to the collected stores.
3171 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3172
3173 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3174 /// stores in \p StoresVec can form a vector instruction. If so it returns
3175 /// true and populates \p ReorderIndices with the shuffle indices of the
3176 /// stores when compared to the sorted vector.
3177 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3178 OrdersType &ReorderIndices) const;
3179
3180 /// Iterates through the users of \p TE, looking for scalar stores that can be
3181 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3182 /// their order and builds an order index vector for each store bundle. It
3183 /// returns all these order vectors found.
3184 /// We run this after the tree has formed, otherwise we may come across user
3185 /// instructions that are not yet in the tree.
3187 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3188
3189 /// Tries to reorder the gathering node for better vectorization
3190 /// opportunities.
3191 void reorderGatherNode(TreeEntry &TE);
3192
3193 struct TreeEntry {
3194 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3195 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3196
3197 /// \returns Common mask for reorder indices and reused scalars.
3198 SmallVector<int> getCommonMask() const {
3199 SmallVector<int> Mask;
3200 inversePermutation(ReorderIndices, Mask);
3201 ::addMask(Mask, ReuseShuffleIndices);
3202 return Mask;
3203 }
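  // Worked example (illustrative; values are hypothetical and assume the usual
  // compose-and-swap semantics of ::addMask): with ReorderIndices = {1, 0} the
  // inverse permutation yields Mask = {1, 0}; composing it with
  // ReuseShuffleIndices = {0, 1, 0, 1} gives the common mask {1, 0, 1, 0},
  // i.e. every reused lane is remapped through the reordering first.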
3204
3205 /// \returns true if the scalars in VL are equal to this entry.
3206 bool isSame(ArrayRef<Value *> VL) const {
3207 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3208 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3209 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3210 return VL.size() == Mask.size() &&
3211 std::equal(VL.begin(), VL.end(), Mask.begin(),
3212 [Scalars](Value *V, int Idx) {
3213 return (isa<UndefValue>(V) &&
3214 Idx == PoisonMaskElem) ||
3215 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3216 });
3217 };
3218 if (!ReorderIndices.empty()) {
3219 // TODO: implement matching if the nodes are just reordered, still can
3220 // treat the vector as the same if the list of scalars matches VL
3221 // directly, without reordering.
3222 SmallVector<int> Mask;
3223 inversePermutation(ReorderIndices, Mask);
3224 if (VL.size() == Scalars.size())
3225 return IsSame(Scalars, Mask);
3226 if (VL.size() == ReuseShuffleIndices.size()) {
3227 ::addMask(Mask, ReuseShuffleIndices);
3228 return IsSame(Scalars, Mask);
3229 }
3230 return false;
3231 }
3232 return IsSame(Scalars, ReuseShuffleIndices);
3233 }
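  // Worked example (illustrative; values are hypothetical): with
  // Scalars = {A, B}, empty ReorderIndices and ReuseShuffleIndices = {0, 1, 0, 1},
  // the list VL = {A, B, A, B} is treated as the same entry, since each VL[i]
  // matches Scalars[ReuseShuffleIndices[i]]; an undef element in VL also
  // matches a PoisonMaskElem position of the mask.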
3234
3235 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3236 return isGather() && !UserTreeIndices.empty() &&
3237 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3238 UserTreeIndices.front().UserTE == UserEI.UserTE;
3239 }
3240
3241 /// \returns true if current entry has same operands as \p TE.
3242 bool hasEqualOperands(const TreeEntry &TE) const {
3243 if (TE.getNumOperands() != getNumOperands())
3244 return false;
3245 SmallBitVector Used(getNumOperands());
3246 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3247 unsigned PrevCount = Used.count();
3248 for (unsigned K = 0; K < E; ++K) {
3249 if (Used.test(K))
3250 continue;
3251 if (getOperand(K) == TE.getOperand(I)) {
3252 Used.set(K);
3253 break;
3254 }
3255 }
3256 // Check if we actually found the matching operand.
3257 if (PrevCount == Used.count())
3258 return false;
3259 }
3260 return true;
3261 }
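  // Worked example (illustrative; values are hypothetical): an entry with
  // operand lists {X, Y} has equal operands to an entry with operand lists
  // {Y, X}; the Used bit vector ensures every operand of \p TE is matched to
  // a distinct operand of this entry, regardless of order.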
3262
3263 /// \return Final vectorization factor for the node. Defined by the total
3264 /// number of vectorized scalars, including those used several times in the
3265 /// entry and counted in the \a ReuseShuffleIndices, if any.
3266 unsigned getVectorFactor() const {
3267 if (!ReuseShuffleIndices.empty())
3268 return ReuseShuffleIndices.size();
3269 return Scalars.size();
3270 };
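  // Worked example (illustrative; values are hypothetical): two unique scalars
  // reused via ReuseShuffleIndices = {0, 1, 0, 1} give a vector factor of 4,
  // while the same entry without reuse indices has a vector factor of 2.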
3271
3272 /// Checks if the current node is a gather node.
3273 bool isGather() const { return State == NeedToGather; }
3274
3275 /// A vector of scalars.
3276 ValueList Scalars;
3277
3278 /// The Scalars are vectorized into this value. It is initialized to Null.
3279 WeakTrackingVH VectorizedValue = nullptr;
3280
3281 /// New vector phi instructions emitted for the vectorized phi nodes.
3282 PHINode *PHI = nullptr;
3283
3284 /// Do we need to gather this sequence or vectorize it
3285 /// (either with vector instruction or with scatter/gather
3286 /// intrinsics for store/load)?
3287 enum EntryState {
3288 Vectorize, ///< The node is regularly vectorized.
3289 ScatterVectorize, ///< Masked scatter/gather node.
3290 StridedVectorize, ///< Strided loads (and stores)
3291 NeedToGather, ///< Gather/buildvector node.
3292 CombinedVectorize, ///< Vectorized node, combined with its user into more
3293 ///< complex node like select/cmp to minmax, mul/add to
3294 ///< fma, etc. Must be used for the following nodes in
3295 ///< the pattern, not the very first one.
3296 };
3297 EntryState State;
3298
3299 /// List of combined opcodes supported by the vectorizer.
3300 enum CombinedOpcode {
3301 NotCombinedOp = -1,
3302 MinMax = Instruction::OtherOpsEnd + 1,
3303 };
3304 CombinedOpcode CombinedOp = NotCombinedOp;
3305
3306 /// Does this sequence require some shuffling?
3307 SmallVector<int, 4> ReuseShuffleIndices;
3308
3309 /// Does this entry require reordering?
3310 SmallVector<unsigned, 4> ReorderIndices;
3311
3312 /// Points back to the VectorizableTree.
3313 ///
3314 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3315 /// to be a pointer and needs to be able to initialize the child iterator.
3316 /// Thus we need a reference back to the container to translate the indices
3317 /// to entries.
3318 VecTreeTy &Container;
3319
3320 /// The TreeEntry index containing the user of this entry. We can actually
3321 /// have multiple users so the data structure is not truly a tree.
3322 SmallVector<EdgeInfo, 1> UserTreeIndices;
3323
3324 /// The index of this treeEntry in VectorizableTree.
3325 unsigned Idx = 0;
3326
3327 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3328 /// other nodes as a series of insertvector instructions.
3329 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3330
3331 private:
3332 /// The operands of each instruction in each lane Operands[op_index][lane].
3333 /// Note: This helps avoid the replication of the code that performs the
3334 /// reordering of operands during buildTree_rec() and vectorizeTree().
3336
3337 /// The main/alternate instruction.
3338 Instruction *MainOp = nullptr;
3339 Instruction *AltOp = nullptr;
3340
3341 /// Interleaving factor for interleaved loads Vectorize nodes.
3342 unsigned InterleaveFactor = 0;
3343
3344 public:
3345 /// Returns interleave factor for interleave nodes.
3346 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3347 /// Sets interleaving factor for the interleaving nodes.
3348 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3349
3350 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3351 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3352 if (Operands.size() < OpIdx + 1)
3353 Operands.resize(OpIdx + 1);
3354 assert(Operands[OpIdx].empty() && "Already resized?");
3355 assert(OpVL.size() <= Scalars.size() &&
3356 "Number of operands is greater than the number of scalars.");
3357 Operands[OpIdx].resize(OpVL.size());
3358 copy(OpVL, Operands[OpIdx].begin());
3359 }
3360
3361 /// Set this bundle's operand from Scalars.
3362 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3363 VLOperands Ops(Scalars, MainOp, R);
3364 if (RequireReorder)
3365 Ops.reorder();
3366 for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
3367 setOperand(I, Ops.getVL(I));
3368 }
3369
3370 /// Reorders operands of the node to the given mask \p Mask.
3371 void reorderOperands(ArrayRef<int> Mask) {
3372 for (ValueList &Operand : Operands)
3373 reorderScalars(Operand, Mask);
3374 }
3375
3376 /// \returns the \p OpIdx operand of this TreeEntry.
3377 ValueList &getOperand(unsigned OpIdx) {
3378 assert(OpIdx < Operands.size() && "Off bounds");
3379 return Operands[OpIdx];
3380 }
3381
3382 /// \returns the \p OpIdx operand of this TreeEntry.
3383 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3384 assert(OpIdx < Operands.size() && "Off bounds");
3385 return Operands[OpIdx];
3386 }
3387
3388 /// \returns the number of operands.
3389 unsigned getNumOperands() const { return Operands.size(); }
3390
3391 /// \return the single \p OpIdx operand.
3392 Value *getSingleOperand(unsigned OpIdx) const {
3393 assert(OpIdx < Operands.size() && "Off bounds");
3394 assert(!Operands[OpIdx].empty() && "No operand available");
3395 return Operands[OpIdx][0];
3396 }
3397
3398 /// Some of the instructions in the list have alternate opcodes.
3399 bool isAltShuffle() const { return MainOp != AltOp; }
3400
3401 bool isOpcodeOrAlt(Instruction *I) const {
3402 unsigned CheckedOpcode = I->getOpcode();
3403 return (getOpcode() == CheckedOpcode ||
3404 getAltOpcode() == CheckedOpcode);
3405 }
3406
3407 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3408 /// alternate) opcode as the main instruction, the key is \p Op. Otherwise the
3409 /// key is the main instruction.
3410 Value *isOneOf(Value *Op) const {
3411 auto *I = dyn_cast<Instruction>(Op);
3412 if (I && isOpcodeOrAlt(I))
3413 return Op;
3414 return MainOp;
3415 }
3416
3417 void setOperations(const InstructionsState &S) {
3418 assert(S && "InstructionsState is invalid.");
3419 MainOp = S.getMainOp();
3420 AltOp = S.getAltOp();
3421 }
3422
3423 Instruction *getMainOp() const {
3424 return MainOp;
3425 }
3426
3427 Instruction *getAltOp() const {
3428 return AltOp;
3429 }
3430
3431 /// The main/alternate opcodes for the list of instructions.
3432 unsigned getOpcode() const {
3433 return MainOp ? MainOp->getOpcode() : 0;
3434 }
3435
3436 unsigned getAltOpcode() const {
3437 return AltOp ? AltOp->getOpcode() : 0;
3438 }
3439
3440 /// When ReuseShuffleIndices is empty it just returns the position of \p
3441 /// V within the vector of Scalars. Otherwise, it tries to remap it via the reuse indices.
3442 int findLaneForValue(Value *V) const {
3443 unsigned FoundLane = getVectorFactor();
3444 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3445 std::advance(It, 1)) {
3446 if (*It != V)
3447 continue;
3448 FoundLane = std::distance(Scalars.begin(), It);
3449 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3450 if (!ReorderIndices.empty())
3451 FoundLane = ReorderIndices[FoundLane];
3452 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3453 if (ReuseShuffleIndices.empty())
3454 break;
3455 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3456 RIt != ReuseShuffleIndices.end()) {
3457 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3458 break;
3459 }
3460 }
3461 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3462 return FoundLane;
3463 }
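  // Worked example (illustrative; values are hypothetical): with
  // Scalars = {A, B}, ReorderIndices = {1, 0} and ReuseShuffleIndices = {1, 0, 1, 0},
  // looking up B first finds position 1, remaps it through ReorderIndices to
  // lane 0, and finally returns 1, the first occurrence of lane 0 in the
  // reuse indices.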
3464
3465 /// Build a shuffle mask for graph entry which represents a merge of main
3466 /// and alternate operations.
3467 void
3468 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3470 SmallVectorImpl<Value *> *OpScalars = nullptr,
3471 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3472
3473 /// Return true if this is a non-power-of-2 node.
3474 bool isNonPowOf2Vec() const {
3475 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3476 return IsNonPowerOf2;
3477 }
3478
3479 /// Return true if the number of elements in this node neither fills whole
3480 /// vector registers nor is a power of 2.
3481 bool
3482 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3483 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3484 TTI, getValueType(Scalars.front()), Scalars.size());
3485 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3486 "Reshuffling not supported with non-power-of-2 vectors yet.");
3487 return IsNonPowerOf2;
3488 }
3489
3490 Value *getOrdered(unsigned Idx) const {
3491 assert(isGather() && "Must be used only for buildvectors/gathers.");
3492 if (ReorderIndices.empty())
3493 return Scalars[Idx];
3494 SmallVector<int> Mask;
3495 inversePermutation(ReorderIndices, Mask);
3496 return Scalars[Mask[Idx]];
3497 }
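  // Worked example (illustrative; values are hypothetical): for a gather entry
  // with Scalars = {A, B} and ReorderIndices = {1, 0}, the inverse permutation
  // is {1, 0}, so getOrdered(0) returns B and getOrdered(1) returns A.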
3498
3499#ifndef NDEBUG
3500 /// Debug printer.
3501 LLVM_DUMP_METHOD void dump() const {
3502 dbgs() << Idx << ".\n";
3503 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3504 dbgs() << "Operand " << OpI << ":\n";
3505 for (const Value *V : Operands[OpI])
3506 dbgs().indent(2) << *V << "\n";
3507 }
3508 dbgs() << "Scalars: \n";
3509 for (Value *V : Scalars)
3510 dbgs().indent(2) << *V << "\n";
3511 dbgs() << "State: ";
3512 switch (State) {
3513 case Vectorize:
3514 if (InterleaveFactor > 0) {
3515 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3516 << "\n";
3517 } else {
3518 dbgs() << "Vectorize\n";
3519 }
3520 break;
3521 case ScatterVectorize:
3522 dbgs() << "ScatterVectorize\n";
3523 break;
3524 case StridedVectorize:
3525 dbgs() << "StridedVectorize\n";
3526 break;
3527 case NeedToGather:
3528 dbgs() << "NeedToGather\n";
3529 break;
3530 case CombinedVectorize:
3531 dbgs() << "CombinedVectorize\n";
3532 break;
3533 }
3534 dbgs() << "MainOp: ";
3535 if (MainOp)
3536 dbgs() << *MainOp << "\n";
3537 else
3538 dbgs() << "NULL\n";
3539 dbgs() << "AltOp: ";
3540 if (AltOp)
3541 dbgs() << *AltOp << "\n";
3542 else
3543 dbgs() << "NULL\n";
3544 dbgs() << "VectorizedValue: ";
3545 if (VectorizedValue)
3546 dbgs() << *VectorizedValue << "\n";
3547 else
3548 dbgs() << "NULL\n";
3549 dbgs() << "ReuseShuffleIndices: ";
3550 if (ReuseShuffleIndices.empty())
3551 dbgs() << "Empty";
3552 else
3553 for (int ReuseIdx : ReuseShuffleIndices)
3554 dbgs() << ReuseIdx << ", ";
3555 dbgs() << "\n";
3556 dbgs() << "ReorderIndices: ";
3557 for (unsigned ReorderIdx : ReorderIndices)
3558 dbgs() << ReorderIdx << ", ";
3559 dbgs() << "\n";
3560 dbgs() << "UserTreeIndices: ";
3561 for (const auto &EInfo : UserTreeIndices)
3562 dbgs() << EInfo << ", ";
3563 dbgs() << "\n";
3564 if (!CombinedEntriesWithIndices.empty()) {
3565 dbgs() << "Combined entries: ";
3566 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3567 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3568 });
3569 dbgs() << "\n";
3570 }
3571 }
3572#endif
3573 };
3574
3575#ifndef NDEBUG
3576 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3577 InstructionCost VecCost, InstructionCost ScalarCost,
3578 StringRef Banner) const {
3579 dbgs() << "SLP: " << Banner << ":\n";
3580 E->dump();
3581 dbgs() << "SLP: Costs:\n";
3582 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3583 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3584 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3585 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3586 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3587 }
3588#endif
3589
3590 /// Create a new VectorizableTree entry.
3591 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3592 std::optional<ScheduleData *> Bundle,
3593 const InstructionsState &S,
3594 const EdgeInfo &UserTreeIdx,
3595 ArrayRef<int> ReuseShuffleIndices = {},
3596 ArrayRef<unsigned> ReorderIndices = {},
3597 unsigned InterleaveFactor = 0) {
3598 TreeEntry::EntryState EntryState =
3599 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3600 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3601 ReuseShuffleIndices, ReorderIndices);
3602 if (E && InterleaveFactor > 0)
3603 E->setInterleave(InterleaveFactor);
3604 return E;
3605 }
3606
3607 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3608 TreeEntry::EntryState EntryState,
3609 std::optional<ScheduleData *> Bundle,
3610 const InstructionsState &S,
3611 const EdgeInfo &UserTreeIdx,
3612 ArrayRef<int> ReuseShuffleIndices = {},
3613 ArrayRef<unsigned> ReorderIndices = {}) {
3614 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3615 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3616 "Need to vectorize gather entry?");
3617 // Gathered loads still gathered? Do not create entry, use the original one.
3618 if (GatheredLoadsEntriesFirst.has_value() &&
3619 EntryState == TreeEntry::NeedToGather && S &&
3620 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3621 !UserTreeIdx.UserTE)
3622 return nullptr;
3623 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3624 TreeEntry *Last = VectorizableTree.back().get();
3625 Last->Idx = VectorizableTree.size() - 1;
3626 Last->State = EntryState;
3627 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3628 // for non-power-of-two vectors.
3629 assert(
3631 ReuseShuffleIndices.empty()) &&
3632 "Reshuffling scalars not yet supported for nodes with padding");
3633 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3634 ReuseShuffleIndices.end());
3635 if (ReorderIndices.empty()) {
3636 Last->Scalars.assign(VL.begin(), VL.end());
3637 if (S)
3638 Last->setOperations(S);
3639 } else {
3640 // Reorder scalars and build final mask.
3641 Last->Scalars.assign(VL.size(), nullptr);
3642 transform(ReorderIndices, Last->Scalars.begin(),
3643 [VL](unsigned Idx) -> Value * {
3644 if (Idx >= VL.size())
3645 return UndefValue::get(VL.front()->getType());
3646 return VL[Idx];
3647 });
3648 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3649 if (S)
3650 Last->setOperations(S);
3651 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3652 }
3653 if (!Last->isGather()) {
3654 for (Value *V : VL) {
3655 const TreeEntry *TE = getTreeEntry(V);
3656 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3657 "Scalar already in tree!");
3658 if (TE) {
3659 if (TE != Last)
3660 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3661 continue;
3662 }
3663 ScalarToTreeEntry[V] = Last;
3664 }
3665 // Update the scheduler bundle to point to this TreeEntry.
3666 ScheduleData *BundleMember = *Bundle;
3667 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3668 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3669 doesNotNeedToSchedule(VL)) &&
3670 "Bundle and VL out of sync");
3671 if (BundleMember) {
3672 for (Value *V : VL) {
3673 if (doesNotNeedToBeScheduled(V))
3674 continue;
3675 if (!BundleMember)
3676 continue;
3677 BundleMember->TE = Last;
3678 BundleMember = BundleMember->NextInBundle;
3679 }
3680 }
3681 assert(!BundleMember && "Bundle and VL out of sync");
3682 } else {
3683 // Build a map for gathered scalars to the nodes where they are used.
3684 bool AllConstsOrCasts = true;
3685 for (Value *V : VL)
3686 if (!isConstant(V)) {
3687 auto *I = dyn_cast<CastInst>(V);
3688 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3689 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3690 !UserTreeIdx.UserTE->isGather())
3691 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3692 }
3693 if (AllConstsOrCasts)
3694 CastMaxMinBWSizes =
3695 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3696 MustGather.insert(VL.begin(), VL.end());
3697 }
3698
3699 if (UserTreeIdx.UserTE)
3700 Last->UserTreeIndices.push_back(UserTreeIdx);
3701 return Last;
3702 }
3703
3704 /// -- Vectorization State --
3705 /// Holds all of the tree entries.
3706 TreeEntry::VecTreeTy VectorizableTree;
3707
3708#ifndef NDEBUG
3709 /// Debug printer.
3710 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3711 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3712 VectorizableTree[Id]->dump();
3713 dbgs() << "\n";
3714 }
3715 }
3716#endif
3717
3718 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3719
3720 const TreeEntry *getTreeEntry(Value *V) const {
3721 return ScalarToTreeEntry.lookup(V);
3722 }
3723
3724 /// Check that the operand node of an alternate node does not generate a
3725 /// buildvector sequence. If it does, then it is probably not worth it to build
3726 /// an alternate shuffle, if the number of buildvector operands plus the alternate
3727 /// instruction is greater than the number of buildvector instructions.
3728 /// \param S the instructions state of the analyzed values.
3729 /// \param VL list of the instructions with alternate opcodes.
3730 bool areAltOperandsProfitable(const InstructionsState &S,
3731 ArrayRef<Value *> VL) const;
3732
3733 /// Checks if the specified list of the instructions/values can be vectorized
3734 /// and fills required data before actual scheduling of the instructions.
3735 TreeEntry::EntryState
3736 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3737 bool IsScatterVectorizeUserTE,
3738 OrdersType &CurrentOrder,
3739 SmallVectorImpl<Value *> &PointerOps);
3740
3741 /// Maps a specific scalar to its tree entry.
3742 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3743
3744 /// List of scalars used in several vectorized nodes, together with the list
3745 /// of those nodes.
3747
3748 /// Maps a value to the proposed vectorizable size.
3749 SmallDenseMap<Value *, unsigned> InstrElementSize;
3750
3751 /// A list of scalars that we found that we need to keep as scalars.
3752 ValueSet MustGather;
3753
3754 /// A set of first non-schedulable values.
3755 ValueSet NonScheduledFirst;
3756
3757 /// A map between the vectorized entries and the last instructions in the
3758 /// bundles. The bundles are built in use order, not in the def order of the
3759 /// instructions. So, we cannot rely directly on the last instruction in the
3760 /// bundle being the last instruction in program order during the
3761 /// vectorization process, since the basic blocks are modified; such
3762 /// instructions need to be pre-gathered beforehand.
3763 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3764
3765 /// List of gather nodes, depending on other gather/vector nodes, which should
3766 /// be emitted after the vector instruction emission process to correctly
3767 /// handle order of the vector instructions and shuffles.
3768 SetVector<const TreeEntry *> PostponedGathers;
3769
3770 using ValueToGatherNodesMap =
3772 ValueToGatherNodesMap ValueToGatherNodes;
3773
3774 /// A list of the load entries (node indices), which can be vectorized using
3775 /// strided or masked-gather approach, but which are first attempted to be
3776 /// represented as contiguous loads.
3777 SetVector<unsigned> LoadEntriesToVectorize;
3778
3779 /// true if graph nodes transforming mode is on.
3780 bool IsGraphTransformMode = false;
3781
3782 /// The index of the first gathered load entry in the VectorizeTree.
3783 std::optional<unsigned> GatheredLoadsEntriesFirst;
3784
3785 /// This POD struct describes one external user in the vectorized tree.
3786 struct ExternalUser {
3787 ExternalUser(Value *S, llvm::User *U, int L)
3788 : Scalar(S), User(U), Lane(L) {}
3789
3790 // Which scalar in our function.
3791 Value *Scalar;
3792
3793 // Which user that uses the scalar.
3795
3796 // Which lane does the scalar belong to.
3797 int Lane;
3798 };
3799 using UserList = SmallVector<ExternalUser, 16>;
3800
3801 /// Checks if two instructions may access the same memory.
3802 ///
3803 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3804 /// is invariant in the calling loop.
3805 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3806 Instruction *Inst2) {
3807 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3808 return true;
3809 // First check if the result is already in the cache.
3810 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3811 auto It = AliasCache.find(Key);
3812 if (It != AliasCache.end())
3813 return It->second;
3814 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3815 // Store the result in the cache.
3816 AliasCache.try_emplace(Key, Aliased);
3817 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3818 return Aliased;
3819 }
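  // Usage sketch (illustrative): the result is cached under both (Inst1, Inst2)
  // and (Inst2, Inst1), so a later isAliased(LocOfInst2, Inst2, Inst1) query
  // reuses the answer computed here instead of re-running alias analysis.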
3820
3821 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3822
3823 /// Cache for alias results.
3824 /// TODO: consider moving this to the AliasAnalysis itself.
3826
3827 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3828 // globally through SLP because we don't perform any action which
3829 // invalidates capture results.
3830 BatchAAResults BatchAA;
3831
3832 /// Temporary store for deleted instructions. Instructions will be deleted
3833 /// eventually when the BoUpSLP is destructed. The deferral is required to
3834 /// ensure that there are no incorrect collisions in the AliasCache, which
3835 /// can happen if a new instruction is allocated at the same address as a
3836 /// previously deleted instruction.
3837 DenseSet<Instruction *> DeletedInstructions;
3838
3840 /// Set of instructions already being analyzed for reductions.
3840 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3841
3842 /// Set of hashes for the list of reduction values already being analyzed.
3843 DenseSet<size_t> AnalyzedReductionVals;
3844
3845 /// Values already analyzed for minimal bitwidth and found to be
3846 /// non-profitable.
3847 DenseSet<Value *> AnalyzedMinBWVals;
3848
3849 /// A list of values that need to extracted out of the tree.
3850 /// This list holds pairs of (Internal Scalar : External User). External User
3851 /// can be nullptr, which means that this Internal Scalar will be used later,
3852 /// after vectorization.
3853 UserList ExternalUses;
3854
3855 /// A list of GEPs which can be replaced by scalar GEPs instead of
3856 /// extractelement instructions.
3857 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3858
3859 /// Values used only by @llvm.assume calls.
3861
3862 /// Holds all of the instructions that we gathered, shuffle instructions and
3863 /// extractelements.
3864 SetVector<Instruction *> GatherShuffleExtractSeq;
3865
3866 /// A list of blocks that we are going to CSE.
3867 DenseSet<BasicBlock *> CSEBlocks;
3868
3870 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3870 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3871
3872 /// Contains all scheduling relevant data for an instruction.
3873 /// A ScheduleData either represents a single instruction or a member of an
3874 /// instruction bundle (= a group of instructions which is combined into a
3875 /// vector instruction).
3876 struct ScheduleData {
3877 // The initial value for the dependency counters. It means that the
3878 // dependencies are not calculated yet.
3879 enum { InvalidDeps = -1 };
3880
3881 ScheduleData() = default;
3882
3883 void init(int BlockSchedulingRegionID, Instruction *I) {
3884 FirstInBundle = this;
3885 NextInBundle = nullptr;
3886 NextLoadStore = nullptr;
3887 IsScheduled = false;
3888 SchedulingRegionID = BlockSchedulingRegionID;
3889 clearDependencies();
3890 Inst = I;
3891 TE = nullptr;
3892 }
3893
3894 /// Verify basic self consistency properties
3895 void verify() {
3896 if (hasValidDependencies()) {
3897 assert(UnscheduledDeps <= Dependencies && "invariant");
3898 } else {
3899 assert(UnscheduledDeps == Dependencies && "invariant");
3900 }
3901
3902 if (IsScheduled) {
3903 assert(isSchedulingEntity() &&
3904 "unexpected scheduled state");
3905 for (const ScheduleData *BundleMember = this; BundleMember;
3906 BundleMember = BundleMember->NextInBundle) {
3907 assert(BundleMember->hasValidDependencies() &&
3908 BundleMember->UnscheduledDeps == 0 &&
3909 "unexpected scheduled state");
3910 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3911 "only bundle is marked scheduled");
3912 }
3913 }
3914
3915 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3916 "all bundle members must be in same basic block");
3917 }
3918
3919 /// Returns true if the dependency information has been calculated.
3920 /// Note that dependency validity can vary between instructions within
3921 /// a single bundle.
3922 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3923
3924 /// Returns true for single instructions and for bundle representatives
3925 /// (= the head of a bundle).
3926 bool isSchedulingEntity() const { return FirstInBundle == this; }
3927
3928 /// Returns true if it represents an instruction bundle and not only a
3929 /// single instruction.
3930 bool isPartOfBundle() const {
3931 return NextInBundle != nullptr || FirstInBundle != this || TE;
3932 }
3933
3934 /// Returns true if it is ready for scheduling, i.e. it has no more
3935 /// unscheduled dependent instructions/bundles.
3936 bool isReady() const {
3937 assert(isSchedulingEntity() &&
3938 "can't consider non-scheduling entity for ready list");
3939 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3940 }
3941
3942 /// Modifies the number of unscheduled dependencies for this instruction,
3943 /// and returns the number of remaining dependencies for the containing
3944 /// bundle.
3945 int incrementUnscheduledDeps(int Incr) {
3946 assert(hasValidDependencies() &&
3947 "increment of unscheduled deps would be meaningless");
3948 UnscheduledDeps += Incr;
3949 return FirstInBundle->unscheduledDepsInBundle();
3950 }
3951
3952 /// Sets the number of unscheduled dependencies to the number of
3953 /// dependencies.
3954 void resetUnscheduledDeps() {
3955 UnscheduledDeps = Dependencies;
3956 }
3957
3958 /// Clears all dependency information.
3959 void clearDependencies() {
3960 Dependencies = InvalidDeps;
3961 resetUnscheduledDeps();
3962 MemoryDependencies.clear();
3963 ControlDependencies.clear();
3964 }
3965
3966 int unscheduledDepsInBundle() const {
3967 assert(isSchedulingEntity() && "only meaningful on the bundle");
3968 int Sum = 0;
3969 for (const ScheduleData *BundleMember = this; BundleMember;
3970 BundleMember = BundleMember->NextInBundle) {
3971 if (BundleMember->UnscheduledDeps == InvalidDeps)
3972 return InvalidDeps;
3973 Sum += BundleMember->UnscheduledDeps;
3974 }
3975 return Sum;
3976 }
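  // Worked example (illustrative; values are hypothetical): a bundle whose
  // two members have UnscheduledDeps of 1 and 2 reports 3; if any member
  // still has InvalidDeps, the whole bundle reports InvalidDeps.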
3977
3978 void dump(raw_ostream &os) const {
3979 if (!isSchedulingEntity()) {
3980 os << "/ " << *Inst;
3981 } else if (NextInBundle) {
3982 os << '[' << *Inst;
3983 ScheduleData *SD = NextInBundle;
3984 while (SD) {
3985 os << ';' << *SD->Inst;
3986 SD = SD->NextInBundle;
3987 }
3988 os << ']';
3989 } else {
3990 os << *Inst;
3991 }
3992 }
3993
3994 Instruction *Inst = nullptr;
3995
3996 /// The TreeEntry that this instruction corresponds to.
3997 TreeEntry *TE = nullptr;
3998
3999 /// Points to the head in an instruction bundle (and always to this for
4000 /// single instructions).
4001 ScheduleData *FirstInBundle = nullptr;
4002
4003 /// Single linked list of all instructions in a bundle. Null if it is a
4004 /// single instruction.
4005 ScheduleData *NextInBundle = nullptr;
4006
4007 /// Single linked list of all memory instructions (e.g. load, store, call)
4008 /// in the block - until the end of the scheduling region.
4009 ScheduleData *NextLoadStore = nullptr;
4010
4011 /// The dependent memory instructions.
4012 /// This list is derived on demand in calculateDependencies().
4013 SmallVector<ScheduleData *, 4> MemoryDependencies;
4014
4015 /// List of instructions which this instruction could be control dependent
4016 /// on. Allowing such nodes to be scheduled below this one could introduce
4017 /// a runtime fault which didn't exist in the original program.
4018 /// ex: this is a load or udiv following a readonly call which inf loops
4019 SmallVector<ScheduleData *, 4> ControlDependencies;
4020
4021 /// This ScheduleData is in the current scheduling region if this matches
4022 /// the current SchedulingRegionID of BlockScheduling.
4023 int SchedulingRegionID = 0;
4024
4025 /// Used for getting a "good" final ordering of instructions.
4026 int SchedulingPriority = 0;
4027
4028 /// The number of dependencies. Consists of the number of users of the
4029 /// instruction plus the number of dependent memory instructions (if any).
4030 /// This value is calculated on demand.
4031 /// If InvalidDeps, the number of dependencies is not calculated yet.
4032 int Dependencies = InvalidDeps;
4033
4034 /// The number of dependencies minus the number of dependencies of scheduled
4035 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4036 /// for scheduling.
4037 /// Note that this is negative as long as Dependencies is not calculated.
4038 int UnscheduledDeps = InvalidDeps;
4039
4040 /// True if this instruction is scheduled (or considered as scheduled in the
4041 /// dry-run).
4042 bool IsScheduled = false;
4043 };
4044
4045#ifndef NDEBUG
4046 friend inline raw_ostream &operator<<(raw_ostream &os,
4047 const BoUpSLP::ScheduleData &SD) {
4048 SD.dump(os);
4049 return os;
4050 }
4051#endif
4052
4053 friend struct GraphTraits<BoUpSLP *>;
4054 friend struct DOTGraphTraits<BoUpSLP *>;
4055
4056 /// Contains all scheduling data for a basic block.
4057 /// It does not schedule instructions that are not memory read/write
4058 /// instructions and whose operands are either constants, arguments, phis, or
4059 /// instructions from other blocks, or whose users are phis or are in other
4060 /// blocks. The resulting vector instructions can be placed at the
4061 /// beginning of the basic block without scheduling (if the operands do not need
4062 /// to be scheduled) or at the end of the block (if the users are outside of the
4063 /// block). This saves some compile time and memory used by the
4064 /// compiler.
4065 /// ScheduleData is assigned to each instruction between the boundaries of
4066 /// the tree entry, even to those that are not part of the graph. It is
4067 /// required to correctly follow the dependencies between the instructions and
4068 /// to schedule them correctly. ScheduleData is not allocated for
4069 /// instructions that do not require scheduling, like phis, nodes containing
4070 /// only extractelements/insertelements, or nodes whose instructions have
4071 /// uses/operands outside of the block.
4072 struct BlockScheduling {
4073 BlockScheduling(BasicBlock *BB)
4074 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4075
4076 void clear() {
4077 ReadyInsts.clear();
4078 ScheduleStart = nullptr;
4079 ScheduleEnd = nullptr;
4080 FirstLoadStoreInRegion = nullptr;
4081 LastLoadStoreInRegion = nullptr;
4082 RegionHasStackSave = false;
4083
4084 // Reduce the maximum schedule region size by the size of the
4085 // previous scheduling run.
4086 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4087 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4088 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4089 ScheduleRegionSize = 0;
4090
4091 // Make a new scheduling region, i.e. all existing ScheduleData is not
4092 // in the new region yet.
4093 ++SchedulingRegionID;
4094 }
4095
4096 ScheduleData *getScheduleData(Instruction *I) {
4097 if (BB != I->getParent())
4098 // Avoid lookup if can't possibly be in map.
4099 return nullptr;
4100 ScheduleData *SD = ScheduleDataMap.lookup(I);
4101 if (SD && isInSchedulingRegion(SD))
4102 return SD;
4103 return nullptr;
4104 }
4105
4106 ScheduleData *getScheduleData(Value *V) {
4107 if (auto *I = dyn_cast<Instruction>(V))
4108 return getScheduleData(I);
4109 return nullptr;
4110 }
4111
4112 bool isInSchedulingRegion(ScheduleData *SD) const {
4113 return SD->SchedulingRegionID == SchedulingRegionID;
4114 }
4115
4116 /// Marks an instruction as scheduled and puts all dependent ready
4117 /// instructions into the ready-list.
4118 template <typename ReadyListType>
4119 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4120 SD->IsScheduled = true;
4121 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4122
4123 for (ScheduleData *BundleMember = SD; BundleMember;
4124 BundleMember = BundleMember->NextInBundle) {
4125
4126 // Handle the def-use chain dependencies.
4127
4128 // Decrement the unscheduled counter and insert to ready list if ready.
4129 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4130 ScheduleData *OpDef = getScheduleData(I);
4131 if (OpDef && OpDef->hasValidDependencies() &&
4132 OpDef->incrementUnscheduledDeps(-1) == 0) {
4133 // There are no more unscheduled dependencies after
4134 // decrementing, so we can put the dependent instruction
4135 // into the ready list.
4136 ScheduleData *DepBundle = OpDef->FirstInBundle;
4137 assert(!DepBundle->IsScheduled &&
4138 "already scheduled bundle gets ready");
4139 ReadyList.insert(DepBundle);
4141 << "SLP: gets ready (def): " << *DepBundle << "\n");
4142 }
4143 };
4144
4145 // If BundleMember is a vector bundle, its operands may have been
4146 // reordered during buildTree(). We therefore need to get its operands
4147 // through the TreeEntry.
4148 if (TreeEntry *TE = BundleMember->TE) {
4149 // Need to search for the lane since the tree entry can be reordered.
4150 int Lane = std::distance(TE->Scalars.begin(),
4151 find(TE->Scalars, BundleMember->Inst));
4152 assert(Lane >= 0 && "Lane not set");
4153
4154 // Since vectorization tree is being built recursively this assertion
4155 // ensures that the tree entry has all operands set before reaching
4156 // this code. Couple of exceptions known at the moment are extracts
4157 // where their second (immediate) operand is not added. Since
4158 // immediates do not affect scheduler behavior this is considered
4159 // okay.
4160 auto *In = BundleMember->Inst;
4161 assert(
4162 In &&
4163 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4164 In->getNumOperands() == TE->getNumOperands()) &&
4165 "Missed TreeEntry operands?");
4166 (void)In; // fake use to avoid build failure when assertions disabled
4167
4168 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4169 OpIdx != NumOperands; ++OpIdx)
4170 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4171 DecrUnsched(I);
4172 } else {
4173 // If BundleMember is a stand-alone instruction, no operand reordering
4174 // has taken place, so we directly access its operands.
4175 for (Use &U : BundleMember->Inst->operands())
4176 if (auto *I = dyn_cast<Instruction>(U.get()))
4177 DecrUnsched(I);
4178 }
4179 // Handle the memory dependencies.
4180 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4181 if (MemoryDepSD->hasValidDependencies() &&
4182 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4183 // There are no more unscheduled dependencies after decrementing,
4184 // so we can put the dependent instruction into the ready list.
4185 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4186 assert(!DepBundle->IsScheduled &&
4187 "already scheduled bundle gets ready");
4188 ReadyList.insert(DepBundle);
4190 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4191 }
4192 }
4193 // Handle the control dependencies.
4194 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4195 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4196 // There are no more unscheduled dependencies after decrementing,
4197 // so we can put the dependent instruction into the ready list.
4198 ScheduleData *DepBundle = DepSD->FirstInBundle;
4199 assert(!DepBundle->IsScheduled &&
4200 "already scheduled bundle gets ready");
4201 ReadyList.insert(DepBundle);
4203 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4204 }
4205 }
4206 }
4207 }
4208
4209 /// Verify basic self consistency properties of the data structure.
4210 void verify() {
4211 if (!ScheduleStart)
4212 return;
4213
4214 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4215 ScheduleStart->comesBefore(ScheduleEnd) &&
4216 "Not a valid scheduling region?");
4217
4218 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4219 auto *SD = getScheduleData(I);
4220 if (!SD)
4221 continue;
4222 assert(isInSchedulingRegion(SD) &&
4223 "primary schedule data not in window?");
4224 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4225 "entire bundle in window!");
4226 SD->verify();
4227 }
4228
4229 for (auto *SD : ReadyInsts) {
4230 assert(SD->isSchedulingEntity() && SD->isReady() &&
4231 "item in ready list not ready?");
4232 (void)SD;
4233 }
4234 }
4235
4236 /// Put all instructions into the ReadyList which are ready for scheduling.
4237 template <typename ReadyListType>
4238 void initialFillReadyList(ReadyListType &ReadyList) {
4239 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4240 ScheduleData *SD = getScheduleData(I);
4241 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4242 SD->isReady()) {
4243 ReadyList.insert(SD);
4245 << "SLP: initially in ready list: " << *SD << "\n");
4246 }
4247 }
4248 }
4249
4250 /// Build a bundle from the ScheduleData nodes corresponding to the
4251 /// scalar instruction for each lane.
4252 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4253
4254 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4255 /// cyclic dependencies. This is only a dry-run, no instructions are
4256 /// actually moved at this stage.
4257 /// \returns the scheduling bundle. The returned Optional value is not
4258 /// std::nullopt if \p VL is allowed to be scheduled.
4259 std::optional<ScheduleData *>
4260 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4261 const InstructionsState &S);
4262
4263 /// Un-bundles a group of instructions.
4264 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4265
4266 /// Allocates schedule data chunk.
4267 ScheduleData *allocateScheduleDataChunks();
4268
4269 /// Extends the scheduling region so that V is inside the region.
4270 /// \returns true if the region size is within the limit.
4271 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4272
4273 /// Initialize the ScheduleData structures for new instructions in the
4274 /// scheduling region.
4275 void initScheduleData(Instruction *FromI, Instruction *ToI,
4276 ScheduleData *PrevLoadStore,
4277 ScheduleData *NextLoadStore);
4278
4279 /// Updates the dependency information of a bundle and of all instructions/
4280 /// bundles which depend on the original bundle.
4281 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4282 BoUpSLP *SLP);
4283
4284 /// Sets all instructions in the scheduling region to un-scheduled.
4285 void resetSchedule();
4286
4287 BasicBlock *BB;
4288
4289 /// Simple memory allocation for ScheduleData.
4291
4292 /// The size of a ScheduleData array in ScheduleDataChunks.
4293 int ChunkSize;
4294
4295 /// The allocator position in the current chunk, which is the last entry
4296 /// of ScheduleDataChunks.
4297 int ChunkPos;
4298
4299 /// Attaches ScheduleData to Instruction.
4300 /// Note that the mapping survives during all vectorization iterations, i.e.
4301 /// ScheduleData structures are recycled.
4303
4304 /// The ready-list for scheduling (only used for the dry-run).
4305 SetVector<ScheduleData *> ReadyInsts;
4306
4307 /// The first instruction of the scheduling region.
4308 Instruction *ScheduleStart = nullptr;
4309
4310 /// The first instruction _after_ the scheduling region.
4311 Instruction *ScheduleEnd = nullptr;
4312
4313 /// The first memory accessing instruction in the scheduling region
4314 /// (can be null).
4315 ScheduleData *FirstLoadStoreInRegion = nullptr;
4316
4317 /// The last memory accessing instruction in the scheduling region
4318 /// (can be null).
4319 ScheduleData *LastLoadStoreInRegion = nullptr;
4320
4321 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4322 /// region? Used to optimize the dependence calculation for the
4323 /// common case where there isn't.
4324 bool RegionHasStackSave = false;
4325
4326 /// The current size of the scheduling region.
4327 int ScheduleRegionSize = 0;
4328
4329 /// The maximum size allowed for the scheduling region.
4330 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4331
4332 /// The ID of the scheduling region. For a new vectorization iteration this
4333 /// is incremented which "removes" all ScheduleData from the region.
4334 /// Make sure that the initial SchedulingRegionID is greater than the
4335 /// initial SchedulingRegionID in ScheduleData (which is 0).
4336 int SchedulingRegionID = 1;
4337 };
4338
4339 /// Attaches the BlockScheduling structures to basic blocks.
4341
4342 /// Performs the "real" scheduling. Done before vectorization is actually
4343 /// performed in a basic block.
4344 void scheduleBlock(BlockScheduling *BS);
4345
4346 /// List of users to ignore during scheduling and that don't need extracting.
4347 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4348
4349 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4350 /// sorted SmallVectors of unsigned.
4351 struct OrdersTypeDenseMapInfo {
4352 static OrdersType getEmptyKey() {
4353 OrdersType V;
4354 V.push_back(~1U);
4355 return V;
4356 }
4357
4358 static OrdersType getTombstoneKey() {
4359 OrdersType V;
4360 V.push_back(~2U);
4361 return V;
4362 }
4363
4364 static unsigned getHashValue(const OrdersType &V) {
4365 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4366 }
4367
4368 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4369 return LHS == RHS;
4370 }
4371 };
4372
4373 // Analysis and block reference.
4374 Function *F;
4375 ScalarEvolution *SE;
4377 TargetLibraryInfo *TLI;
4378 LoopInfo *LI;
4379 DominatorTree *DT;
4380 AssumptionCache *AC;
4381 DemandedBits *DB;
4382 const DataLayout *DL;
4384
4385 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4386 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4387
4388 /// Instruction builder to construct the vectorized tree.
4390
4391 /// A map of scalar integer values to the smallest bit width with which they
4392 /// can legally be represented. The values map to (width, signed) pairs,
4393 /// where "width" indicates the minimum bit width and "signed" is True if the
4394 /// value must be signed-extended, rather than zero-extended, back to its
4395 /// original width.
4397
4398 /// Final size of the reduced vector, if the current graph represents the
4399 /// input for the reduction and it was possible to narrow the size of the
4400 /// reduction.
4401 unsigned ReductionBitWidth = 0;
4402
4403 /// Canonical graph size before the transformations.
4404 unsigned BaseGraphSize = 1;
4405
4406 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4407 /// type sizes, used in the tree.
4408 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4409
4410 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
4411 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4412 DenseSet<unsigned> ExtraBitWidthNodes;
4413};
4414
4415} // end namespace slpvectorizer
4416
4417template <> struct GraphTraits<BoUpSLP *> {
4418 using TreeEntry = BoUpSLP::TreeEntry;
4419
4420 /// NodeRef has to be a pointer per the GraphWriter.
4422
4424
4425 /// Add the VectorizableTree to the index iterator to be able to return
4426 /// TreeEntry pointers.
4427 struct ChildIteratorType
4428 : public iterator_adaptor_base<
4429 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4431
4433 ContainerTy &VT)
4434 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4435
4436 NodeRef operator*() { return I->UserTE; }
4437 };
4438
4440 return R.VectorizableTree[0].get();
4441 }
4442
4443 static ChildIteratorType child_begin(NodeRef N) {
4444 return {N->UserTreeIndices.begin(), N->Container};
4445 }
4446
4447 static ChildIteratorType child_end(NodeRef N) {
4448 return {N->UserTreeIndices.end(), N->Container};
4449 }
4450
4451 /// For the node iterator we just need to turn the TreeEntry iterator into a
4452 /// TreeEntry* iterator so that it dereferences to NodeRef.
4453 class nodes_iterator {
4455 ItTy It;
4456
4457 public:
4458 nodes_iterator(const ItTy &It2) : It(It2) {}
4459 NodeRef operator*() { return It->get(); }
4460 nodes_iterator operator++() {
4461 ++It;
4462 return *this;
4463 }
4464 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4465 };
4466
4467 static nodes_iterator nodes_begin(BoUpSLP *R) {
4468 return nodes_iterator(R->VectorizableTree.begin());
4469 }
4470
4471 static nodes_iterator nodes_end(BoUpSLP *R) {
4472 return nodes_iterator(R->VectorizableTree.end());
4473 }
4474
4475 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4476};
4477
4478template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4479 using TreeEntry = BoUpSLP::TreeEntry;
4480
4481 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4482
4483 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4484 std::string Str;
4486 OS << Entry->Idx << ".\n";
4487 if (isSplat(Entry->Scalars))
4488 OS << "<splat> ";
4489 for (auto *V : Entry->Scalars) {
4490 OS << *V;
4491 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4492 return EU.Scalar == V;
4493 }))
4494 OS << " <extract>";
4495 OS << "\n";
4496 }
4497 return Str;
4498 }
4499
4500 static std::string getNodeAttributes(const TreeEntry *Entry,
4501 const BoUpSLP *) {
4502 if (Entry->isGather())
4503 return "color=red";
4504 if (Entry->State == TreeEntry::ScatterVectorize ||
4505 Entry->State == TreeEntry::StridedVectorize)
4506 return "color=blue";
4507 return "";
4508 }
4509};
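// A hedged usage sketch (illustrative, not part of the original pass): with
// the GraphTraits and DOTGraphTraits specializations above, the generic
// GraphWriter machinery can render the SLP graph. This assumes
// llvm/Support/GraphWriter.h is available in this translation unit; the
// wrapper name is hypothetical.
static void viewSLPTree(BoUpSLP &R) {
  // Gather nodes are drawn red and scatter/strided nodes blue, as encoded in
  // getNodeAttributes() above.
  ViewGraph(&R, "slp-vectorizable-tree");
}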
4510
4511} // end namespace llvm
4512
4513BoUpSLP::~BoUpSLP() {
4514 SmallVector<WeakTrackingVH> DeadInsts;
4515 for (auto *I : DeletedInstructions) {
4516 if (!I->getParent()) {
4517 // Temporarily insert the instruction back into a block so it can be
4518 // erased from its parent and from memory later.
4519 if (isa<PHINode>(I))
4520 // Phi nodes must be the very first instructions in the block.
4521 I->insertBefore(F->getEntryBlock(),
4522 F->getEntryBlock().getFirstNonPHIIt());
4523 else
4524 I->insertBefore(F->getEntryBlock().getTerminator());
4525 continue;
4526 }
4527 for (Use &U : I->operands()) {
4528 auto *Op = dyn_cast<Instruction>(U.get());
4529 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4530 wouldInstructionBeTriviallyDead(Op, TLI))
4531 DeadInsts.emplace_back(Op);
4532 }
4533 I->dropAllReferences();
4534 }
4535 for (auto *I : DeletedInstructions) {
4536 assert(I->use_empty() &&
4537 "trying to erase instruction with users.");
4538 I->eraseFromParent();
4539 }
4540
4541 // Clean up any dead scalar code feeding the vectorized instructions.
4542 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4543
4544#ifdef EXPENSIVE_CHECKS
4545 // If we could guarantee that this call is not extremely slow, we could
4546 // remove the ifdef limitation (see PR47712).
4547 assert(!verifyFunction(*F, &dbgs()));
4548#endif
4549}
4550
4551/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4552/// contains the original mask for the scalars reused in the node. The
4553/// procedure transforms this mask in accordance with the given \p Mask.
4554static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4555 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4556 "Expected non-empty mask.");
4557 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4558 Prev.swap(Reuses);
4559 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4560 if (Mask[I] != PoisonMaskElem)
4561 Reuses[Mask[I]] = Prev[I];
4562}
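// An illustrative usage sketch (not from the original source): with
// Reuses = {0, 1, 2, 3} and Mask = {2, 0, 3, 1}, each old element I moves to
// position Mask[I], so the call leaves Reuses == {1, 3, 0, 2}.
static void reorderReusesExample() {
  SmallVector<int> Reuses = {0, 1, 2, 3};
  SmallVector<int> Mask = {2, 0, 3, 1};
  reorderReuses(Reuses, Mask);
  assert(Reuses[0] == 1 && Reuses[2] == 0 && "Unexpected reordered reuses.");
}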
4563
4564/// Reorders the given \p Order according to the given \p Mask. \p Order is the
4565/// original order of the scalars. The procedure transforms the provided order
4566/// in accordance with the given \p Mask. If the resulting \p Order is just an
4567/// identity order, \p Order is cleared.
4568static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4569 bool BottomOrder = false) {
4570 assert(!Mask.empty() && "Expected non-empty mask.");
4571 unsigned Sz = Mask.size();
4572 if (BottomOrder) {
4573 SmallVector<unsigned> PrevOrder;
4574 if (Order.empty()) {
4575 PrevOrder.resize(Sz);
4576 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4577 } else {
4578 PrevOrder.swap(Order);
4579 }
4580 Order.assign(Sz, Sz);
4581 for (unsigned I = 0; I < Sz; ++I)
4582 if (Mask[I] != PoisonMaskElem)
4583 Order[I] = PrevOrder[Mask[I]];
4584 if (all_of(enumerate(Order), [&](const auto &Data) {
4585 return Data.value() == Sz || Data.index() == Data.value();
4586 })) {
4587 Order.clear();
4588 return;
4589 }
4590 fixupOrderingIndices(Order);
4591 return;
4592 }
4593 SmallVector<int> MaskOrder;
4594 if (Order.empty()) {
4595 MaskOrder.resize(Sz);
4596 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4597 } else {
4598 inversePermutation(Order, MaskOrder);
4599 }
4600 reorderReuses(MaskOrder, Mask);
4601 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4602 Order.clear();
4603 return;
4604 }
4605 Order.assign(Sz, Sz);
4606 for (unsigned I = 0; I < Sz; ++I)
4607 if (MaskOrder[I] != PoisonMaskElem)
4608 Order[MaskOrder[I]] = I;
4609 fixupOrderingIndices(Order);
4610}
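// An illustrative usage sketch (not from the original source): an empty Order
// is treated as the identity order. Applying Mask = {1, 0, 3, 2} produces the
// non-identity order {1, 0, 3, 2}; a mask whose result is the identity would
// clear Order instead.
static void reorderOrderExample() {
  SmallVector<unsigned> Order; // Empty means identity order.
  SmallVector<int> Mask = {1, 0, 3, 2};
  reorderOrder(Order, Mask);
  assert(Order.size() == 4 && Order[0] == 1 && "Unexpected reordered order.");
}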
4611
4612std::optional<BoUpSLP::OrdersType>
4613BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4614 assert(TE.isGather() && "Expected gather node only.");
4615 // Try to find subvector extract/insert patterns and reorder only such
4616 // patterns.
4617 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4618 Type *ScalarTy = GatheredScalars.front()->getType();
4619 int NumScalars = GatheredScalars.size();
4620 if (!isValidElementType(ScalarTy))
4621 return std::nullopt;
4622 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4623 int NumParts = TTI->getNumberOfParts(VecTy);
4624 if (NumParts == 0 || NumParts >= NumScalars ||
4625 VecTy->getNumElements() % NumParts != 0 ||
4626 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4627 VecTy->getNumElements() / NumParts))
4628 NumParts = 1;
4629 SmallVector<int> ExtractMask;
4630 SmallVector<int> Mask;
4631 SmallVector<SmallVector<const TreeEntry *>> Entries;
4632 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4633 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4634 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4635 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4636 /*ForOrder=*/true);
4637 // No shuffled operands - ignore.
4638 if (GatherShuffles.empty() && ExtractShuffles.empty())
4639 return std::nullopt;
4640 OrdersType CurrentOrder(NumScalars, NumScalars);
4641 if (GatherShuffles.size() == 1 &&
4642 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4643 Entries.front().front()->isSame(TE.Scalars)) {
4644 // Perfect match in the graph, will reuse the previously vectorized
4645 // node. Cost is 0.
4646 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4647 return CurrentOrder;
4648 }
4649 auto IsSplatMask = [](ArrayRef<int> Mask) {
4650 int SingleElt = PoisonMaskElem;
4651 return all_of(Mask, [&](int I) {
4652 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4653 SingleElt = I;
4654 return I == PoisonMaskElem || I == SingleElt;
4655 });
4656 };
4657 // Exclusive broadcast mask - ignore.
4658 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4659 (Entries.size() != 1 ||
4660 Entries.front().front()->ReorderIndices.empty())) ||
4661 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4662 return std::nullopt;
4663 SmallBitVector ShuffledSubMasks(NumParts);
4664 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4665 ArrayRef<int> Mask, int PartSz, int NumParts,
4666 function_ref<unsigned(unsigned)> GetVF) {
4667 for (int I : seq<int>(0, NumParts)) {
4668 if (ShuffledSubMasks.test(I))
4669 continue;
4670 const int VF = GetVF(I);
4671 if (VF == 0)
4672 continue;
4673 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4674 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4675 // Shuffle of at least 2 vectors - ignore.
4676 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4677 std::fill(Slice.begin(), Slice.end(), NumScalars);
4678 ShuffledSubMasks.set(I);
4679 continue;
4680 }
4681 // Try to include as many elements from the mask as possible.
4682 int FirstMin = INT_MAX;
4683 bool SecondVecFound = false;
4684 for (int K : seq<int>(Limit)) {
4685 int Idx = Mask[I * PartSz + K];
4686 if (Idx == PoisonMaskElem) {
4687 Value *V = GatheredScalars[I * PartSz + K];
4688 if (isConstant(V) && !isa<PoisonValue>(V)) {
4689 SecondVecFound = true;
4690 break;
4691 }
4692 continue;
4693 }
4694 if (Idx < VF) {
4695 if (FirstMin > Idx)
4696 FirstMin = Idx;
4697 } else {
4698 SecondVecFound = true;
4699 break;
4700 }
4701 }
4702 FirstMin = (FirstMin / PartSz) * PartSz;
4703 // Shuffle of at least 2 vectors - ignore.
4704 if (SecondVecFound) {
4705 std::fill(Slice.begin(), Slice.end(), NumScalars);
4706 ShuffledSubMasks.set(I);
4707 continue;
4708 }
4709 for (int K : seq<int>(Limit)) {
4710 int Idx = Mask[I * PartSz + K];
4711 if (Idx == PoisonMaskElem)
4712 continue;
4713 Idx -= FirstMin;
4714 if (Idx >= PartSz) {
4715 SecondVecFound = true;
4716 break;
4717 }
4718 if (CurrentOrder[I * PartSz + Idx] >
4719 static_cast<unsigned>(I * PartSz + K) &&
4720 CurrentOrder[I * PartSz + Idx] !=
4721 static_cast<unsigned>(I * PartSz + Idx))
4722 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4723 }
4724 // Shuffle of at least 2 vectors - ignore.
4725 if (SecondVecFound) {
4726 std::fill(Slice.begin(), Slice.end(), NumScalars);
4727 ShuffledSubMasks.set(I);
4728 continue;
4729 }
4730 }
4731 };
4732 int PartSz = getPartNumElems(NumScalars, NumParts);
4733 if (!ExtractShuffles.empty())
4734 TransformMaskToOrder(
4735 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4736 if (!ExtractShuffles[I])
4737 return 0U;
4738 unsigned VF = 0;
4739 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4740 for (unsigned Idx : seq<unsigned>(Sz)) {
4741 int K = I * PartSz + Idx;
4742 if (ExtractMask[K] == PoisonMaskElem)
4743 continue;
4744 if (!TE.ReuseShuffleIndices.empty())
4745 K = TE.ReuseShuffleIndices[K];
4746 if (K == PoisonMaskElem)
4747 continue;
4748 if (!TE.ReorderIndices.empty())
4749 K = std::distance(TE.ReorderIndices.begin(),
4750 find(TE.ReorderIndices, K));
4751 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4752 if (!EI)
4753 continue;
4754 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4755 ->getElementCount()
4756 .getKnownMinValue());
4757 }
4758 return VF;
4759 });
4760 // Check special corner case - single shuffle of the same entry.
4761 if (GatherShuffles.size() == 1 && NumParts != 1) {
4762 if (ShuffledSubMasks.any())
4763 return std::nullopt;
4764 PartSz = NumScalars;
4765 NumParts = 1;
4766 }
4767 if (!Entries.empty())
4768 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4769 if (!GatherShuffles[I])
4770 return 0U;
4771 return std::max(Entries[I].front()->getVectorFactor(),
4772 Entries[I].back()->getVectorFactor());
4773 });
4774 int NumUndefs =
4775 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4776 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4777 return std::nullopt;
4778 return std::move(CurrentOrder);
4779}
4780
4781static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4782 const TargetLibraryInfo &TLI,
4783 bool CompareOpcodes = true) {
4786 return false;
4787 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4788 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4789 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4790 (!GEP2 || GEP2->getNumOperands() == 2) &&
4791 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4792 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4793 !CompareOpcodes ||
4794 (GEP1 && GEP2 &&
4795 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4796}
4797
4798/// Calculates minimal alignment as a common alignment.
4799template <typename T>
4800static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4801 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4802 for (Value *V : VL.drop_front())
4803 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4804 return CommonAlignment;
4805}
4806
4807/// Check if \p Order represents reverse order.
4808static bool isReverseOrder(ArrayRef<unsigned> Order) {
4809 assert(!Order.empty() &&
4810 "Order is empty. Please check it before using isReverseOrder.");
4811 unsigned Sz = Order.size();
4812 return all_of(enumerate(Order), [&](const auto &Pair) {
4813 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4814 });
4815}
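// An illustrative sketch (not from the original source): {3, 2, 1, 0} is a
// reverse order; unset entries (equal to the order size) are also accepted,
// so {4, 2, 1, 0} with size 4 still counts as reversed.
static void isReverseOrderExample() {
  SmallVector<unsigned> Order = {3, 2, 1, 0};
  bool Reversed = isReverseOrder(Order);
  assert(Reversed && "Expected a reverse order.");
  (void)Reversed;
}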
4816
4817/// Checks if the provided list of pointers \p PointerOps represents strided
4818/// pointers for the type \p ElemTy. If they are not, std::nullopt is returned.
4819/// Otherwise, if \p Inst is not specified, a just-initialized optional value is
4820/// returned to show that the pointers represent strided pointers. If \p Inst is
4821/// specified, the runtime stride is materialized before the given \p Inst.
4822/// \returns std::nullopt if the pointers do not have a runtime stride, and
4823/// nullptr or the actual stride value otherwise.
4824static std::optional<Value *>
4826 const DataLayout &DL, ScalarEvolution &SE,
4827 SmallVectorImpl<unsigned> &SortedIndices,
4828 Instruction *Inst = nullptr) {
4829 SmallVector<const SCEV *> SCEVs;
4830 const SCEV *PtrSCEVLowest = nullptr;
4831 const SCEV *PtrSCEVHighest = nullptr;
4832 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4833 // addresses).
4834 for (Value *Ptr : PointerOps) {
4835 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4836 if (!PtrSCEV)
4837 return std::nullopt;
4838 SCEVs.push_back(PtrSCEV);
4839 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4840 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4841 continue;
4842 }
4843 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4844 if (isa<SCEVCouldNotCompute>(Diff))
4845 return std::nullopt;
4846 if (Diff->isNonConstantNegative()) {
4847 PtrSCEVLowest = PtrSCEV;
4848 continue;
4849 }
4850 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4851 if (isa<SCEVCouldNotCompute>(Diff1))
4852 return std::nullopt;
4853 if (Diff1->isNonConstantNegative()) {
4854 PtrSCEVHighest = PtrSCEV;
4855 continue;
4856 }
4857 }
4858 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4859 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4860 if (isa<SCEVCouldNotCompute>(Dist))
4861 return std::nullopt;
4862 int Size = DL.getTypeStoreSize(ElemTy);
4863 auto TryGetStride = [&](const SCEV *Dist,
4864 const SCEV *Multiplier) -> const SCEV * {
4865 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4866 if (M->getOperand(0) == Multiplier)
4867 return M->getOperand(1);
4868 if (M->getOperand(1) == Multiplier)
4869 return M->getOperand(0);
4870 return nullptr;
4871 }
4872 if (Multiplier == Dist)
4873 return SE.getConstant(Dist->getType(), 1);
4874 return SE.getUDivExactExpr(Dist, Multiplier);
4875 };
4876 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4877 const SCEV *Stride = nullptr;
4878 if (Size != 1 || SCEVs.size() > 2) {
4879 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4880 Stride = TryGetStride(Dist, Sz);
4881 if (!Stride)
4882 return std::nullopt;
4883 }
4884 if (!Stride || isa<SCEVConstant>(Stride))
4885 return std::nullopt;
4886 // Iterate through all pointers and check if all distances are
4887 // unique multiples of Stride.
4888 using DistOrdPair = std::pair<int64_t, int>;
4889 auto Compare = llvm::less_first();
4890 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4891 int Cnt = 0;
4892 bool IsConsecutive = true;
4893 for (const SCEV *PtrSCEV : SCEVs) {
4894 unsigned Dist = 0;
4895 if (PtrSCEV != PtrSCEVLowest) {
4896 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4897 const SCEV *Coeff = TryGetStride(Diff, Stride);
4898 if (!Coeff)
4899 return std::nullopt;
4900 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4901 if (!SC || isa<SCEVCouldNotCompute>(SC))
4902 return std::nullopt;
4903 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4904 SE.getMulExpr(Stride, SC)))
4905 ->isZero())
4906 return std::nullopt;
4907 Dist = SC->getAPInt().getZExtValue();
4908 }
4909 // If the strides are not the same or repeated, we can't vectorize.
4910 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4911 return std::nullopt;
4912 auto Res = Offsets.emplace(Dist, Cnt);
4913 if (!Res.second)
4914 return std::nullopt;
4915 // Consecutive order if the inserted element is the last one.
4916 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4917 ++Cnt;
4918 }
4919 if (Offsets.size() != SCEVs.size())
4920 return std::nullopt;
4921 SortedIndices.clear();
4922 if (!IsConsecutive) {
4923 // Fill SortedIndices array only if it is non-consecutive.
4924 SortedIndices.resize(PointerOps.size());
4925 Cnt = 0;
4926 for (const std::pair<int64_t, int> &Pair : Offsets) {
4927 SortedIndices[Cnt] = Pair.second;
4928 ++Cnt;
4929 }
4930 }
4931 if (!Inst)
4932 return nullptr;
4933 SCEVExpander Expander(SE, DL, "strided-load-vec");
4934 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4935}
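// A simplified, hedged model of the check above (plain integers instead of
// SCEV expressions): strided pointers are accepted only when every pointer's
// distance from the lowest pointer is a distinct, in-range multiple of a
// single runtime stride. The helper name and types are illustrative only.
static bool isStridedDistanceModel(ArrayRef<int64_t> DistsFromLowest,
                                   int64_t Stride, unsigned NumElems) {
  SmallSet<int64_t, 8> SeenIndices;
  for (int64_t Dist : DistsFromLowest) {
    if (Stride == 0 || Dist % Stride != 0)
      return false;
    int64_t Idx = Dist / Stride;
    // Each pointer must land on a unique lane in [0, NumElems).
    if (Idx < 0 || Idx >= static_cast<int64_t>(NumElems) ||
        !SeenIndices.insert(Idx).second)
      return false;
  }
  return SeenIndices.size() == NumElems;
}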
4936
4937static std::pair<InstructionCost, InstructionCost>
4939 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4940 Type *ScalarTy, VectorType *VecTy);
4941
4942/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4943/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4944/// insert-subvector pattern.
4945static InstructionCost
4947 VectorType *Tp, ArrayRef<int> Mask = {},
4949 int Index = 0, VectorType *SubTp = nullptr,
4951 if (Kind != TTI::SK_PermuteTwoSrc)
4952 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4953 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4954 int NumSubElts;
4956 Mask, NumSrcElts, NumSubElts, Index)) {
4957 if (Index + NumSubElts > NumSrcElts &&
4958 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4959 return TTI.getShuffleCost(
4961 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4963 }
4964 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4965}
4966
4967/// Correctly creates insert_subvector, checking that the index is a multiple of
4968/// the subvector's length. Otherwise, generates a shuffle using \p Generator or
4969/// the default shuffle.
4971 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
4972 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
4973 const unsigned SubVecVF = getNumElements(V->getType());
4974 if (Index % SubVecVF == 0) {
4975 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4976 Builder.getInt64(Index));
4977 } else {
4978 // Create a shuffle; insertvector requires that the index be a multiple of
4979 // the subvector length.
4980 const unsigned VecVF = getNumElements(Vec->getType());
4982 std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0);
4983 for (unsigned I : seq<unsigned>(SubVecVF))
4984 Mask[I + Index] = I + VecVF;
4985 if (Generator) {
4986 Vec = Generator(Vec, V, Mask);
4987 } else {
4988 // 1. Resize V to the size of Vec.
4989 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
4990 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
4991 V = Builder.CreateShuffleVector(V, ResizeMask);
4992 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
4993 }
4994 }
4995 return Vec;
4996}
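// A minimal sketch of the mask math used above (illustrative, the helper name
// is hypothetical): with VecVF == 8, SubVecVF == 2 and Index == 3 the result
// is {0, 1, 2, 8, 9, P, P, P}, where P is PoisonMaskElem and lanes >= VecVF
// select from the widened subvector in the two-source shuffle.
static SmallVector<int> buildInsertSubvectorMask(unsigned VecVF,
                                                 unsigned SubVecVF,
                                                 unsigned Index) {
  SmallVector<int> Mask(VecVF, PoisonMaskElem);
  // Keep the head of the original vector up to the insertion point.
  std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0);
  // Map the subvector lanes, which follow the original vector's lanes in the
  // combined shuffle numbering.
  for (unsigned I = 0; I != SubVecVF; ++I)
    Mask[I + Index] = I + VecVF;
  return Mask;
}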
4997
5001 SmallVectorImpl<Value *> &PointerOps,
5002 unsigned *BestVF, bool TryRecursiveCheck) const {
5003 // Check that a vectorized load would load the same memory as a scalar
5004 // load. For example, we don't want to vectorize loads that are smaller
5005 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5006 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5007 // from such a struct, we read/write packed bits disagreeing with the
5008 // unvectorized version.
5009 if (BestVF)
5010 *BestVF = 0;
5012 return LoadsState::Gather;
5013 Type *ScalarTy = VL0->getType();
5014
5015 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
5016 return LoadsState::Gather;
5017
5018 // Make sure all loads in the bundle are simple - we can't vectorize
5019 // atomic or volatile loads.
5020 PointerOps.clear();
5021 const unsigned Sz = VL.size();
5022 PointerOps.resize(Sz);
5023 auto *POIter = PointerOps.begin();
5024 for (Value *V : VL) {
5025 auto *L = dyn_cast<LoadInst>(V);
5026 if (!L || !L->isSimple())
5027 return LoadsState::Gather;
5028 *POIter = L->getPointerOperand();
5029 ++POIter;
5030 }
5031
5032 Order.clear();
5033 // Check the order of pointer operands or that all pointers are the same.
5034 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5035
5036 auto *VecTy = getWidenedType(ScalarTy, Sz);
5037 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5038 if (!IsSorted) {
5039 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5040 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5041 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5042 return LoadsState::StridedVectorize;
5043 }
5044
5045 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5046 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5047 return LoadsState::Gather;
5048
5049 if (!all_of(PointerOps, [&](Value *P) {
5050 return arePointersCompatible(P, PointerOps.front(), *TLI);
5051 }))
5052 return LoadsState::Gather;
5053
5054 } else {
5055 Value *Ptr0;
5056 Value *PtrN;
5057 if (Order.empty()) {
5058 Ptr0 = PointerOps.front();
5059 PtrN = PointerOps.back();
5060 } else {
5061 Ptr0 = PointerOps[Order.front()];
5062 PtrN = PointerOps[Order.back()];
5063 }
5064 std::optional<int> Diff =
5065 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5066 // Check that the sorted loads are consecutive.
5067 if (static_cast<unsigned>(*Diff) == Sz - 1)
5068 return LoadsState::Vectorize;
5069 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5070 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5071 return LoadsState::Gather;
5072 // Simple check if not a strided access - clear order.
5073 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5074 // Try to generate strided load node if:
5075 // 1. Target with strided load support is detected.
5076 // 2. The number of loads is greater than MinProfitableStridedLoads,
5077 // or the potential stride <= MaxProfitableLoadStride and the
5078 // potential stride is power-of-2 (to avoid perf regressions for the very
5079 // small number of loads) and max distance > number of loads, or potential
5080 // stride is -1.
5081 // 3. The loads are ordered, or number of unordered loads <=
5082 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5083 // (this check is to avoid extra costs for very expensive shuffles).
5084 // 4. Any pointer operand is an instruction with the users outside of the
5085 // current graph (for masked gathers extra extractelement instructions
5086 // might be required).
5087 auto IsAnyPointerUsedOutGraph =
5088 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5089 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5090 return !getTreeEntry(U) && !MustGather.contains(U);
5091 });
5092 });
5093 const unsigned AbsoluteDiff = std::abs(*Diff);
5094 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5096 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5097 has_single_bit(AbsoluteDiff))) &&
5098 AbsoluteDiff > Sz) ||
5099 *Diff == -(static_cast<int>(Sz) - 1))) {
5100 int Stride = *Diff / static_cast<int>(Sz - 1);
5101 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5102 Align Alignment =
5103 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5104 ->getAlign();
5105 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5106 // Iterate through all pointers and check if all distances are
5107 // unique multiples of Stride.
5108 SmallSet<int, 4> Dists;
5109 for (Value *Ptr : PointerOps) {
5110 int Dist = 0;
5111 if (Ptr == PtrN)
5112 Dist = *Diff;
5113 else if (Ptr != Ptr0)
5114 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5115 // If the strides are not the same or repeated, we can't
5116 // vectorize.
5117 if (((Dist / Stride) * Stride) != Dist ||
5118 !Dists.insert(Dist).second)
5119 break;
5120 }
5121 if (Dists.size() == Sz)
5122 return LoadsState::StridedVectorize;
5123 }
5124 }
5125 }
5126 }
5127 // Correctly compare the cost of loads + shuffles against the cost of
5128 // strided/masked gather loads. Returns true if the vectorized + shuffles
5129 // representation is better than just a gather.
5130 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5131 unsigned *BestVF,
5132 bool ProfitableGatherPointers) {
5133 if (BestVF)
5134 *BestVF = 0;
5135 // Compare masked gather cost and loads + insert subvector costs.
5137 auto [ScalarGEPCost, VectorGEPCost] =
5138 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5139 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5140 // Estimate the cost of masked gather GEP. If not a splat, roughly
5141 // estimate as a buildvector, otherwise estimate as splat.
5142 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5143 VectorType *PtrVecTy =
5144 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5145 VecTy->getNumElements());
5146 if (static_cast<unsigned>(count_if(
5147 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5148 any_of(PointerOps, [&](Value *V) {
5149 return getUnderlyingObject(V) !=
5150 getUnderlyingObject(PointerOps.front());
5151 }))
5152 VectorGEPCost += TTI.getScalarizationOverhead(
5153 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5154 else
5155 VectorGEPCost +=
5157 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5158 /*Insert=*/true, /*Extract=*/false, CostKind) +
5160 // The cost of scalar loads.
5161 InstructionCost ScalarLoadsCost =
5162 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5163 [&](InstructionCost C, Value *V) {
5164 return C + TTI.getInstructionCost(
5165 cast<Instruction>(V), CostKind);
5166 }) +
5167 ScalarGEPCost;
5168 // The cost of masked gather.
5169 InstructionCost MaskedGatherCost =
5171 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5172 /*VariableMask=*/false, CommonAlignment, CostKind) +
5173 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5174 InstructionCost GatherCost =
5175 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5176 /*Extract=*/false, CostKind) +
5177 ScalarLoadsCost;
5178 // The list of loads is small, or the partial check was already performed -
5179 // directly compare the masked gather cost and the gather cost.
5180 constexpr unsigned ListLimit = 4;
5181 if (!TryRecursiveCheck || VL.size() < ListLimit)
5182 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5183
5184 // FIXME: The following code has not been updated for non-power-of-2
5185 // vectors. The splitting logic here does not cover the original
5186 // vector if the vector factor is not a power of two.
5187 if (!has_single_bit(VL.size()))
5188 return false;
5189
5190 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5191 unsigned MinVF = getMinVF(2 * Sz);
5192 DemandedElts.clearAllBits();
5193 // Iterate through possible vectorization factors and check if vectorized +
5194 // shuffles is better than just gather.
5195 for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5197 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5198 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5200 SmallVector<Value *> PointerOps;
5201 LoadsState LS =
5202 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5203 /*TryRecursiveCheck=*/false);
5204 // Check that the sorted loads are consecutive.
5205 if (LS == LoadsState::Gather) {
5206 if (BestVF) {
5207 DemandedElts.setAllBits();
5208 break;
5209 }
5210 DemandedElts.setBits(Cnt, Cnt + VF);
5211 continue;
5212 }
5213 // If the reorder is needed - consider it as a high-cost masked gather for now.
5214 if ((LS == LoadsState::Vectorize ||
5216 !Order.empty() && !isReverseOrder(Order))
5218 States.push_back(LS);
5219 }
5220 if (DemandedElts.isAllOnes())
5221 // All loads gathered - try smaller VF.
5222 continue;
5223 // Can be vectorized later as a series of loads/insertelements.
5224 InstructionCost VecLdCost = 0;
5225 if (!DemandedElts.isZero()) {
5226 VecLdCost =
5227 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5228 /*Extract=*/false, CostKind) +
5229 ScalarGEPCost;
5230 for (unsigned Idx : seq<unsigned>(VL.size()))
5231 if (DemandedElts[Idx])
5232 VecLdCost +=
5233 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5234 }
5235 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5236 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5237 for (auto [I, LS] : enumerate(States)) {
5238 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5239 InstructionCost VectorGEPCost =
5240 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5241 ? 0
5242 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5243 LI0->getPointerOperand(),
5244 Instruction::GetElementPtr, CostKind, ScalarTy,
5245 SubVecTy)
5246 .second;
5247 if (LS == LoadsState::ScatterVectorize) {
5248 if (static_cast<unsigned>(
5249 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5250 PointerOps.size() - 1 ||
5251 any_of(PointerOps, [&](Value *V) {
5252 return getUnderlyingObject(V) !=
5253 getUnderlyingObject(PointerOps.front());
5254 }))
5255 VectorGEPCost += TTI.getScalarizationOverhead(
5256 SubVecTy, APInt::getAllOnes(VF),
5257 /*Insert=*/true, /*Extract=*/false, CostKind);
5258 else
5259 VectorGEPCost +=
5261 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5262 /*Insert=*/true, /*Extract=*/false, CostKind) +
5263 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5264 CostKind);
5265 }
5266 switch (LS) {
5268 VecLdCost +=
5269 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5270 LI0->getPointerAddressSpace(), CostKind,
5272 VectorGEPCost;
5273 break;
5275 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5276 LI0->getPointerOperand(),
5277 /*VariableMask=*/false,
5278 CommonAlignment, CostKind) +
5279 VectorGEPCost;
5280 break;
5282 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5283 LI0->getPointerOperand(),
5284 /*VariableMask=*/false,
5285 CommonAlignment, CostKind) +
5286 VectorGEPCost;
5287 break;
5288 case LoadsState::Gather:
5289 // Gathers are already calculated - ignore.
5290 continue;
5291 }
5292 SmallVector<int> ShuffleMask(VL.size());
5293 for (int Idx : seq<int>(0, VL.size()))
5294 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5295 if (I > 0)
5296 VecLdCost +=
5297 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5298 CostKind, I * VF, SubVecTy);
5299 }
5300 // If masked gather cost is higher - better to vectorize, so
5301 // consider it as a gather node. It will be better estimated
5302 // later.
5303 if (MaskedGatherCost >= VecLdCost &&
5304 VecLdCost - GatherCost < -SLPCostThreshold) {
5305 if (BestVF)
5306 *BestVF = VF;
5307 return true;
5308 }
5309 }
5310 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5311 };
5312 // TODO: need to improve analysis of the pointers, if not all of them are
5313 // GEPs or have > 2 operands, we end up with a gather node, which just
5314 // increases the cost.
5315 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5316 bool ProfitableGatherPointers =
5317 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5318 return L->isLoopInvariant(V);
5319 })) <= Sz / 2;
5320 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5321 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5322 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5323 (GEP && GEP->getNumOperands() == 2 &&
5324 isa<Constant, Instruction>(GEP->getOperand(1)));
5325 })) {
5326 // Check if potential masked gather can be represented as series
5327 // of loads + insertsubvectors.
5328 // If masked gather cost is higher - better to vectorize, so
5329 // consider it as a gather node. It will be better estimated
5330 // later.
5331 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5332 ProfitableGatherPointers))
5334 }
5335
5336 return LoadsState::Gather;
5337}
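// A hedged sketch of the distance-based classification above (simplified; the
// real code also consults TTI legality, alignment and pointer-use heuristics).
// Diff is the element distance between the lowest and highest pointers of the
// bundle; the names below are illustrative only.
enum class LoadBundleKindSketch { Consecutive, StridedCandidate, Other };
static LoadBundleKindSketch classifyLoadBundleByDiff(int Diff, unsigned Sz) {
  if (Sz > 1 && Diff == static_cast<int>(Sz) - 1)
    return LoadBundleKindSketch::Consecutive; // ld, ld+1, ..., ld+Sz-1.
  if (Sz > 1 && Diff % static_cast<int>(Sz - 1) == 0)
    return LoadBundleKindSketch::StridedCandidate; // Evenly spaced loads.
  return LoadBundleKindSketch::Other; // Masked gather or plain gather.
}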
5338
5339static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
5340 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5341 const DataLayout &DL, ScalarEvolution &SE,
5342 SmallVectorImpl<unsigned> &SortedIndices) {
5343 assert(
5344 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5345 "Expected list of pointer operands.");
5346 // Map from bases to a vector of (Ptr, Offset, OrigIdx) into which we insert
5347 // each Ptr; we then sort and return the sorted indices so that related values
5348 // end up next to one another.
5351 Bases;
5352 Bases
5353 .try_emplace(std::make_pair(
5355 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5356
5357 SortedIndices.clear();
5358 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5359 auto Key = std::make_pair(BBs[Cnt + 1],
5361 bool Found = any_of(Bases.try_emplace(Key).first->second,
5362 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5363 std::optional<int> Diff = getPointersDiff(
5364 ElemTy, std::get<0>(Base.front()), ElemTy,
5365 Ptr, DL, SE,
5366 /*StrictCheck=*/true);
5367 if (!Diff)
5368 return false;
5369
5370 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5371 return true;
5372 });
5373
5374 if (!Found) {
5375 // If we haven't found enough to usefully cluster, return early.
5376 if (Bases.size() > VL.size() / 2 - 1)
5377 return false;
5378
5379 // Not found already - add a new Base
5380 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5381 }
5382 }
5383
5384 if (Bases.size() == VL.size())
5385 return false;
5386
5387 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5388 Bases.front().second.size() == VL.size()))
5389 return false;
5390
5391 // For each of the bases, sort the pointers by Offset and check if any of the
5392 // bases become consecutively allocated.
5393 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5394 SmallPtrSet<Value *, 13> FirstPointers;
5395 SmallPtrSet<Value *, 13> SecondPointers;
5396 Value *P1 = Ptr1;
5397 Value *P2 = Ptr2;
5398 unsigned Depth = 0;
5399 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5400 if (P1 == P2 || Depth > RecursionMaxDepth)
5401 return false;
5402 FirstPointers.insert(P1);
5403 SecondPointers.insert(P2);
5404 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5405 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5406 ++Depth;
5407 }
5408 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5409 "Unable to find matching root.");
5410 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5411 };
5412 for (auto &Base : Bases) {
5413 for (auto &Vec : Base.second) {
5414 if (Vec.size() > 1) {
5415 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5416 const std::tuple<Value *, int, unsigned> &Y) {
5417 return std::get<1>(X) < std::get<1>(Y);
5418 });
5419 int InitialOffset = std::get<1>(Vec[0]);
5420 bool AnyConsecutive =
5421 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5422 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5423 });
5424 // Fill the SortedIndices array only if it looks worthwhile to sort the
5425 // ptrs.
5426 if (!AnyConsecutive)
5427 return false;
5428 }
5429 }
5430 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5431 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5432 });
5433 }
5434
5435 for (auto &T : Bases)
5436 for (const auto &Vec : T.second)
5437 for (const auto &P : Vec)
5438 SortedIndices.push_back(std::get<2>(P));
5439
5440 assert(SortedIndices.size() == VL.size() &&
5441 "Expected SortedIndices to be the size of VL");
5442 return true;
5443}
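// An illustrative model of the per-base acceptance test above (plain
// integers, no SCEV): after sorting a base's pointers by offset, the cluster
// is kept only if the offsets form a contiguous run starting at the first
// offset. The helper name is hypothetical.
static bool offsetsFormContiguousRun(ArrayRef<int> SortedOffsets) {
  for (unsigned I = 0, E = SortedOffsets.size(); I != E; ++I)
    if (SortedOffsets[I] != SortedOffsets[0] + static_cast<int>(I))
      return false;
  return true;
}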
5444
5445std::optional<BoUpSLP::OrdersType>
5446BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5447 assert(TE.isGather() && "Expected gather node only.");
5448 Type *ScalarTy = TE.Scalars[0]->getType();
5449
5451 Ptrs.reserve(TE.Scalars.size());
5453 BBs.reserve(TE.Scalars.size());
5454 for (Value *V : TE.Scalars) {
5455 auto *L = dyn_cast<LoadInst>(V);
5456 if (!L || !L->isSimple())
5457 return std::nullopt;
5458 Ptrs.push_back(L->getPointerOperand());
5459 BBs.push_back(L->getParent());
5460 }
5461
5462 BoUpSLP::OrdersType Order;
5463 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5464 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5465 return std::move(Order);
5466 return std::nullopt;
5467}
5468
5469/// Check if two insertelement instructions are from the same buildvector.
5470static bool areTwoInsertFromSameBuildVector(
5471 InsertElementInst *VU, InsertElementInst *V,
5472 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5473 // Instructions must be from the same basic block.
5474 if (VU->getParent() != V->getParent())
5475 return false;
5476 // Checks if 2 insertelements are from the same buildvector.
5477 if (VU->getType() != V->getType())
5478 return false;
5479 // Inserts with multiple uses are separate nodes.
5480 if (!VU->hasOneUse() && !V->hasOneUse())
5481 return false;
5482 auto *IE1 = VU;
5483 auto *IE2 = V;
5484 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5485 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5486 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5487 return false;
5488 // Go through the vector operand of insertelement instructions trying to find
5489 // either VU as the original vector for IE2 or V as the original vector for
5490 // IE1.
5491 SmallBitVector ReusedIdx(
5492 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5493 bool IsReusedIdx = false;
5494 do {
5495 if (IE2 == VU && !IE1)
5496 return VU->hasOneUse();
5497 if (IE1 == V && !IE2)
5498 return V->hasOneUse();
5499 if (IE1 && IE1 != V) {
5500 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5501 IsReusedIdx |= ReusedIdx.test(Idx1);
5502 ReusedIdx.set(Idx1);
5503 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5504 IE1 = nullptr;
5505 else
5506 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5507 }
5508 if (IE2 && IE2 != VU) {
5509 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5510 IsReusedIdx |= ReusedIdx.test(Idx2);
5511 ReusedIdx.set(Idx2);
5512 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5513 IE2 = nullptr;
5514 else
5515 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5516 }
5517 } while (!IsReusedIdx && (IE1 || IE2));
5518 return false;
5519}
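// A worked example (assumed IR, not from a test): in the chain
//   %b0 = insertelement <4 x float> poison, float %x, i32 0
//   %b1 = insertelement <4 x float> %b0, float %y, i32 1
//   %b2 = insertelement <4 x float> %b1, float %z, i32 2
// the walk above reaches %b0 from %b2 through operand 0 without reusing an
// insertion index, so %b2 and %b0 are treated as parts of the same
// buildvector. A hedged sketch of the same single-use chain walk (the helper
// name is hypothetical):
static InsertElementInst *findBuildVectorHeadSketch(InsertElementInst *IE) {
  // Follow operand 0 while it is a single-use insertelement in the same block.
  while (auto *Base = dyn_cast<InsertElementInst>(IE->getOperand(0))) {
    if (!Base->hasOneUse() || Base->getParent() != IE->getParent())
      break;
    IE = Base;
  }
  return IE;
}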
5520
5521std::optional<BoUpSLP::OrdersType>
5522BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5523 // No need to reorder if we need to shuffle reuses; we still need to shuffle
5524 // the node.
5525 if (!TE.ReuseShuffleIndices.empty()) {
5526 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5527 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5528 "Reshuffling scalars not yet supported for nodes with padding");
5529
5530 if (isSplat(TE.Scalars))
5531 return std::nullopt;
5532 // Check if reuse shuffle indices can be improved by reordering.
5533 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5534 // is used once in each submask of size <number_of_scalars>.
5535 // Example: 4 scalar values.
5536 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5537 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5538 // element 3 is used twice in the second submask.
5539 unsigned Sz = TE.Scalars.size();
5540 if (TE.isGather()) {
5541 if (std::optional<OrdersType> CurrentOrder =
5543 SmallVector<int> Mask;
5544 fixupOrderingIndices(*CurrentOrder);
5545 inversePermutation(*CurrentOrder, Mask);
5546 ::addMask(Mask, TE.ReuseShuffleIndices);
5547 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5548 unsigned Sz = TE.Scalars.size();
5549 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5550 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5551 if (Idx != PoisonMaskElem)
5552 Res[Idx + K * Sz] = I + K * Sz;
5553 }
5554 return std::move(Res);
5555 }
5556 }
5557 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5558 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5559 2 * TE.getVectorFactor())) == 1)
5560 return std::nullopt;
5561 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5562 Sz)) {
5563 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5564 if (TE.ReorderIndices.empty())
5565 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5566 else
5567 inversePermutation(TE.ReorderIndices, ReorderMask);
5568 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5569 unsigned VF = ReorderMask.size();
5570 OrdersType ResOrder(VF, VF);
5571 unsigned NumParts = divideCeil(VF, Sz);
5572 SmallBitVector UsedVals(NumParts);
5573 for (unsigned I = 0; I < VF; I += Sz) {
5574 int Val = PoisonMaskElem;
5575 unsigned UndefCnt = 0;
5576 unsigned Limit = std::min(Sz, VF - I);
5577 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5578 [&](int Idx) {
5579 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5580 Val = Idx;
5581 if (Idx == PoisonMaskElem)
5582 ++UndefCnt;
5583 return Idx != PoisonMaskElem && Idx != Val;
5584 }) ||
5585 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5586 UndefCnt > Sz / 2)
5587 return std::nullopt;
5588 UsedVals.set(Val);
5589 for (unsigned K = 0; K < NumParts; ++K) {
5590 unsigned Idx = Val + Sz * K;
5591 if (Idx < VF)
5592 ResOrder[Idx] = I + K;
5593 }
5594 }
5595 return std::move(ResOrder);
5596 }
5597 unsigned VF = TE.getVectorFactor();
5598 // Try to build the correct order for extractelement instructions.
5599 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5600 TE.ReuseShuffleIndices.end());
5601 if (TE.getOpcode() == Instruction::ExtractElement &&
5602 all_of(TE.Scalars, [Sz](Value *V) {
5603 if (isa<PoisonValue>(V))
5604 return true;
5605 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5606 return Idx && *Idx < Sz;
5607 })) {
5608 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5609 "by BinaryOperator and CastInst.");
5610 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5611 if (TE.ReorderIndices.empty())
5612 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5613 else
5614 inversePermutation(TE.ReorderIndices, ReorderMask);
5615 for (unsigned I = 0; I < VF; ++I) {
5616 int &Idx = ReusedMask[I];
5617 if (Idx == PoisonMaskElem)
5618 continue;
5619 Value *V = TE.Scalars[ReorderMask[Idx]];
5620 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5621 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5622 }
5623 }
5624 // Build the order of VF size; we need to reorder the reuses shuffles, which
5625 // are always of VF size.
5626 OrdersType ResOrder(VF);
5627 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5628 auto *It = ResOrder.begin();
5629 for (unsigned K = 0; K < VF; K += Sz) {
5630 OrdersType CurrentOrder(TE.ReorderIndices);
5631 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5632 if (SubMask.front() == PoisonMaskElem)
5633 std::iota(SubMask.begin(), SubMask.end(), 0);
5634 reorderOrder(CurrentOrder, SubMask);
5635 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5636 std::advance(It, Sz);
5637 }
5638 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5639 return Data.index() == Data.value();
5640 }))
5641 return std::nullopt; // No need to reorder.
5642 return std::move(ResOrder);
5643 }
5644 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5645 any_of(TE.UserTreeIndices,
5646 [](const EdgeInfo &EI) {
5647 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5648 }) &&
5649 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5650 return std::nullopt;
5651 if ((TE.State == TreeEntry::Vectorize ||
5652 TE.State == TreeEntry::StridedVectorize) &&
5653 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5654 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5655 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5656 "BinaryOperator and CastInst.");
5657 return TE.ReorderIndices;
5658 }
5659 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5660 if (!TE.ReorderIndices.empty())
5661 return TE.ReorderIndices;
5662
5663 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5664 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5665 if (!V->hasNUsesOrMore(1))
5666 continue;
5667 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5668 if (!II)
5669 continue;
5670 Instruction *BVHead = nullptr;
5671 BasicBlock *BB = II->getParent();
5672 while (II && II->hasOneUse() && II->getParent() == BB) {
5673 BVHead = II;
5674 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5675 }
5676 I = BVHead;
5677 }
5678
5679 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5680 assert(BB1 != BB2 && "Expected different basic blocks.");
5681 auto *NodeA = DT->getNode(BB1);
5682 auto *NodeB = DT->getNode(BB2);
5683 assert(NodeA && "Should only process reachable instructions");
5684 assert(NodeB && "Should only process reachable instructions");
5685 assert((NodeA == NodeB) ==
5686 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5687 "Different nodes should have different DFS numbers");
5688 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5689 };
5690 auto PHICompare = [&](unsigned I1, unsigned I2) {
5691 Value *V1 = TE.Scalars[I1];
5692 Value *V2 = TE.Scalars[I2];
5693 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5694 return false;
5695 if (isa<PoisonValue>(V1))
5696 return true;
5697 if (isa<PoisonValue>(V2))
5698 return false;
5699 if (V1->getNumUses() < V2->getNumUses())
5700 return true;
5701 if (V1->getNumUses() > V2->getNumUses())
5702 return false;
5703 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5704 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5705 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5706 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5707 FirstUserOfPhi2->getParent());
5708 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5709 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5710 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5711 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5712 if (IE1 && !IE2)
5713 return true;
5714 if (!IE1 && IE2)
5715 return false;
5716 if (IE1 && IE2) {
5717 if (UserBVHead[I1] && !UserBVHead[I2])
5718 return true;
5719 if (!UserBVHead[I1])
5720 return false;
5721 if (UserBVHead[I1] == UserBVHead[I2])
5722 return getElementIndex(IE1) < getElementIndex(IE2);
5723 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5724 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5725 UserBVHead[I2]->getParent());
5726 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5727 }
5728 if (EE1 && !EE2)
5729 return true;
5730 if (!EE1 && EE2)
5731 return false;
5732 if (EE1 && EE2) {
5733 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5734 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5735 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5736 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5737 if (!Inst2 && !P2)
5738 return Inst1 || P1;
5739 if (EE1->getOperand(0) == EE2->getOperand(0))
5740 return getElementIndex(EE1) < getElementIndex(EE2);
5741 if (!Inst1 && Inst2)
5742 return false;
5743 if (Inst1 && Inst2) {
5744 if (Inst1->getParent() != Inst2->getParent())
5745 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5746 return Inst1->comesBefore(Inst2);
5747 }
5748 if (!P1 && P2)
5749 return false;
5750 assert(P1 && P2 &&
5751 "Expected either instructions or arguments vector operands.");
5752 return P1->getArgNo() < P2->getArgNo();
5753 }
5754 return false;
5755 };
5756 OrdersType Phis(TE.Scalars.size());
5757 std::iota(Phis.begin(), Phis.end(), 0);
5758 stable_sort(Phis, PHICompare);
5759 if (isIdentityOrder(Phis))
5760 return std::nullopt; // No need to reorder.
5761 return std::move(Phis);
5762 }
5763 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5764 // TODO: add analysis of other gather nodes with extractelement
5765 // instructions and other values/instructions, not only undefs.
5766 if ((TE.getOpcode() == Instruction::ExtractElement ||
5767 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5768 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5769 all_of(TE.Scalars, [](Value *V) {
5770 auto *EE = dyn_cast<ExtractElementInst>(V);
5771 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5772 })) {
5773 // Check that gather of extractelements can be represented as
5774 // just a shuffle of a single vector.
5775 OrdersType CurrentOrder;
5776 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5777 /*ResizeAllowed=*/true);
5778 if (Reuse || !CurrentOrder.empty())
5779 return std::move(CurrentOrder);
5780 }
5781 // If the gather node is <undef, v, .., poison> and
5782 // insertelement poison, v, 0 [+ permute]
5783 // is cheaper than
5784 // insertelement poison, v, n - try to reorder.
5785 // If rotating the whole graph, exclude the permute cost, the whole graph
5786 // might be transformed.
5787 int Sz = TE.Scalars.size();
5788 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5789 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5790 const auto *It =
5791 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5792 if (It == TE.Scalars.begin())
5793 return OrdersType();
5794 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5795 if (It != TE.Scalars.end()) {
5796 OrdersType Order(Sz, Sz);
5797 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5798 Order[Idx] = 0;
5799 fixupOrderingIndices(Order);
5800 SmallVector<int> Mask;
5801 inversePermutation(Order, Mask);
5802 InstructionCost PermuteCost =
5803 TopToBottom
5804 ? 0
5806 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5807 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5808 PoisonValue::get(Ty), *It);
5809 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5810 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5811 PoisonValue::get(Ty), *It);
5812 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5813 OrdersType Order(Sz, Sz);
5814 Order[Idx] = 0;
5815 return std::move(Order);
5816 }
5817 }
5818 }
5819 if (isSplat(TE.Scalars))
5820 return std::nullopt;
5821 if (TE.Scalars.size() >= 3)
5822 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5823 return Order;
5824 // Check if we can include the order of vectorized loads. For masked gathers,
5825 // do extra analysis later, so include such nodes into a special list.
5826 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5827 SmallVector<Value *> PointerOps;
5828 OrdersType CurrentOrder;
5829 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5830 CurrentOrder, PointerOps);
5832 return std::move(CurrentOrder);
5833 }
5834 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5835 // has been audited for correctness with non-power-of-two vectors.
5836 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5837 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5838 return CurrentOrder;
5839 }
5840 return std::nullopt;
5841}
5842
5843/// Checks if the given mask is a "clustered" mask with the same clusters of
5844/// size \p Sz, which are not identity submasks.
5845static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5846 unsigned Sz) {
5847 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5848 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5849 return false;
5850 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5851 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5852 if (Cluster != FirstCluster)
5853 return false;
5854 }
5855 return true;
5856}
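// An illustrative usage sketch (not from the original source): with Sz == 4,
// {2, 3, 0, 1, 2, 3, 0, 1} is a repeated non-identity clustered mask, while
// {0, 1, 2, 3, 0, 1, 2, 3} is rejected because its first cluster is an
// identity mask.
static void clusteredMaskExample() {
  SmallVector<int> Mask = {2, 3, 0, 1, 2, 3, 0, 1};
  bool Clustered = isRepeatedNonIdentityClusteredMask(Mask, 4);
  assert(Clustered && "Expected a repeated non-identity clustered mask.");
  (void)Clustered;
}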
5857
5858void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5859 // Reorder reuses mask.
5860 reorderReuses(TE.ReuseShuffleIndices, Mask);
5861 const unsigned Sz = TE.Scalars.size();
5862 // For vectorized nodes and non-clustered reuses, no need to do anything else.
5863 if (!TE.isGather() ||
5865 Sz) ||
5866 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5867 return;
5868 SmallVector<int> NewMask;
5869 inversePermutation(TE.ReorderIndices, NewMask);
5870 addMask(NewMask, TE.ReuseShuffleIndices);
5871 // Clear reorder since it is going to be applied to the new mask.
5872 TE.ReorderIndices.clear();
5873 // Try to improve gathered nodes with clustered reuses, if possible.
5874 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5875 SmallVector<unsigned> NewOrder(Slice);
5876 inversePermutation(NewOrder, NewMask);
5877 reorderScalars(TE.Scalars, NewMask);
5878 // Fill the reuses mask with the identity submasks.
5879 for (auto *It = TE.ReuseShuffleIndices.begin(),
5880 *End = TE.ReuseShuffleIndices.end();
5881 It != End; std::advance(It, Sz))
5882 std::iota(It, std::next(It, Sz), 0);
5883}
5884
5885static void combineOrders(MutableArrayRef<unsigned> Order,
5886 ArrayRef<unsigned> SecondaryOrder) {
5887 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5888 "Expected same size of orders");
5889 unsigned Sz = Order.size();
5890 SmallBitVector UsedIndices(Sz);
5891 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5892 if (Order[Idx] != Sz)
5893 UsedIndices.set(Order[Idx]);
5894 }
5895 if (SecondaryOrder.empty()) {
5896 for (unsigned Idx : seq<unsigned>(0, Sz))
5897 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5898 Order[Idx] = Idx;
5899 } else {
5900 for (unsigned Idx : seq<unsigned>(0, Sz))
5901 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5902 !UsedIndices.test(SecondaryOrder[Idx]))
5903 Order[Idx] = SecondaryOrder[Idx];
5904 }
5905}
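// An illustrative usage sketch (not from the original source): unset slots
// are encoded as the order size. With Order = {1, 4, 4, 0} and
// SecondaryOrder = {1, 2, 3, 0}, the unset slots 1 and 2 are filled from the
// secondary order, giving {1, 2, 3, 0}.
static void combineOrdersExample() {
  SmallVector<unsigned> Order = {1, 4, 4, 0};
  SmallVector<unsigned> SecondaryOrder = {1, 2, 3, 0};
  combineOrders(Order, SecondaryOrder);
  assert(Order[1] == 2 && Order[2] == 3 && "Expected unset slots to be "
                                           "filled from the secondary order.");
}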
5906
5907void BoUpSLP::reorderTopToBottom() {
5908 // Maps VF to the graph nodes.
5910 // ExtractElement gather nodes which can be vectorized and need to handle
5911 // their ordering.
5913
5914 // Phi nodes can have preferred ordering based on their result users
5916
5917 // AltShuffles can also have a preferred ordering that leads to fewer
5918 // instructions, e.g., the addsub instruction in x86.
5919 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5920
5921 // Maps a TreeEntry to the reorder indices of external users.
5923 ExternalUserReorderMap;
5924 // Find all reorderable nodes with the given VF.
5925 // Currently these are vectorized stores, loads, extracts + some gathering
5926 // of extracts.
5927 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5928 const std::unique_ptr<TreeEntry> &TE) {
5929 // Look for external users that will probably be vectorized.
5930 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5931 findExternalStoreUsersReorderIndices(TE.get());
5932 if (!ExternalUserReorderIndices.empty()) {
5933 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5934 ExternalUserReorderMap.try_emplace(TE.get(),
5935 std::move(ExternalUserReorderIndices));
5936 }
5937
5938 // Patterns like [fadd,fsub] can be combined into a single instruction in
5939 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5940 // to take into account their order when looking for the most used order.
5941 if (TE->isAltShuffle()) {
5942 VectorType *VecTy =
5943 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5944 unsigned Opcode0 = TE->getOpcode();
5945 unsigned Opcode1 = TE->getAltOpcode();
5946 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5947 // If this pattern is supported by the target then we consider the order.
5948 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5949 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5950 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5951 }
5952 // TODO: Check the reverse order too.
5953 }
5954
5955 if (std::optional<OrdersType> CurrentOrder =
5956 getReorderingData(*TE, /*TopToBottom=*/true)) {
5957 // Do not include ordering for nodes used in the alt opcode vectorization;
5958 // it is better to reorder them during the bottom-to-top stage. If we follow
5959 // the order here, it causes reordering of the whole graph, though actually
5960 // it is profitable just to reorder the subgraph that starts from the
5961 // alternate opcode vectorization node. Such nodes already end up with the
5962 // shuffle instruction and it is enough to change this shuffle rather than
5963 // rotate the scalars for the whole graph.
5964 unsigned Cnt = 0;
5965 const TreeEntry *UserTE = TE.get();
5966 while (UserTE && Cnt < RecursionMaxDepth) {
5967 if (UserTE->UserTreeIndices.size() != 1)
5968 break;
5969 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5970 return EI.UserTE->State == TreeEntry::Vectorize &&
5971 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5972 }))
5973 return;
5974 UserTE = UserTE->UserTreeIndices.back().UserTE;
5975 ++Cnt;
5976 }
5977 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5978 if (!(TE->State == TreeEntry::Vectorize ||
5979 TE->State == TreeEntry::StridedVectorize) ||
5980 !TE->ReuseShuffleIndices.empty())
5981 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5982 if (TE->State == TreeEntry::Vectorize &&
5983 TE->getOpcode() == Instruction::PHI)
5984 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5985 }
5986 });
5987
5988 // Reorder the graph nodes according to their vectorization factor.
5989 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5990 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5991 auto It = VFToOrderedEntries.find(VF);
5992 if (It == VFToOrderedEntries.end())
5993 continue;
5994 // Try to find the most profitable order. We are just looking for the most
5995 // used order and reorder the scalar elements in the nodes according to this
5996 // most used order.
5997 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5998 // Delete VF entry upon exit.
5999 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6000
6001 // All operands are reordered and used only in this node - propagate the
6002 // most used order to the user node.
6005 OrdersUses;
6007 for (const TreeEntry *OpTE : OrderedEntries) {
6008 // No need to reorder these nodes; we still need to extend and to use a
6009 // shuffle, just need to merge the reordering shuffle and the reuse shuffle.
6010 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6011 continue;
6012 // Count number of orders uses.
6013 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6014 &PhisToOrders]() -> const OrdersType & {
6015 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6016 auto It = GathersToOrders.find(OpTE);
6017 if (It != GathersToOrders.end())
6018 return It->second;
6019 }
6020 if (OpTE->isAltShuffle()) {
6021 auto It = AltShufflesToOrders.find(OpTE);
6022 if (It != AltShufflesToOrders.end())
6023 return It->second;
6024 }
6025 if (OpTE->State == TreeEntry::Vectorize &&
6026 OpTE->getOpcode() == Instruction::PHI) {
6027 auto It = PhisToOrders.find(OpTE);
6028 if (It != PhisToOrders.end())
6029 return It->second;
6030 }
6031 return OpTE->ReorderIndices;
6032 }();
6033 // First consider the order of the external scalar users.
6034 auto It = ExternalUserReorderMap.find(OpTE);
6035 if (It != ExternalUserReorderMap.end()) {
6036 const auto &ExternalUserReorderIndices = It->second;
6037 // If the OpTE vector factor != number of scalars, use the natural order;
6038 // this is an attempt to reorder a node with reused scalars but with
6039 // external uses.
6040 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6041 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6042 ExternalUserReorderIndices.size();
6043 } else {
6044 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6045 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6046 }
6047 // No other useful reorder data in this entry.
6048 if (Order.empty())
6049 continue;
6050 }
6051 // Stores actually store the mask, not the order, so we need to invert it.
6052 if (OpTE->State == TreeEntry::Vectorize &&
6053 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6054 assert(!OpTE->isAltShuffle() &&
6055 "Alternate instructions are only supported by BinaryOperator "
6056 "and CastInst.");
6057 SmallVector<int> Mask;
6058 inversePermutation(Order, Mask);
6059 unsigned E = Order.size();
6060 OrdersType CurrentOrder(E, E);
6061 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6062 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6063 });
6064 fixupOrderingIndices(CurrentOrder);
6065 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6066 } else {
6067 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6068 }
6069 }
6070 if (OrdersUses.empty())
6071 continue;
6072 // Choose the most used order.
6073 unsigned IdentityCnt = 0;
6074 unsigned FilledIdentityCnt = 0;
6075 OrdersType IdentityOrder(VF, VF);
6076 for (auto &Pair : OrdersUses) {
6077 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6078 if (!Pair.first.empty())
6079 FilledIdentityCnt += Pair.second;
6080 IdentityCnt += Pair.second;
6081 combineOrders(IdentityOrder, Pair.first);
6082 }
6083 }
6084 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6085 unsigned Cnt = IdentityCnt;
6086 for (auto &Pair : OrdersUses) {
6087 // Prefer the identity order. But if a filled identity (non-empty order)
6088 // was found with the same number of uses as the new candidate order, we
6089 // can choose this candidate order.
6090 if (Cnt < Pair.second ||
6091 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6092 Cnt == Pair.second && !BestOrder.empty() &&
6093 isIdentityOrder(BestOrder))) {
6094 combineOrders(Pair.first, BestOrder);
6095 BestOrder = Pair.first;
6096 Cnt = Pair.second;
6097 } else {
6098 combineOrders(BestOrder, Pair.first);
6099 }
6100 }
6101 // Set order of the user node.
6102 if (isIdentityOrder(BestOrder))
6103 continue;
6104 fixupOrderingIndices(BestOrder);
6105 SmallVector<int> Mask;
6106 inversePermutation(BestOrder, Mask);
6107 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6108 unsigned E = BestOrder.size();
6109 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6110 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6111 });
6112 // Do an actual reordering, if profitable.
6113 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6114 // Just do the reordering for the nodes with the given VF.
6115 if (TE->Scalars.size() != VF) {
6116 if (TE->ReuseShuffleIndices.size() == VF) {
6117 // Need to reorder the reuse masks of the operands with a smaller VF to
6118 // be able to find the match between the graph nodes and the scalar
6119 // operands of the given node during vectorization/cost estimation.
6120 assert(all_of(TE->UserTreeIndices,
6121 [VF, &TE](const EdgeInfo &EI) {
6122 return EI.UserTE->Scalars.size() == VF ||
6123 EI.UserTE->Scalars.size() ==
6124 TE->Scalars.size();
6125 }) &&
6126 "All users must be of VF size.");
6127 if (SLPReVec) {
6128 assert(SLPReVec && "Only supported by REVEC.");
6129 // ShuffleVectorInst does not do reorderOperands (and it should not
6130 // because ShuffleVectorInst supports only a limited set of
6131 // patterns). Only do reorderNodeWithReuses if all of the users are
6132 // not ShuffleVectorInst.
6133 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6134 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6135 }))
6136 continue;
6137 assert(none_of(TE->UserTreeIndices,
6138 [&](const EdgeInfo &EI) {
6139 return isa<ShuffleVectorInst>(
6140 EI.UserTE->getMainOp());
6141 }) &&
6142 "Does not know how to reorder.");
6143 }
6144 // Update ordering of the operands with the smaller VF than the given
6145 // one.
6146 reorderNodeWithReuses(*TE, Mask);
6147 }
6148 continue;
6149 }
6150 if ((TE->State == TreeEntry::Vectorize ||
6151 TE->State == TreeEntry::StridedVectorize) &&
6152 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
6153 InsertElementInst>(TE->getMainOp()) ||
6154 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6155 assert(!TE->isAltShuffle() &&
6156 "Alternate instructions are only supported by BinaryOperator "
6157 "and CastInst.");
6158 // Build correct orders for extract{element,value}, loads and
6159 // stores.
6160 reorderOrder(TE->ReorderIndices, Mask);
6161 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6162 TE->reorderOperands(Mask);
6163 } else {
6164 // Reorder the node and its operands.
6165 TE->reorderOperands(Mask);
6166 assert(TE->ReorderIndices.empty() &&
6167 "Expected empty reorder sequence.");
6168 reorderScalars(TE->Scalars, Mask);
6169 }
6170 if (!TE->ReuseShuffleIndices.empty()) {
6171 // Apply reversed order to keep the original ordering of the reused
6172 // elements to avoid extra reorder indices shuffling.
6173 OrdersType CurrentOrder;
6174 reorderOrder(CurrentOrder, MaskOrder);
6175 SmallVector<int> NewReuses;
6176 inversePermutation(CurrentOrder, NewReuses);
6177 addMask(NewReuses, TE->ReuseShuffleIndices);
6178 TE->ReuseShuffleIndices.swap(NewReuses);
6179 }
6180 }
6181 }
6182}
6183
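// --- Editorial illustration (not part of the original source) ---------------
// The reordering code above repeatedly converts between an "order" (a
// permutation of lane indices) and a shuffle "mask" via inversePermutation().
// A minimal standalone sketch of that inversion, assuming the convention that
// Order[I] == J means lane J is placed at position I; the helper name
// demoInvertOrder is hypothetical and exists only for this illustration.
static SmallVector<unsigned> demoInvertOrder(ArrayRef<unsigned> Order) {
  SmallVector<unsigned> Inverse(Order.size());
  for (unsigned I = 0, E = Order.size(); I < E; ++I)
    Inverse[Order[I]] = I;
  return Inverse;
}
// For Order = {2, 0, 1} this yields {1, 2, 0}; composing the two mappings
// gives the identity, which is why applying the inverted mask cancels a
// previously applied reordering.
// -----------------------------------------------------------------------------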
6184bool BoUpSLP::canReorderOperands(
6185 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6186 ArrayRef<TreeEntry *> ReorderableGathers,
6187 SmallVectorImpl<TreeEntry *> &GatherOps) {
6188 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6189 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6190 return OpData.first == I &&
6191 (OpData.second->State == TreeEntry::Vectorize ||
6192 OpData.second->State == TreeEntry::StridedVectorize);
6193 }))
6194 continue;
6195 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6196 // Do not reorder if operand node is used by many user nodes.
6197 if (any_of(TE->UserTreeIndices,
6198 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6199 return false;
6200 // Add the node to the list of the ordered nodes with the identity
6201 // order.
6202 Edges.emplace_back(I, TE);
6203 // Add ScatterVectorize nodes to the list of operands, where just
6204 // reordering of the scalars is required. Similar to the gathers, so
6205 // simply add to the list of gathered ops.
6206 // If there are reused scalars, process this node as a regular vectorize
6207 // node and just reorder the reuse mask.
6208 if (TE->State != TreeEntry::Vectorize &&
6209 TE->State != TreeEntry::StridedVectorize &&
6210 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6211 GatherOps.push_back(TE);
6212 continue;
6213 }
6214 TreeEntry *Gather = nullptr;
6215 if (count_if(ReorderableGathers,
6216 [&Gather, UserTE, I](TreeEntry *TE) {
6217 assert(TE->State != TreeEntry::Vectorize &&
6218 TE->State != TreeEntry::StridedVectorize &&
6219 "Only non-vectorized nodes are expected.");
6220 if (any_of(TE->UserTreeIndices,
6221 [UserTE, I](const EdgeInfo &EI) {
6222 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6223 })) {
6224 assert(TE->isSame(UserTE->getOperand(I)) &&
6225 "Operand entry does not match operands.");
6226 Gather = TE;
6227 return true;
6228 }
6229 return false;
6230 }) > 1 &&
6231 !allConstant(UserTE->getOperand(I)))
6232 return false;
6233 if (Gather)
6234 GatherOps.push_back(Gather);
6235 }
6236 return true;
6237}
6238
6239void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6240 SetVector<TreeEntry *> OrderedEntries;
6241 DenseSet<const TreeEntry *> GathersToOrders;
6242 // Find all reorderable leaf nodes with the given VF.
6243 // Currently these are vectorized loads, extracts without alternate
6244 // operands, plus some gathering of extracts.
6245 SmallVector<TreeEntry *> NonVectorized;
6246 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6247 if (TE->State != TreeEntry::Vectorize &&
6248 TE->State != TreeEntry::StridedVectorize)
6249 NonVectorized.push_back(TE.get());
6250 if (std::optional<OrdersType> CurrentOrder =
6251 getReorderingData(*TE, /*TopToBottom=*/false)) {
6252 OrderedEntries.insert(TE.get());
6253 if (!(TE->State == TreeEntry::Vectorize ||
6254 TE->State == TreeEntry::StridedVectorize) ||
6255 !TE->ReuseShuffleIndices.empty())
6256 GathersToOrders.insert(TE.get());
6257 }
6258 }
6259
6260 // 1. Propagate order to the graph nodes, which use only reordered nodes.
6261 // I.e., if the node has operands that are reordered, try to keep at least
6262 // one operand in the natural order, reorder the others, and reorder the
6263 // user node itself.
6264 SmallPtrSet<const TreeEntry *, 4> Visited;
6265 while (!OrderedEntries.empty()) {
6266 // 1. Filter out only reordered nodes.
6267 // 2. If the entry has multiple uses - skip it and jump to the next node.
6268 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
6269 SmallVector<TreeEntry *> Filtered;
6270 for (TreeEntry *TE : OrderedEntries) {
6271 if (!(TE->State == TreeEntry::Vectorize ||
6272 TE->State == TreeEntry::StridedVectorize ||
6273 (TE->isGather() && GathersToOrders.contains(TE))) ||
6274 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6275 !all_of(drop_begin(TE->UserTreeIndices),
6276 [TE](const EdgeInfo &EI) {
6277 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6278 }) ||
6279 !Visited.insert(TE).second) {
6280 Filtered.push_back(TE);
6281 continue;
6282 }
6283 // Build a map between user nodes and their operand order to speed up the
6284 // search. The graph currently does not provide this dependency directly.
6285 for (EdgeInfo &EI : TE->UserTreeIndices)
6286 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6287 }
6288 // Erase filtered entries.
6289 for (TreeEntry *TE : Filtered)
6290 OrderedEntries.remove(TE);
6291 SmallVector<
6292 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6293 UsersVec(Users.begin(), Users.end());
6294 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6295 return Data1.first->Idx > Data2.first->Idx;
6296 });
6297 for (auto &Data : UsersVec) {
6298 // Check that operands are used only in the User node.
6299 SmallVector<TreeEntry *> GatherOps;
6300 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6301 GatherOps)) {
6302 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6303 OrderedEntries.remove(Op.second);
6304 continue;
6305 }
6306 // All operands are reordered and used only in this node - propagate the
6307 // most used order to the user node.
6308 MapVector<OrdersType, unsigned,
6309 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6310 OrdersUses;
6311 // Do the analysis for each tree entry only once, otherwise the order of
6312 // the same node may be considered several times, though it might not be
6313 // profitable.
6314 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6315 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6316 for (const auto &Op : Data.second) {
6317 TreeEntry *OpTE = Op.second;
6318 if (!VisitedOps.insert(OpTE).second)
6319 continue;
6320 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6321 continue;
6322 const auto Order = [&]() -> const OrdersType {
6323 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6324 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6325 .value_or(OrdersType(1));
6326 return OpTE->ReorderIndices;
6327 }();
6328 // The order is partially ordered, skip it in favor of fully non-ordered
6329 // orders.
6330 if (Order.size() == 1)
6331 continue;
6332 unsigned NumOps = count_if(
6333 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6334 return P.second == OpTE;
6335 });
6336 // Stores actually store the mask, not the order, so we need to invert it.
6337 if (OpTE->State == TreeEntry::Vectorize &&
6338 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6339 assert(!OpTE->isAltShuffle() &&
6340 "Alternate instructions are only supported by BinaryOperator "
6341 "and CastInst.");
6342 SmallVector<int> Mask;
6343 inversePermutation(Order, Mask);
6344 unsigned E = Order.size();
6345 OrdersType CurrentOrder(E, E);
6346 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6347 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6348 });
6349 fixupOrderingIndices(CurrentOrder);
6350 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6351 NumOps;
6352 } else {
6353 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6354 }
6355 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6356 const auto AllowsReordering = [&](const TreeEntry *TE) {
6357 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6358 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6359 (IgnoreReorder && TE->Idx == 0))
6360 return true;
6361 if (TE->isGather()) {
6362 if (GathersToOrders.contains(TE))
6363 return !getReorderingData(*TE, /*TopToBottom=*/false)
6364 .value_or(OrdersType(1))
6365 .empty();
6366 return true;
6367 }
6368 return false;
6369 };
6370 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6371 TreeEntry *UserTE = EI.UserTE;
6372 if (!VisitedUsers.insert(UserTE).second)
6373 continue;
6374 // May reorder user node if it requires reordering, has reused
6375 // scalars, is an alternate op vectorize node or its op nodes require
6376 // reordering.
6377 if (AllowsReordering(UserTE))
6378 continue;
6379 // Check if users allow reordering.
6380 // Currently look up just 1 level of operands to avoid increase of
6381 // the compile time.
6382 // It is profitable to reorder if definitely more operands allow
6383 // reordering than those with the natural order.
6384 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
6385 if (static_cast<unsigned>(count_if(
6386 Ops, [UserTE, &AllowsReordering](
6387 const std::pair<unsigned, TreeEntry *> &Op) {
6388 return AllowsReordering(Op.second) &&
6389 all_of(Op.second->UserTreeIndices,
6390 [UserTE](const EdgeInfo &EI) {
6391 return EI.UserTE == UserTE;
6392 });
6393 })) <= Ops.size() / 2)
6394 ++Res.first->second;
6395 }
6396 }
6397 if (OrdersUses.empty()) {
6398 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6399 OrderedEntries.remove(Op.second);
6400 continue;
6401 }
6402 // Choose the most used order.
6403 unsigned IdentityCnt = 0;
6404 unsigned VF = Data.second.front().second->getVectorFactor();
6405 OrdersType IdentityOrder(VF, VF);
6406 for (auto &Pair : OrdersUses) {
6407 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6408 IdentityCnt += Pair.second;
6409 combineOrders(IdentityOrder, Pair.first);
6410 }
6411 }
6412 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6413 unsigned Cnt = IdentityCnt;
6414 for (auto &Pair : OrdersUses) {
6415 // Prefer the identity order. But if a filled identity (non-empty
6416 // order) was found with the same number of uses as the new candidate
6417 // order, we can choose this candidate order.
6418 if (Cnt < Pair.second) {
6419 combineOrders(Pair.first, BestOrder);
6420 BestOrder = Pair.first;
6421 Cnt = Pair.second;
6422 } else {
6423 combineOrders(BestOrder, Pair.first);
6424 }
6425 }
6426 // Set order of the user node.
6427 if (isIdentityOrder(BestOrder)) {
6428 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6429 OrderedEntries.remove(Op.second);
6430 continue;
6431 }
6432 fixupOrderingIndices(BestOrder);
6433 // Erase operands from OrderedEntries list and adjust their orders.
6434 VisitedOps.clear();
6435 SmallVector<int> Mask;
6436 inversePermutation(BestOrder, Mask);
6437 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6438 unsigned E = BestOrder.size();
6439 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6440 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6441 });
6442 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6443 TreeEntry *TE = Op.second;
6444 OrderedEntries.remove(TE);
6445 if (!VisitedOps.insert(TE).second)
6446 continue;
6447 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6448 reorderNodeWithReuses(*TE, Mask);
6449 continue;
6450 }
6451 // Gathers are processed separately.
6452 if (TE->State != TreeEntry::Vectorize &&
6453 TE->State != TreeEntry::StridedVectorize &&
6454 (TE->State != TreeEntry::ScatterVectorize ||
6455 TE->ReorderIndices.empty()))
6456 continue;
6457 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6458 TE->ReorderIndices.empty()) &&
6459 "Non-matching sizes of user/operand entries.");
6460 reorderOrder(TE->ReorderIndices, Mask);
6461 if (IgnoreReorder && TE == VectorizableTree.front().get())
6462 IgnoreReorder = false;
6463 }
6464 // For gathers we just need to reorder their scalars.
6465 for (TreeEntry *Gather : GatherOps) {
6466 assert(Gather->ReorderIndices.empty() &&
6467 "Unexpected reordering of gathers.");
6468 if (!Gather->ReuseShuffleIndices.empty()) {
6469 // Just reorder reuses indices.
6470 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6471 continue;
6472 }
6473 reorderScalars(Gather->Scalars, Mask);
6474 OrderedEntries.remove(Gather);
6475 }
6476 // Reorder operands of the user node and set the ordering for the user
6477 // node itself.
6478 if (Data.first->State != TreeEntry::Vectorize ||
6479 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6480 Data.first->getMainOp()) ||
6481 Data.first->isAltShuffle())
6482 Data.first->reorderOperands(Mask);
6483 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6484 Data.first->isAltShuffle() ||
6485 Data.first->State == TreeEntry::StridedVectorize) {
6486 reorderScalars(Data.first->Scalars, Mask);
6487 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6488 /*BottomOrder=*/true);
6489 if (Data.first->ReuseShuffleIndices.empty() &&
6490 !Data.first->ReorderIndices.empty() &&
6491 !Data.first->isAltShuffle()) {
6492 // Insert user node to the list to try to sink reordering deeper in
6493 // the graph.
6494 OrderedEntries.insert(Data.first);
6495 }
6496 } else {
6497 reorderOrder(Data.first->ReorderIndices, Mask);
6498 }
6499 }
6500 }
6501 // If the reordering is unnecessary, just remove the reorder.
6502 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6503 VectorizableTree.front()->ReuseShuffleIndices.empty())
6504 VectorizableTree.front()->ReorderIndices.clear();
6505}
6506
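// --- Editorial illustration (not part of the original source) ---------------
// Both reorderTopToBottom() and reorderBottomToTop() pick the "best" order by
// voting: every candidate order is counted and the most frequent one wins,
// with empty/identity orders pooled together. A simplified standalone model of
// that vote; demoPickMostUsedOrder is hypothetical, and the real code keys the
// counts with OrdersTypeDenseMapInfo instead of a linear scan.
static SmallVector<unsigned>
demoPickMostUsedOrder(ArrayRef<SmallVector<unsigned>> Candidates) {
  SmallVector<std::pair<SmallVector<unsigned>, unsigned>> Uses;
  for (const SmallVector<unsigned> &Order : Candidates) {
    auto *It = find_if(Uses, [&](const auto &P) { return P.first == Order; });
    if (It == Uses.end())
      Uses.emplace_back(Order, 1);
    else
      ++It->second;
  }
  const SmallVector<unsigned> *Best = nullptr;
  unsigned BestCnt = 0;
  for (const auto &P : Uses) {
    if (P.second > BestCnt) {
      Best = &P.first;
      BestCnt = P.second;
    }
  }
  // An empty result plays the role of the identity order.
  return Best ? *Best : SmallVector<unsigned>();
}
// -----------------------------------------------------------------------------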
6507Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6508 if ((Entry.getOpcode() == Instruction::Store ||
6509 Entry.getOpcode() == Instruction::Load) &&
6510 Entry.State == TreeEntry::StridedVectorize &&
6511 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6512 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6513 return dyn_cast<Instruction>(Entry.Scalars.front());
6514}
6515
6516 void BoUpSLP::buildExternalUses(
6517 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6518 DenseMap<Value *, unsigned> ScalarToExtUses;
6519 // Collect the values that we need to extract from the tree.
6520 for (auto &TEPtr : VectorizableTree) {
6521 TreeEntry *Entry = TEPtr.get();
6522
6523 // No need to handle users of gathered values.
6524 if (Entry->isGather())
6525 continue;
6526
6527 // For each lane:
6528 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6529 Value *Scalar = Entry->Scalars[Lane];
6530 if (!isa<Instruction>(Scalar))
6531 continue;
6532 // All uses must be replaced already? No need to do it again.
6533 auto It = ScalarToExtUses.find(Scalar);
6534 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6535 continue;
6536
6537 // Check if the scalar is externally used as an extra arg.
6538 const auto ExtI = ExternallyUsedValues.find(Scalar);
6539 if (ExtI != ExternallyUsedValues.end()) {
6540 int FoundLane = Entry->findLaneForValue(Scalar);
6541 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6542 << FoundLane << " from " << *Scalar << ".\n");
6543 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6544 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6545 continue;
6546 }
6547 for (User *U : Scalar->users()) {
6548 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6549
6550 Instruction *UserInst = dyn_cast<Instruction>(U);
6551 if (!UserInst || isDeleted(UserInst))
6552 continue;
6553
6554 // Ignore users in the user ignore list.
6555 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6556 continue;
6557
6558 // Skip in-tree scalars that become vectors
6559 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6560 // Some in-tree scalars will remain as scalar in vectorized
6561 // instructions. If that is the case, the one in FoundLane will
6562 // be used.
6563 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6564 !doesInTreeUserNeedToExtract(
6565 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6566 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6567 << ".\n");
6568 assert(!UseEntry->isGather() && "Bad state");
6569 continue;
6570 }
6571 U = nullptr;
6572 if (It != ScalarToExtUses.end()) {
6573 ExternalUses[It->second].User = nullptr;
6574 break;
6575 }
6576 }
6577
6578 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6579 U = nullptr;
6580 int FoundLane = Entry->findLaneForValue(Scalar);
6581 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6582 << " from lane " << FoundLane << " from " << *Scalar
6583 << ".\n");
6584 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6585 ExternalUses.emplace_back(Scalar, U, FoundLane);
6586 if (!U)
6587 break;
6588 }
6589 }
6590 }
6591}
6592
6593 SmallVector<SmallVector<StoreInst *>>
6594 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6595 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6596 SmallVector<StoreInst *>, 8>
6597 PtrToStoresMap;
6598 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6599 Value *V = TE->Scalars[Lane];
6600 // Don't iterate over the users of constant data.
6601 if (!isa<Instruction>(V))
6602 continue;
6603 // To save compilation time we don't visit if we have too many users.
6604 if (V->hasNUsesOrMore(UsesLimit))
6605 break;
6606
6607 // Collect stores per pointer object.
6608 for (User *U : V->users()) {
6609 auto *SI = dyn_cast<StoreInst>(U);
6610 // Test whether we can handle the store. V might be a global, which could
6611 // be used in a different function.
6612 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6613 !isValidElementType(SI->getValueOperand()->getType()))
6614 continue;
6615 // Skip the user if it already has a tree entry.
6616 if (getTreeEntry(U))
6617 continue;
6618
6619 Value *Ptr =
6620 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6621 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6622 SI->getValueOperand()->getType(), Ptr}];
6623 // For now just keep one store per pointer object per lane.
6624 // TODO: Extend this to support multiple stores per pointer per lane
6625 if (StoresVec.size() > Lane)
6626 continue;
6627 if (!StoresVec.empty()) {
6628 std::optional<int> Diff = getPointersDiff(
6629 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6630 SI->getValueOperand()->getType(),
6631 StoresVec.front()->getPointerOperand(), *DL, *SE,
6632 /*StrictCheck=*/true);
6633 // We failed to compare the pointers so just abandon this store.
6634 if (!Diff)
6635 continue;
6636 }
6637 StoresVec.push_back(SI);
6638 }
6639 }
6640 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6641 unsigned I = 0;
6642 for (auto &P : PtrToStoresMap) {
6643 Res[I].swap(P.second);
6644 ++I;
6645 }
6646 return Res;
6647}
6648
6649bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6650 OrdersType &ReorderIndices) const {
6651 // We check whether the stores in StoresVec can form a vector by sorting them
6652 // and checking whether they are consecutive.
6653
6654 // To avoid calling getPointersDiff() while sorting we create a vector of
6655 // pairs {store, offset from first} and sort this instead.
6656 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6657 StoreInst *S0 = StoresVec[0];
6658 StoreOffsetVec.emplace_back(0, 0);
6659 Type *S0Ty = S0->getValueOperand()->getType();
6660 Value *S0Ptr = S0->getPointerOperand();
6661 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6662 StoreInst *SI = StoresVec[Idx];
6663 std::optional<int> Diff =
6664 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6665 SI->getPointerOperand(), *DL, *SE,
6666 /*StrictCheck=*/true);
6667 StoreOffsetVec.emplace_back(*Diff, Idx);
6668 }
6669
6670 // Check if the stores are consecutive by checking if their difference is 1.
6671 if (StoreOffsetVec.size() != StoresVec.size())
6672 return false;
6673 sort(StoreOffsetVec,
6674 [](const std::pair<int, unsigned> &L,
6675 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6676 unsigned Idx = 0;
6677 int PrevDist = 0;
6678 for (const auto &P : StoreOffsetVec) {
6679 if (Idx > 0 && P.first != PrevDist + 1)
6680 return false;
6681 PrevDist = P.first;
6682 ++Idx;
6683 }
6684
6685 // Calculate the shuffle indices according to their offset against the sorted
6686 // StoreOffsetVec.
6687 ReorderIndices.assign(StoresVec.size(), 0);
6688 bool IsIdentity = true;
6689 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6690 ReorderIndices[P.second] = I;
6691 IsIdentity &= P.second == I;
6692 }
6693 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6694 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6695 // same convention here.
6696 if (IsIdentity)
6697 ReorderIndices.clear();
6698
6699 return true;
6700}
6701
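// --- Editorial illustration (not part of the original source) ---------------
// canFormVector() accepts a group of stores only if their pointer offsets,
// once sorted, form the consecutive run 0, 1, ..., N-1, and then records in
// ReorderIndices where each original store landed in that sorted run. The same
// logic over plain integer offsets; demoStoreOrder is hypothetical and stands
// in for the getPointersDiff()-based computation above.
static bool demoStoreOrder(ArrayRef<int> Offsets,
                           SmallVectorImpl<unsigned> &ReorderIndices) {
  // Pair each offset with its original position, then sort by offset.
  SmallVector<std::pair<int, unsigned>> Sorted;
  for (unsigned I = 0, E = Offsets.size(); I < E; ++I)
    Sorted.emplace_back(Offsets[I], I);
  sort(Sorted, [](const auto &L, const auto &R) { return L.first < R.first; });
  for (unsigned I = 1, E = Sorted.size(); I < E; ++I)
    if (Sorted[I].first != Sorted[I - 1].first + 1)
      return false; // Not consecutive - cannot form a vector store.
  ReorderIndices.assign(Offsets.size(), 0);
  for (unsigned I = 0, E = Sorted.size(); I < E; ++I)
    ReorderIndices[Sorted[I].second] = I;
  return true;
}
// E.g. Offsets = {1, 3, 0, 2} produces ReorderIndices = {1, 3, 0, 2}, while an
// already consecutive input such as {0, 1, 2, 3} yields the identity, which
// the real code encodes as an empty OrdersType.
// -----------------------------------------------------------------------------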
6702#ifndef NDEBUG
6704 for (unsigned Idx : Order)
6705 dbgs() << Idx << ", ";
6706 dbgs() << "\n";
6707}
6708#endif
6709
6710 SmallVector<BoUpSLP::OrdersType, 1>
6711 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6712 unsigned NumLanes = TE->Scalars.size();
6713
6714 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6715
6716 // Holds the reorder indices for each candidate store vector that is a user of
6717 // the current TreeEntry.
6718 SmallVector<OrdersType, 1> ExternalReorderIndices;
6719
6720 // Now inspect the stores collected per pointer and look for vectorization
6721 // candidates. For each candidate calculate the reorder index vector and push
6722 // it into `ExternalReorderIndices`
6723 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6724 // If we have fewer than NumLanes stores, then we can't form a vector.
6725 if (StoresVec.size() != NumLanes)
6726 continue;
6727
6728 // If the stores are not consecutive then abandon this StoresVec.
6729 OrdersType ReorderIndices;
6730 if (!canFormVector(StoresVec, ReorderIndices))
6731 continue;
6732
6733 // We now know that the scalars in StoresVec can form a vector instruction,
6734 // so set the reorder indices.
6735 ExternalReorderIndices.push_back(ReorderIndices);
6736 }
6737 return ExternalReorderIndices;
6738}
6739
6740 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6741 const SmallDenseSet<Value *> &UserIgnoreLst) {
6742 deleteTree();
6743 UserIgnoreList = &UserIgnoreLst;
6744 if (!allSameType(Roots))
6745 return;
6746 buildTree_rec(Roots, 0, EdgeInfo());
6747}
6748
6749 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6750 deleteTree();
6751 if (!allSameType(Roots))
6752 return;
6753 buildTree_rec(Roots, 0, EdgeInfo());
6754}
6755
6756 /// Tries to find a subvector of loads and builds a new vector of only loads
6757 /// if it can be profitable.
6758 static void gatherPossiblyVectorizableLoads(
6759 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6760 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6761 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6762 bool AddNew = true) {
6763 if (VL.empty())
6764 return;
6765 Type *ScalarTy = getValueType(VL.front());
6766 if (!isValidElementType(ScalarTy))
6767 return;
6768 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6769 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6770 for (Value *V : VL) {
6771 auto *LI = dyn_cast<LoadInst>(V);
6772 if (!LI)
6773 continue;
6774 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6775 continue;
6776 bool IsFound = false;
6777 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6778 assert(LI->getParent() == Data.front().first->getParent() &&
6779 LI->getType() == Data.front().first->getType() &&
6780 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6781 getUnderlyingObject(Data.front().first->getPointerOperand(),
6783 "Expected loads with the same type, same parent and same "
6784 "underlying pointer.");
6785 std::optional<int> Dist = getPointersDiff(
6786 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6787 Data.front().first->getPointerOperand(), DL, SE,
6788 /*StrictCheck=*/true);
6789 if (!Dist)
6790 continue;
6791 auto It = Map.find(*Dist);
6792 if (It != Map.end() && It->second != LI)
6793 continue;
6794 if (It == Map.end()) {
6795 Data.emplace_back(LI, *Dist);
6796 Map.try_emplace(*Dist, LI);
6797 }
6798 IsFound = true;
6799 break;
6800 }
6801 if (!IsFound) {
6802 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6803 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6804 }
6805 }
6807 auto FindMatchingLoads =
6808 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6809 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6810 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6811 int &Offset, unsigned &Start) {
6812 if (Loads.empty())
6813 return GatheredLoads.end();
6815 LoadInst *LI = Loads.front().first;
6816 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6817 if (Idx < Start)
6818 continue;
6819 ToAdd.clear();
6820 if (LI->getParent() != Data.front().first->getParent() ||
6821 LI->getType() != Data.front().first->getType())
6822 continue;
6823 std::optional<int> Dist =
6824 getPointersDiff(LI->getType(), LI->getPointerOperand(),
6825 Data.front().first->getType(),
6826 Data.front().first->getPointerOperand(), DL, SE,
6827 /*StrictCheck=*/true);
6828 if (!Dist)
6829 continue;
6830 SmallSet<int, 4> DataDists;
6831 SmallPtrSet<LoadInst *, 4> DataLoads;
6832 for (std::pair<LoadInst *, int> P : Data) {
6833 DataDists.insert(P.second);
6834 DataLoads.insert(P.first);
6835 }
6836 // Found matching gathered loads - check if all loads are unique or
6837 // can be effectively vectorized.
6838 unsigned NumUniques = 0;
6839 for (auto [Cnt, Pair] : enumerate(Loads)) {
6840 bool Used = DataLoads.contains(Pair.first);
6841 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6842 ++NumUniques;
6843 ToAdd.insert(Cnt);
6844 } else if (Used) {
6845 Repeated.insert(Cnt);
6846 }
6847 }
6848 if (NumUniques > 0 &&
6849 (Loads.size() == NumUniques ||
6850 (Loads.size() - NumUniques >= 2 &&
6851 Loads.size() - NumUniques >= Loads.size() / 2 &&
6852 (has_single_bit(Data.size() + NumUniques) ||
6853 bit_ceil(Data.size()) <
6854 bit_ceil(Data.size() + NumUniques))))) {
6855 Offset = *Dist;
6856 Start = Idx + 1;
6857 return std::next(GatheredLoads.begin(), Idx);
6858 }
6859 }
6860 ToAdd.clear();
6861 return GatheredLoads.end();
6862 };
6863 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6864 unsigned Start = 0;
6865 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6866 int Offset = 0;
6867 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6868 Offset, Start);
6869 while (It != GatheredLoads.end()) {
6870 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6871 for (unsigned Idx : LocalToAdd)
6872 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6873 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6874 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6875 Start);
6876 }
6877 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6878 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6879 })) {
6880 auto AddNewLoads =
6881 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6882 for (unsigned Idx : seq<unsigned>(Data.size())) {
6883 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6884 continue;
6885 Loads.push_back(Data[Idx]);
6886 }
6887 };
6888 if (!AddNew) {
6889 LoadInst *LI = Data.front().first;
6890 It = find_if(
6891 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6892 return PD.front().first->getParent() == LI->getParent() &&
6893 PD.front().first->getType() == LI->getType();
6894 });
6895 while (It != GatheredLoads.end()) {
6896 AddNewLoads(*It);
6897 It = std::find_if(
6898 std::next(It), GatheredLoads.end(),
6899 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6900 return PD.front().first->getParent() == LI->getParent() &&
6901 PD.front().first->getType() == LI->getType();
6902 });
6903 }
6904 }
6905 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6906 AddNewLoads(GatheredLoads.emplace_back());
6907 }
6908 }
6909}
6910
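// --- Editorial illustration (not part of the original source) ---------------
// gatherPossiblyVectorizableLoads() above buckets loads by their underlying
// object and, inside a bucket, keys every load by its distance from the
// bucket's first load so that duplicates at the same distance are dropped. A
// simplified standalone model over {base, offset} pairs; DemoLoad and
// demoClusterLoads are hypothetical, and the real code derives distances with
// getPointersDiff() instead of plain integer arithmetic.
struct DemoLoad {
  int Base;   // stand-in for the underlying object of the pointer
  int Offset; // stand-in for the address within that object
};
static SmallVector<SmallVector<std::pair<DemoLoad, int>>>
demoClusterLoads(ArrayRef<DemoLoad> Loads) {
  SmallVector<SmallVector<std::pair<DemoLoad, int>>> Clusters; // {load, dist}
  for (const DemoLoad &L : Loads) {
    bool Found = false;
    for (SmallVector<std::pair<DemoLoad, int>> &Cluster : Clusters) {
      if (Cluster.front().first.Base != L.Base)
        continue; // Different underlying object - try the next cluster.
      int Dist = L.Offset - Cluster.front().first.Offset;
      // Keep at most one load per distance within a cluster.
      if (none_of(Cluster, [&](const std::pair<DemoLoad, int> &P) {
            return P.second == Dist;
          }))
        Cluster.emplace_back(L, Dist);
      Found = true;
      break;
    }
    if (!Found)
      Clusters.emplace_back().emplace_back(L, 0);
  }
  return Clusters;
}
// -----------------------------------------------------------------------------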
6911void BoUpSLP::tryToVectorizeGatheredLoads(
6912 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6913 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6914 8> &GatheredLoads) {
6915 GatheredLoadsEntriesFirst = VectorizableTree.size();
6916
6917 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6918 LoadEntriesToVectorize.size());
6919 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6920 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6921 VectorizableTree[Idx]->Scalars.end());
6922
6923 // Sort loads by distance.
6924 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6925 const std::pair<LoadInst *, int> &L2) {
6926 return L1.second > L2.second;
6927 };
6928
6929 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
6930 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6931 Loads.size());
6932 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6933 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6934 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6935 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6936 };
6937
6938 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6939 BoUpSLP::ValueSet &VectorizedLoads,
6940 SmallVectorImpl<LoadInst *> &NonVectorized,
6941 bool Final, unsigned MaxVF) {
6942 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
6943 unsigned StartIdx = 0;
6944 SmallVector<int> CandidateVFs;
6945 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6946 CandidateVFs.push_back(MaxVF);
6947 for (int NumElts = getFloorFullVectorNumberOfElements(
6948 *TTI, Loads.front()->getType(), MaxVF);
6949 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6950 *TTI, Loads.front()->getType(), NumElts - 1)) {
6951 CandidateVFs.push_back(NumElts);
6952 if (VectorizeNonPowerOf2 && NumElts > 2)
6953 CandidateVFs.push_back(NumElts - 1);
6954 }
6955
6956 if (Final && CandidateVFs.empty())
6957 return Results;
6958
6959 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6960 for (unsigned NumElts : CandidateVFs) {
6961 if (Final && NumElts > BestVF)
6962 continue;
6963 SmallVector<unsigned> MaskedGatherVectorized;
6964 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6965 ++Cnt) {
6966 ArrayRef<LoadInst *> Slice =
6967 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6968 if (VectorizedLoads.count(Slice.front()) ||
6969 VectorizedLoads.count(Slice.back()) ||
6971 continue;
6972 // Check if it is profitable to try vectorizing gathered loads. It is
6973 // profitable if we have more than 3 consecutive loads or if we have
6974 // fewer, but all users are vectorized or deleted.
6975 bool AllowToVectorize = false;
6977 // Check if it is profitable to vectorize 2-element loads.
6977 if (NumElts == 2) {
6978 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6979 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6980 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6981 for (LoadInst *LI : Slice) {
6982 // If single use/user - allow to vectorize.
6983 if (LI->hasOneUse())
6984 continue;
6985 // 1. Check if number of uses equals number of users.
6986 // 2. All users are deleted.
6987 // 3. The load broadcasts are not allowed or the load is not
6988 // broadcasted.
6989 if (static_cast<unsigned int>(std::distance(
6990 LI->user_begin(), LI->user_end())) != LI->getNumUses())
6991 return false;
6992 if (!IsLegalBroadcastLoad)
6993 continue;
6994 if (LI->hasNUsesOrMore(UsesLimit))
6995 return false;
6996 for (User *U : LI->users()) {
6997 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
6998 continue;
6999 if (const TreeEntry *UTE = getTreeEntry(U)) {
7000 for (int I : seq<int>(UTE->getNumOperands())) {
7001 if (all_of(UTE->getOperand(I),
7002 [LI](Value *V) { return V == LI; }))
7003 // Found legal broadcast - do not vectorize.
7004 return false;
7005 }
7006 }
7007 }
7008 }
7009 return true;
7010 };
7011 AllowToVectorize = CheckIfAllowed(Slice);
7012 } else {
7013 AllowToVectorize =
7014 (NumElts >= 3 ||
7015 any_of(ValueToGatherNodes.at(Slice.front()),
7016 [=](const TreeEntry *TE) {
7017 return TE->Scalars.size() == 2 &&
7018 ((TE->Scalars.front() == Slice.front() &&
7019 TE->Scalars.back() == Slice.back()) ||
7020 (TE->Scalars.front() == Slice.back() &&
7021 TE->Scalars.back() == Slice.front()));
7022 })) &&
7023 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7024 Slice.size());
7025 }
7026 if (AllowToVectorize) {
7027 SmallVector<Value *> PointerOps;
7028 OrdersType CurrentOrder;
7029 // Try to build vector load.
7030 ArrayRef<Value *> Values(
7031 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7032 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7033 PointerOps, &BestVF);
7034 if (LS != LoadsState::Gather ||
7035 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7036 if (LS == LoadsState::ScatterVectorize) {
7037 if (MaskedGatherVectorized.empty() ||
7038 Cnt >= MaskedGatherVectorized.back() + NumElts)
7039 MaskedGatherVectorized.push_back(Cnt);
7040 continue;
7041 }
7042 if (LS != LoadsState::Gather) {
7043 Results.emplace_back(Values, LS);
7044 VectorizedLoads.insert(Slice.begin(), Slice.end());
7045 // If we vectorized initial block, no need to try to vectorize it
7046 // again.
7047 if (Cnt == StartIdx)
7048 StartIdx += NumElts;
7049 }
7050 // Check if the whole array was vectorized already - exit.
7051 if (StartIdx >= Loads.size())
7052 break;
7053 // Erase last masked gather candidate, if another candidate within
7054 // the range is found to be better.
7055 if (!MaskedGatherVectorized.empty() &&
7056 Cnt < MaskedGatherVectorized.back() + NumElts)
7057 MaskedGatherVectorized.pop_back();
7058 Cnt += NumElts - 1;
7059 continue;
7060 }
7061 }
7062 if (!AllowToVectorize || BestVF == 0)
7064 }
7065 // Mark masked gather candidates as vectorized, if any.
7066 for (unsigned Cnt : MaskedGatherVectorized) {
7067 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7068 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7069 ArrayRef<Value *> Values(
7070 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7071 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7072 VectorizedLoads.insert(Slice.begin(), Slice.end());
7073 // If we vectorized initial block, no need to try to vectorize it again.
7074 if (Cnt == StartIdx)
7075 StartIdx += NumElts;
7076 }
7077 }
7078 for (LoadInst *LI : Loads) {
7079 if (!VectorizedLoads.contains(LI))
7080 NonVectorized.push_back(LI);
7081 }
7082 return Results;
7083 };
7084 auto ProcessGatheredLoads =
7085 [&, &TTI = *TTI](
7086 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7087 bool Final = false) {
7088 SmallVector<LoadInst *> NonVectorized;
7089 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7090 if (LoadsDists.size() <= 1) {
7091 NonVectorized.push_back(LoadsDists.back().first);
7092 continue;
7093 }
7094 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7095 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7096 transform(LoadsDists, OriginalLoads.begin(),
7097 [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
7098 return L.first;
7099 });
7100 stable_sort(LocalLoadsDists, LoadSorter);
7101 SmallVector<LoadInst *> Loads;
7102 unsigned MaxConsecutiveDistance = 0;
7103 unsigned CurrentConsecutiveDist = 1;
7104 int LastDist = LocalLoadsDists.front().second;
7105 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7106 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7107 if (getTreeEntry(L.first))
7108 continue;
7109 assert(LastDist >= L.second &&
7110 "Expected first distance always not less than second");
7111 if (static_cast<unsigned>(LastDist - L.second) ==
7112 CurrentConsecutiveDist) {
7113 ++CurrentConsecutiveDist;
7114 MaxConsecutiveDistance =
7115 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7116 Loads.push_back(L.first);
7117 continue;
7118 }
7119 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7120 !Loads.empty())
7121 Loads.pop_back();
7122 CurrentConsecutiveDist = 1;
7123 LastDist = L.second;
7124 Loads.push_back(L.first);
7125 }
7126 if (Loads.size() <= 1)
7127 continue;
7128 if (AllowMaskedGather)
7129 MaxConsecutiveDistance = Loads.size();
7130 else if (MaxConsecutiveDistance < 2)
7131 continue;
7132 BoUpSLP::ValueSet VectorizedLoads;
7133 SmallVector<LoadInst *> SortedNonVectorized;
7134 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
7135 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7136 Final, MaxConsecutiveDistance);
7137 if (!Results.empty() && !SortedNonVectorized.empty() &&
7138 OriginalLoads.size() == Loads.size() &&
7139 MaxConsecutiveDistance == Loads.size() &&
7140 all_of(Results,
7141 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7142 return P.second == LoadsState::ScatterVectorize;
7143 })) {
7144 VectorizedLoads.clear();
7145 SmallVector<LoadInst *> UnsortedNonVectorized;
7146 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
7147 UnsortedResults =
7148 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7149 UnsortedNonVectorized, Final,
7150 OriginalLoads.size());
7151 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7152 SortedNonVectorized.swap(UnsortedNonVectorized);
7153 Results.swap(UnsortedResults);
7154 }
7155 }
7156 for (auto [Slice, _] : Results) {
7157 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7158 << Slice.size() << ")\n");
7159 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7160 for (Value *L : Slice)
7161 if (!getTreeEntry(L))
7162 SortedNonVectorized.push_back(cast<LoadInst>(L));
7163 continue;
7164 }
7165
7166 // Select the maximum VF as the maximum over the user gathered nodes and
7167 // the distance between scalar loads in these nodes.
7168 unsigned MaxVF = Slice.size();
7169 unsigned UserMaxVF = 0;
7170 unsigned InterleaveFactor = 0;
7171 if (MaxVF == 2) {
7172 UserMaxVF = MaxVF;
7173 } else {
7174 // Find the distance between segments of the interleaved loads.
7175 std::optional<unsigned> InterleavedLoadsDistance = 0;
7176 unsigned Order = 0;
7177 std::optional<unsigned> CommonVF = 0;
7178 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7179 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7180 for (auto [Idx, V] : enumerate(Slice)) {
7181 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7182 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7183 unsigned Pos =
7184 EntryToPosition.try_emplace(E, Idx).first->second;
7185 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7186 if (CommonVF) {
7187 if (*CommonVF == 0) {
7188 CommonVF = E->Scalars.size();
7189 continue;
7190 }
7191 if (*CommonVF != E->Scalars.size())
7192 CommonVF.reset();
7193 }
7194 // Check if the load is part of an interleaved load.
7195 if (Pos != Idx && InterleavedLoadsDistance) {
7196 if (!DeinterleavedNodes.contains(E) &&
7197 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7198 if (isa<Constant>(V))
7199 return false;
7200 if (getTreeEntry(V))
7201 return true;
7202 const auto &Nodes = ValueToGatherNodes.at(V);
7203 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7204 !is_contained(Slice, V);
7205 })) {
7206 InterleavedLoadsDistance.reset();
7207 continue;
7208 }
7209 DeinterleavedNodes.insert(E);
7210 if (*InterleavedLoadsDistance == 0) {
7211 InterleavedLoadsDistance = Idx - Pos;
7212 continue;
7213 }
7214 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7215 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7216 InterleavedLoadsDistance.reset();
7217 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7218 }
7219 }
7220 }
7221 DeinterleavedNodes.clear();
7222 // Check if the large load represents an interleaved load operation.
7223 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7224 CommonVF.value_or(0) != 0) {
7225 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7226 unsigned VF = *CommonVF;
7227 OrdersType Order;
7228 SmallVector<Value *> PointerOps;
7229 // Segmented load detected - vectorize at maximum vector factor.
7230 if (InterleaveFactor <= Slice.size() &&
7231 TTI.isLegalInterleavedAccessType(
7232 getWidenedType(Slice.front()->getType(), VF),
7233 InterleaveFactor,
7234 cast<LoadInst>(Slice.front())->getAlign(),
7235 cast<LoadInst>(Slice.front())
7236 ->getPointerAddressSpace()) &&
7237 canVectorizeLoads(Slice, Slice.front(), Order,
7238 PointerOps) == LoadsState::Vectorize) {
7239 UserMaxVF = InterleaveFactor * VF;
7240 } else {
7241 InterleaveFactor = 0;
7242 }
7243 }
7244 // Cannot represent the loads as consecutive vectorizable nodes -
7245 // just exit.
7246 unsigned ConsecutiveNodesSize = 0;
7247 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7248 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7249 [&, Slice = Slice](const auto &P) {
7250 const auto *It = find_if(Slice, [&](Value *V) {
7251 return std::get<1>(P).contains(V);
7252 });
7253 if (It == Slice.end())
7254 return false;
7255 ArrayRef<Value *> VL =
7256 VectorizableTree[std::get<0>(P)]->Scalars;
7257 ConsecutiveNodesSize += VL.size();
7258 unsigned Start = std::distance(Slice.begin(), It);
7259 unsigned Sz = Slice.size() - Start;
7260 return Sz < VL.size() ||
7261 Slice.slice(std::distance(Slice.begin(), It),
7262 VL.size()) != VL;
7263 }))
7264 continue;
7265 // Try to build long masked gather loads.
7266 UserMaxVF = bit_ceil(UserMaxVF);
7267 if (InterleaveFactor == 0 &&
7268 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7269 [&, Slice = Slice](unsigned Idx) {
7270 OrdersType Order;
7271 SmallVector<Value *> PointerOps;
7272 return canVectorizeLoads(
7273 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7274 Slice[Idx * UserMaxVF], Order,
7275 PointerOps) ==
7276 LoadsState::ScatterVectorize;
7277 }))
7278 UserMaxVF = MaxVF;
7279 if (Slice.size() != ConsecutiveNodesSize)
7280 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7281 }
7282 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7283 bool IsVectorized = true;
7284 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7285 ArrayRef<Value *> SubSlice =
7286 Slice.slice(I, std::min(VF, E - I));
7287 if (getTreeEntry(SubSlice.front()))
7288 continue;
7289 // Check if the subslice is a to-be-vectorized entry which is not
7290 // equal to the entry itself.
7291 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7292 [&](const auto &P) {
7293 return !SubSlice.equals(
7294 VectorizableTree[std::get<0>(P)]
7295 ->Scalars) &&
7296 set_is_subset(SubSlice, std::get<1>(P));
7297 }))
7298 continue;
7299 unsigned Sz = VectorizableTree.size();
7300 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7301 if (Sz == VectorizableTree.size()) {
7302 IsVectorized = false;
7303 // Try non-interleaved vectorization with smaller vector
7304 // factor.
7305 if (InterleaveFactor > 0) {
7306 VF = 2 * (MaxVF / InterleaveFactor);
7307 InterleaveFactor = 0;
7308 }
7309 continue;
7310 }
7311 }
7312 if (IsVectorized)
7313 break;
7314 }
7315 }
7316 NonVectorized.append(SortedNonVectorized);
7317 }
7318 return NonVectorized;
7319 };
7320 for (const auto &GLs : GatheredLoads) {
7321 const auto &Ref = GLs.second;
7322 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7323 if (!Ref.empty() && !NonVectorized.empty() &&
7324 std::accumulate(
7325 Ref.begin(), Ref.end(), 0u,
7326 [](unsigned S,
7327 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
7328 return S + LoadsDists.size();
7329 }) != NonVectorized.size() &&
7330 IsMaskedGatherSupported(NonVectorized)) {
7331 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7332 for (LoadInst *LI : NonVectorized) {
7333 // Reinsert non-vectorized loads into another list of loads with the
7334 // same base pointers.
7335 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7336 FinalGatheredLoads,
7337 /*AddNew=*/false);
7338 }
7339 // Final attempt to vectorize non-vectorized loads.
7340 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7341 }
7342 }
7343 // Try to vectorize postponed load entries, previously marked as gathered.
7344 for (unsigned Idx : LoadEntriesToVectorize) {
7345 const TreeEntry &E = *VectorizableTree[Idx];
7346 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7347 // Avoid reordering, if possible.
7348 if (!E.ReorderIndices.empty()) {
7349 // Build a mask out of the reorder indices and reorder scalars per this
7350 // mask.
7351 SmallVector<int> ReorderMask;
7352 inversePermutation(E.ReorderIndices, ReorderMask);
7353 reorderScalars(GatheredScalars, ReorderMask);
7354 }
7355 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7356 }
7357 // If no new entries were created, consider it as if no gathered load entries
7358 // need to be handled.
7359 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7360 VectorizableTree.size())
7361 GatheredLoadsEntriesFirst.reset();
7362}
7363
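// --- Editorial illustration (not part of the original source) ---------------
// The final loop of tryToVectorizeGatheredLoads() retries vectorization with
// halving vector factors: a candidate slice is first cut into MaxVF-wide
// chunks, and only if that fails is it re-cut with VF / 2, and so on down to
// 2. A distilled model of the chunking done for one VF; demoSplitByVF is
// hypothetical and works on plain ints instead of scalar loads.
static SmallVector<SmallVector<int>> demoSplitByVF(ArrayRef<int> Slice,
                                                   unsigned VF) {
  SmallVector<SmallVector<int>> Chunks;
  for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
    ArrayRef<int> Sub = Slice.slice(I, std::min<unsigned>(VF, E - I));
    Chunks.emplace_back(Sub.begin(), Sub.end());
  }
  return Chunks;
}
// E.g. an 8-element slice with MaxVF = 4 is first tried as two 4-wide chunks
// and, if no chunk produces a new tree entry, as four 2-wide chunks.
// -----------------------------------------------------------------------------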
7364/// \return true if the specified list of values has only one instruction that
7365/// requires scheduling, false otherwise.
7366#ifndef NDEBUG
7367 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7368 Value *NeedsScheduling = nullptr;
7369 for (Value *V : VL) {
7370 if (doesNotNeedToBeScheduled(V))
7371 continue;
7372 if (!NeedsScheduling) {
7373 NeedsScheduling = V;
7374 continue;
7375 }
7376 return false;
7377 }
7378 return NeedsScheduling;
7379}
7380#endif
7381
7382 /// Generates a key/subkey pair for the given value to provide effective
7383 /// sorting of the values and better detection of vectorizable value
7384 /// sequences. The keys/subkeys can be used for better sorting of the values
7385 /// themselves (keys) and within value subgroups (subkeys).
7386static std::pair<size_t, size_t> generateKeySubkey(
7387 Value *V, const TargetLibraryInfo *TLI,
7388 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7389 bool AllowAlternate) {
7390 hash_code Key = hash_value(V->getValueID() + 2);
7391 hash_code SubKey = hash_value(0);
7392 // Sort the loads by the distance between the pointers.
7393 if (auto *LI = dyn_cast<LoadInst>(V)) {
7394 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7395 if (LI->isSimple())
7396 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7397 else
7398 Key = SubKey = hash_value(LI);
7399 } else if (isVectorLikeInstWithConstOps(V)) {
7400 // Sort extracts by the vector operands.
7401 if (isa<ExtractElementInst, UndefValue>(V))
7402 Key = hash_value(Value::UndefValueVal + 1);
7403 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7404 if (!isUndefVector(EI->getVectorOperand()).all() &&
7405 !isa<UndefValue>(EI->getIndexOperand()))
7406 SubKey = hash_value(EI->getVectorOperand());
7407 }
7408 } else if (auto *I = dyn_cast<Instruction>(V)) {
7409 // Sort other instructions just by the opcodes except for CMPInst.
7410 // For CMP also sort by the predicate kind.
7411 if ((isa<BinaryOperator, CastInst>(I)) &&
7412 isValidForAlternation(I->getOpcode())) {
7413 if (AllowAlternate)
7414 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7415 else
7416 Key = hash_combine(hash_value(I->getOpcode()), Key);
7417 SubKey = hash_combine(
7418 hash_value(I->getOpcode()), hash_value(I->getType()),
7419 hash_value(isa<BinaryOperator>(I)
7420 ? I->getType()
7421 : cast<CastInst>(I)->getOperand(0)->getType()));
7422 // For casts, look through the only operand to improve compile time.
7423 if (isa<CastInst>(I)) {
7424 std::pair<size_t, size_t> OpVals =
7425 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7426 /*AllowAlternate=*/true);
7427 Key = hash_combine(OpVals.first, Key);
7428 SubKey = hash_combine(OpVals.first, SubKey);
7429 }
7430 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7431 CmpInst::Predicate Pred = CI->getPredicate();
7432 if (CI->isCommutative())
7433 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7434 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
7435 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7436 hash_value(SwapPred),
7437 hash_value(CI->getOperand(0)->getType()));
7438 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7439 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
7440 if (isTriviallyVectorizable(ID)) {
7441 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7442 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7443 SubKey = hash_combine(hash_value(I->getOpcode()),
7444 hash_value(Call->getCalledFunction()));
7445 } else {
7446 Key = hash_combine(hash_value(Call), Key);
7447 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7448 }
7449 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7450 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7451 hash_value(Op.Tag), SubKey);
7452 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7453 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7454 SubKey = hash_value(Gep->getPointerOperand());
7455 else
7456 SubKey = hash_value(Gep);
7457 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7458 !isa<ConstantInt>(I->getOperand(1))) {
7459 // Do not try to vectorize instructions with potentially high cost.
7460 SubKey = hash_value(I);
7461 } else {
7462 SubKey = hash_value(I->getOpcode());
7463 }
7464 Key = hash_combine(hash_value(I->getParent()), Key);
7465 }
7466 return std::make_pair(Key, SubKey);
7467}
7468
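// --- Editorial illustration (not part of the original source) ---------------
// generateKeySubkey() above produces a coarse key (used to split candidate
// values into groups) and a finer subkey (used to keep similar values next to
// each other inside a group). The same two-level idea for a toy value that is
// described only by an opcode and a type id, reusing the hash_value /
// hash_combine utilities this file already depends on; demoKeySubkey is
// hypothetical.
static std::pair<size_t, size_t> demoKeySubkey(unsigned Opcode,
                                               unsigned TypeId) {
  hash_code Key = hash_value(Opcode);                       // group by opcode
  hash_code SubKey = hash_combine(Key, hash_value(TypeId)); // refine by type
  return std::make_pair(static_cast<size_t>(Key), static_cast<size_t>(SubKey));
}
// Two values with equal keys are candidates for the same vector node; within
// that group the subkey decides which values are sorted next to each other.
// -----------------------------------------------------------------------------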
7469/// Checks if the specified instruction \p I is an alternate operation for
7470/// the given \p MainOp and \p AltOp instructions.
7471static bool isAlternateInstruction(const Instruction *I,
7472 const Instruction *MainOp,
7473 const Instruction *AltOp,
7474 const TargetLibraryInfo &TLI);
7475
7476bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7477 ArrayRef<Value *> VL) const {
7478 unsigned Opcode0 = S.getOpcode();
7479 unsigned Opcode1 = S.getAltOpcode();
7480 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7481 // If this pattern is supported by the target then consider it profitable.
7482 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7483 Opcode0, Opcode1, OpcodeMask))
7484 return true;
7485 SmallVector<SmallVector<Value *>> Operands;
7486 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7487 Operands.emplace_back();
7488 // Prepare the operand vector.
7489 for (Value *V : VL) {
7490 if (isa<PoisonValue>(V)) {
7491 Operands.back().push_back(
7492 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7493 continue;
7494 }
7495 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7496 }
7497 }
7498 if (Operands.size() == 2) {
7499 // Try to find the best operand candidates.
7500 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7501 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7502 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7503 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7504 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7505 std::optional<int> Res = findBestRootPair(Candidates);
7506 switch (Res.value_or(0)) {
7507 case 0:
7508 break;
7509 case 1:
7510 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7511 break;
7512 case 2:
7513 std::swap(Operands[0][I], Operands[1][I]);
7514 break;
7515 default:
7516 llvm_unreachable("Unexpected index.");
7517 }
7518 }
7519 }
7520 DenseSet<unsigned> UniqueOpcodes;
7521 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7522 unsigned NonInstCnt = 0;
7523 // Estimate the number of instructions required for the vectorized node and
7524 // for the buildvector node.
7525 unsigned UndefCnt = 0;
7526 // Count the number of extra shuffles, required for vector nodes.
7527 unsigned ExtraShuffleInsts = 0;
7528 // Check that operands do not contain the same values, and create either a
7529 // perfect diamond match or a shuffled match.
7530 if (Operands.size() == 2) {
7531 // Do not count same operands twice.
7532 if (Operands.front() == Operands.back()) {
7533 Operands.erase(Operands.begin());
7534 } else if (!allConstant(Operands.front()) &&
7535 all_of(Operands.front(), [&](Value *V) {
7536 return is_contained(Operands.back(), V);
7537 })) {
7538 Operands.erase(Operands.begin());
7539 ++ExtraShuffleInsts;
7540 }
7541 }
7542 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7543 // Vectorize the node if:
7544 // 1. At least a single operand is constant or a splat.
7545 // 2. Operands have many loop invariants (the instructions are not loop
7546 // invariants).
7547 // 3. At least a single unique operand is supposed to be vectorized.
7548 return none_of(Operands,
7549 [&](ArrayRef<Value *> Op) {
7550 if (allConstant(Op) ||
7551 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7552 getSameOpcode(Op, *TLI)))
7553 return false;
7554 SmallDenseMap<Value *, unsigned, 16> Uniques;
7555 for (Value *V : Op) {
7556 if (isa<Constant, ExtractElementInst>(V) ||
7557 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7558 if (isa<UndefValue>(V))
7559 ++UndefCnt;
7560 continue;
7561 }
7562 auto Res = Uniques.try_emplace(V, 0);
7563 // Found first duplicate - need to add shuffle.
7564 if (!Res.second && Res.first->second == 1)
7565 ++ExtraShuffleInsts;
7566 ++Res.first->getSecond();
7567 if (auto *I = dyn_cast<Instruction>(V))
7568 UniqueOpcodes.insert(I->getOpcode());
7569 else if (Res.second)
7570 ++NonInstCnt;
7571 }
7572 return none_of(Uniques, [&](const auto &P) {
7573 return P.first->hasNUsesOrMore(P.second + 1) &&
7574 none_of(P.first->users(), [&](User *U) {
7575 return getTreeEntry(U) || Uniques.contains(U);
7576 });
7577 });
7578 }) ||
7579 // Do not vectorize the node if the estimated number of vector instructions
7580 // is greater than the estimated number of buildvector instructions. The
7581 // vector estimate is the number of vector instructions for the node plus
7582 // the vector instructions needed for its operands (buildvectors). The
7583 // buildvector estimate is simply number_of_operands * number_of_scalars.
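// As a rough illustration of this estimate: for a hypothetical 4-element
// add/sub bundle with two operand lists, the buildvector side is
// 2 * 4 = 8 insert instructions, while the vector side needs
// UniqueOpcodes + NonInstCnt + ExtraShuffleInsts + 3 (main + alt + shuffle)
// instructions; with, say, two unique opcodes and no extra shuffles that is
// 5 < 8, so this branch treats the alternate node as profitable.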
7584 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7585 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7586 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7587}
7588
7589BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7590 const InstructionsState &S, ArrayRef<Value *> VL,
7591 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7592 SmallVectorImpl<Value *> &PointerOps) {
7593 assert(S.getMainOp() &&
7594 "Expected instructions with same/alternate opcodes only.");
7595
7596 unsigned ShuffleOrOp =
7597 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7598 Instruction *VL0 = S.getMainOp();
7599 switch (ShuffleOrOp) {
7600 case Instruction::PHI: {
7601 // Too many operands - gather, most probably won't be vectorized.
7602 if (VL0->getNumOperands() > MaxPHINumOperands)
7603 return TreeEntry::NeedToGather;
7604 // Check for terminator values (e.g. invoke).
7605 for (Value *V : VL) {
7606 auto *PHI = dyn_cast<PHINode>(V);
7607 if (!PHI)
7608 continue;
7609 for (Value *Incoming : PHI->incoming_values()) {
7610 Instruction *Term = dyn_cast<Instruction>(Incoming);
7611 if (Term && Term->isTerminator()) {
7612 LLVM_DEBUG(dbgs()
7613 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7614 return TreeEntry::NeedToGather;
7615 }
7616 }
7617 }
7618
7619 return TreeEntry::Vectorize;
7620 }
7621 case Instruction::ExtractValue:
7622 case Instruction::ExtractElement: {
7623 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7624 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7625 if (!has_single_bit(VL.size()))
7626 return TreeEntry::NeedToGather;
7627 if (Reuse || !CurrentOrder.empty())
7628 return TreeEntry::Vectorize;
7629 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7630 return TreeEntry::NeedToGather;
7631 }
7632 case Instruction::InsertElement: {
7633 // Check that we have a buildvector and not a shuffle of 2 or more
7634 // different vectors.
7635 ValueSet SourceVectors;
7636 for (Value *V : VL) {
7637 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7638 assert(getElementIndex(V) != std::nullopt &&
7639 "Non-constant or undef index?");
7640 }
7641
7642 if (count_if(VL, [&SourceVectors](Value *V) {
7643 return !SourceVectors.contains(V);
7644 }) >= 2) {
7645 // Found 2nd source vector - cancel.
7646 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7647 "different source vectors.\n");
7648 return TreeEntry::NeedToGather;
7649 }
7650
7651 if (any_of(VL, [&SourceVectors](Value *V) {
7652 // The last InsertElement can have multiple uses.
7653 return SourceVectors.contains(V) && !V->hasOneUse();
7654 })) {
7655 assert(SLPReVec && "Only supported by REVEC.");
7656 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7657 "multiple uses.\n");
7658 return TreeEntry::NeedToGather;
7659 }
7660
7661 return TreeEntry::Vectorize;
7662 }
7663 case Instruction::Load: {
7664 // Check that a vectorized load would load the same memory as a scalar
7665 // load. For example, we don't want to vectorize loads that are smaller
7666 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7667 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7668 // from such a struct, we read/write packed bits disagreeing with the
7669 // unvectorized version.
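// As an illustration: with a typical data layout an i2 has a type size of
// 2 bits but an alloc size of 8 bits, so the scalar accesses and a would-be
// <4 x i2> access cover different bytes; such bundles are gathered instead.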
7670 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7671 case LoadsState::Vectorize:
7672 return TreeEntry::Vectorize;
7673 case LoadsState::ScatterVectorize:
7674 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7675 // Delay slow vectorized nodes for better vectorization attempts.
7676 LoadEntriesToVectorize.insert(VectorizableTree.size());
7677 return TreeEntry::NeedToGather;
7678 }
7679 return TreeEntry::ScatterVectorize;
7680 case LoadsState::StridedVectorize:
7681 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7682 // Delay slow vectorized nodes for better vectorization attempts.
7683 LoadEntriesToVectorize.insert(VectorizableTree.size());
7684 return TreeEntry::NeedToGather;
7685 }
7686 return TreeEntry::StridedVectorize;
7687 case LoadsState::Gather:
7688#ifndef NDEBUG
7689 Type *ScalarTy = VL0->getType();
7690 if (DL->getTypeSizeInBits(ScalarTy) !=
7691 DL->getTypeAllocSizeInBits(ScalarTy))
7692 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7693 else if (any_of(VL, [](Value *V) {
7694 auto *LI = dyn_cast<LoadInst>(V);
7695 return !LI || !LI->isSimple();
7696 }))
7697 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7698 else
7699 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7700#endif // NDEBUG
7701 registerNonVectorizableLoads(VL);
7702 return TreeEntry::NeedToGather;
7703 }
7704 llvm_unreachable("Unexpected state of loads");
7705 }
7706 case Instruction::ZExt:
7707 case Instruction::SExt:
7708 case Instruction::FPToUI:
7709 case Instruction::FPToSI:
7710 case Instruction::FPExt:
7711 case Instruction::PtrToInt:
7712 case Instruction::IntToPtr:
7713 case Instruction::SIToFP:
7714 case Instruction::UIToFP:
7715 case Instruction::Trunc:
7716 case Instruction::FPTrunc:
7717 case Instruction::BitCast: {
7718 Type *SrcTy = VL0->getOperand(0)->getType();
7719 for (Value *V : VL) {
7720 if (isa<PoisonValue>(V))
7721 continue;
7722 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7723 if (Ty != SrcTy || !isValidElementType(Ty)) {
7724 LLVM_DEBUG(
7725 dbgs() << "SLP: Gathering casts with different src types.\n");
7726 return TreeEntry::NeedToGather;
7727 }
7728 }
7729 return TreeEntry::Vectorize;
7730 }
7731 case Instruction::ICmp:
7732 case Instruction::FCmp: {
7733 // Check that all of the compares have the same predicate.
7734 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7735 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7736 Type *ComparedTy = VL0->getOperand(0)->getType();
7737 for (Value *V : VL) {
7738 if (isa<PoisonValue>(V))
7739 continue;
7740 auto *Cmp = cast<CmpInst>(V);
7741 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7742 Cmp->getOperand(0)->getType() != ComparedTy) {
7743 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7744 return TreeEntry::NeedToGather;
7745 }
7746 }
7747 return TreeEntry::Vectorize;
7748 }
7749 case Instruction::Select:
7750 case Instruction::FNeg:
7751 case Instruction::Add:
7752 case Instruction::FAdd:
7753 case Instruction::Sub:
7754 case Instruction::FSub:
7755 case Instruction::Mul:
7756 case Instruction::FMul:
7757 case Instruction::UDiv:
7758 case Instruction::SDiv:
7759 case Instruction::FDiv:
7760 case Instruction::URem:
7761 case Instruction::SRem:
7762 case Instruction::FRem:
7763 case Instruction::Shl:
7764 case Instruction::LShr:
7765 case Instruction::AShr:
7766 case Instruction::And:
7767 case Instruction::Or:
7768 case Instruction::Xor:
7769 case Instruction::Freeze:
7770 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7771 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7772 auto *I = dyn_cast<Instruction>(V);
7773 return I && I->isBinaryOp() && !I->isFast();
7774 }))
7775 return TreeEntry::NeedToGather;
7776 return TreeEntry::Vectorize;
7777 case Instruction::GetElementPtr: {
7778 // We don't combine GEPs with complicated (nested) indexing.
7779 for (Value *V : VL) {
7780 auto *I = dyn_cast<GetElementPtrInst>(V);
7781 if (!I)
7782 continue;
7783 if (I->getNumOperands() != 2) {
7784 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7785 return TreeEntry::NeedToGather;
7786 }
7787 }
7788
7789 // We can't combine several GEPs into one vector if they operate on
7790 // different types.
7791 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7792 for (Value *V : VL) {
7793 auto *GEP = dyn_cast<GEPOperator>(V);
7794 if (!GEP)
7795 continue;
7796 Type *CurTy = GEP->getSourceElementType();
7797 if (Ty0 != CurTy) {
7798 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7799 return TreeEntry::NeedToGather;
7800 }
7801 }
7802
7803 // We don't combine GEPs with non-constant indexes.
7804 Type *Ty1 = VL0->getOperand(1)->getType();
7805 for (Value *V : VL) {
7806 auto *I = dyn_cast<GetElementPtrInst>(V);
7807 if (!I)
7808 continue;
7809 auto *Op = I->getOperand(1);
7810 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7811 (Op->getType() != Ty1 &&
7812 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7813 Op->getType()->getScalarSizeInBits() >
7814 DL->getIndexSizeInBits(
7815 V->getType()->getPointerAddressSpace())))) {
7816 LLVM_DEBUG(
7817 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7818 return TreeEntry::NeedToGather;
7819 }
7820 }
7821
7822 return TreeEntry::Vectorize;
7823 }
7824 case Instruction::Store: {
7825 // Check if the stores are consecutive or if we need to swizzle them.
7826 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7827 // Avoid types that are padded when being allocated as scalars, while
7828 // being packed together in a vector (such as i1).
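// For example, an i1 has a type size of 1 bit but an alloc size of 8 bits,
// so eight scalar i1 stores write eight separate bytes whereas an <8 x i1>
// store would write a single byte.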
7829 if (DL->getTypeSizeInBits(ScalarTy) !=
7830 DL->getTypeAllocSizeInBits(ScalarTy)) {
7831 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7832 return TreeEntry::NeedToGather;
7833 }
7834 // Make sure all stores in the bundle are simple - we can't vectorize
7835 // atomic or volatile stores.
7836 for (Value *V : VL) {
7837 auto *SI = cast<StoreInst>(V);
7838 if (!SI->isSimple()) {
7839 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7840 return TreeEntry::NeedToGather;
7841 }
7842 PointerOps.push_back(SI->getPointerOperand());
7843 }
7844
7845 // Check the order of pointer operands.
7846 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7847 Value *Ptr0;
7848 Value *PtrN;
7849 if (CurrentOrder.empty()) {
7850 Ptr0 = PointerOps.front();
7851 PtrN = PointerOps.back();
7852 } else {
7853 Ptr0 = PointerOps[CurrentOrder.front()];
7854 PtrN = PointerOps[CurrentOrder.back()];
7855 }
7856 std::optional<int> Dist =
7857 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7858 // Check that the sorted pointer operands are consecutive.
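// E.g. for four i32 stores to hypothetical pointers p, p+4, p+8 and p+12
// (byte offsets), the element distance between the first and last pointer
// is 3 == VL.size() - 1, so the bundle is treated as consecutive.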
7859 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7860 return TreeEntry::Vectorize;
7861 }
7862
7863 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7864 return TreeEntry::NeedToGather;
7865 }
7866 case Instruction::Call: {
7867 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7868 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7869 auto *I = dyn_cast<Instruction>(V);
7870 return I && !I->isFast();
7871 }))
7872 return TreeEntry::NeedToGather;
7873 // Check if the calls are all to the same vectorizable intrinsic or
7874 // library function.
7875 CallInst *CI = cast<CallInst>(VL0);
7876 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7877
7878 VFShape Shape = VFShape::get(
7879 CI->getFunctionType(),
7880 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7881 false /*HasGlobalPred*/);
7882 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7883
7884 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7885 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7886 return TreeEntry::NeedToGather;
7887 }
7888 Function *F = CI->getCalledFunction();
7889 unsigned NumArgs = CI->arg_size();
7890 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7891 for (unsigned J = 0; J != NumArgs; ++J)
7892 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
7893 ScalarArgs[J] = CI->getArgOperand(J);
7894 for (Value *V : VL) {
7895 CallInst *CI2 = dyn_cast<CallInst>(V);
7896 if (!CI2 || CI2->getCalledFunction() != F ||
7897 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7898 (VecFunc &&
7899 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7900 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7901 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7902 << "\n");
7903 return TreeEntry::NeedToGather;
7904 }
7905 // Some intrinsics have scalar arguments, and these must be the same across
7906 // the bundle for the calls to be vectorized.
7907 for (unsigned J = 0; J != NumArgs; ++J) {
7908 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
7909 Value *A1J = CI2->getArgOperand(J);
7910 if (ScalarArgs[J] != A1J) {
7911 LLVM_DEBUG(dbgs()
7912 << "SLP: mismatched arguments in call:" << *CI
7913 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7914 return TreeEntry::NeedToGather;
7915 }
7916 }
7917 }
7918 // Verify that the bundle operands are identical between the two calls.
7919 if (CI->hasOperandBundles() &&
7920 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7921 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7922 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7923 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7924 << "!=" << *V << '\n');
7925 return TreeEntry::NeedToGather;
7926 }
7927 }
7928
7929 return TreeEntry::Vectorize;
7930 }
7931 case Instruction::ShuffleVector: {
7932 if (!S.isAltShuffle()) {
7933 // REVEC can support non alternate shuffle.
7934 if (SLPReVec && getShufflevectorNumGroups(VL))
7935 return TreeEntry::Vectorize;
7936 // If this is not an alternate sequence of opcodes like add-sub,
7937 // then do not vectorize this instruction.
7938 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7939 return TreeEntry::NeedToGather;
7940 }
7941 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7942 LLVM_DEBUG(
7943 dbgs()
7944 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7945 "the whole alt sequence is not profitable.\n");
7946 return TreeEntry::NeedToGather;
7947 }
7948
7949 return TreeEntry::Vectorize;
7950 }
7951 default:
7952 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7953 return TreeEntry::NeedToGather;
7954 }
7955}
7956
7957namespace {
7958/// Allows correct handling of operands of the phi nodes based on the \p Main
7959/// PHINode's order of incoming basic blocks/values.
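/// For example, for the (hypothetical) bundle
/// \code
///   %phi0 = phi i32 [ %a, %bb0 ], [ %b, %bb1 ]
///   %phi1 = phi i32 [ %c, %bb1 ], [ %d, %bb0 ]
/// \endcode
/// with %phi0 as the main phi, the operand vector for %bb0 is {%a, %d} and
/// the one for %bb1 is {%b, %c}, independent of the order in which each phi
/// lists its incoming blocks.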
7960class PHIHandler {
7961 DominatorTree &DT;
7962 PHINode *Main = nullptr;
7963 SmallVector<Value *> Phis;
7964 SmallVector<SmallVector<Value *>> Operands;
7965
7966public:
7967 PHIHandler() = delete;
7968 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7969 : DT(DT), Main(Main), Phis(Phis),
7970 Operands(Main->getNumIncomingValues(),
7971 SmallVector<Value *>(Phis.size(), nullptr)) {}
7972 void buildOperands() {
7973 constexpr unsigned FastLimit = 4;
7974 if (Main->getNumIncomingValues() <= FastLimit) {
7975 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7976 BasicBlock *InBB = Main->getIncomingBlock(I);
7977 if (!DT.isReachableFromEntry(InBB)) {
7978 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7979 continue;
7980 }
7981 // Prepare the operand vector.
7982 for (auto [Idx, V] : enumerate(Phis)) {
7983 auto *P = dyn_cast<PHINode>(V);
7984 if (!P) {
7985 assert(isa<PoisonValue>(V) &&
7986 "Expected isa instruction or poison value.");
7987 Operands[I][Idx] = V;
7988 continue;
7989 }
7990 if (P->getIncomingBlock(I) == InBB)
7991 Operands[I][Idx] = P->getIncomingValue(I);
7992 else
7993 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
7994 }
7995 }
7996 return;
7997 }
7998 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
7999 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8000 BasicBlock *InBB = Main->getIncomingBlock(I);
8001 if (!DT.isReachableFromEntry(InBB)) {
8002 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8003 continue;
8004 }
8005 Blocks.try_emplace(InBB).first->second.push_back(I);
8006 }
8007 for (auto [Idx, V] : enumerate(Phis)) {
8008 if (isa<PoisonValue>(V)) {
8009 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
8010 Operands[I][Idx] = V;
8011 continue;
8012 }
8013 auto *P = cast<PHINode>(V);
8014 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
8015 BasicBlock *InBB = P->getIncomingBlock(I);
8016 if (InBB == Main->getIncomingBlock(I)) {
8017 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8018 continue;
8019 Operands[I][Idx] = P->getIncomingValue(I);
8020 continue;
8021 }
8022 auto It = Blocks.find(InBB);
8023 if (It == Blocks.end())
8024 continue;
8025 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
8026 }
8027 }
8028 for (const auto &P : Blocks) {
8029 if (P.getSecond().size() <= 1)
8030 continue;
8031 unsigned BasicI = P.getSecond().front();
8032 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
8033 assert(all_of(enumerate(Operands[I]),
8034 [&](const auto &Data) {
8035 return !Data.value() ||
8036 Data.value() == Operands[BasicI][Data.index()];
8037 }) &&
8038 "Expected empty operands list.");
8039 Operands[I] = Operands[BasicI];
8040 }
8041 }
8042 }
8043 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8044};
8045} // namespace
8046
8047void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8048 const EdgeInfo &UserTreeIdx,
8049 unsigned InterleaveFactor) {
8050 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8051
8052 SmallVector<int> ReuseShuffleIndices;
8053 SmallVector<Value *> UniqueValues;
8054 SmallVector<Value *> NonUniqueValueVL;
8055 auto TryToFindDuplicates = [&](const InstructionsState &S,
8056 bool DoNotFail = false) {
8057 // Check that every instruction appears once in this bundle.
8058 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8059 for (Value *V : VL) {
8060 if (isConstant(V)) {
8061 ReuseShuffleIndices.emplace_back(
8062 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8063 UniqueValues.emplace_back(V);
8064 continue;
8065 }
8066 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8067 ReuseShuffleIndices.emplace_back(Res.first->second);
8068 if (Res.second)
8069 UniqueValues.emplace_back(V);
8070 }
8071 size_t NumUniqueScalarValues = UniqueValues.size();
8072 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8073 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8074 if (NumUniqueScalarValues == VL.size() &&
8075 (VectorizeNonPowerOf2 || IsFullVectors)) {
8076 ReuseShuffleIndices.clear();
8077 } else {
8078 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8079 if ((UserTreeIdx.UserTE &&
8080 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8081 !has_single_bit(VL.size())) {
8082 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8083 "for nodes with padding.\n");
8084 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8085 return false;
8086 }
8087 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8088 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8089 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8090 return isa<UndefValue>(V) || !isConstant(V);
8091 }))) {
8092 if (DoNotFail && UniquePositions.size() > 1 &&
8093 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8094 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8095 // Find the number of elements which form full vectors.
8096 unsigned PWSz = getFullVectorNumberOfElements(
8097 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8098 if (PWSz == VL.size()) {
8099 ReuseShuffleIndices.clear();
8100 } else {
8101 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8102 NonUniqueValueVL.append(
8103 PWSz - UniqueValues.size(),
8104 PoisonValue::get(UniqueValues.front()->getType()));
8105 // Check that the operations extended with poison values are still valid for
8106 // vectorization (div/rem are not allowed).
8107 if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8108 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8109 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8110 return false;
8111 }
8112 VL = NonUniqueValueVL;
8113 }
8114 return true;
8115 }
8116 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8117 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8118 return false;
8119 }
8120 VL = UniqueValues;
8121 }
8122 return true;
8123 };
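// For example, in TryToFindDuplicates a hypothetical bundle {%x, %y, %x, %y}
// yields unique scalars {%x, %y} and ReuseShuffleIndices {0, 1, 0, 1}: the
// node is built from the two unique scalars and the reuse mask recreates the
// original four lanes.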
8124
8125 InstructionsState S = getSameOpcode(VL, *TLI);
8126
8127 // Don't go into catchswitch blocks, which can happen with PHIs.
8128 // Such blocks can only have PHIs and the catchswitch. There is no
8129 // place to insert a shuffle if we need to, so just avoid that issue.
8130 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8131 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8132 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8133 return;
8134 }
8135
8136 // Check if this is a duplicate of another entry.
8137 if (S) {
8138 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8139 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8140 << ".\n");
8141 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8142 auto It = MultiNodeScalars.find(S.getMainOp());
8143 if (It != MultiNodeScalars.end()) {
8144 auto *TEIt = find_if(It->getSecond(),
8145 [&](TreeEntry *ME) { return ME->isSame(VL); });
8146 if (TEIt != It->getSecond().end())
8147 E = *TEIt;
8148 else
8149 E = nullptr;
8150 } else {
8151 E = nullptr;
8152 }
8153 }
8154 if (!E) {
8155 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8156 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8157 if (TryToFindDuplicates(S))
8158 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8159 ReuseShuffleIndices);
8160 return;
8161 }
8162 SmallPtrSet<const TreeEntry *, 4> Nodes;
8163 Nodes.insert(getTreeEntry(S.getMainOp()));
8164 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8165 Nodes.insert(E);
8166 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8167 if (any_of(Nodes, [&](const TreeEntry *E) {
8168 if (all_of(E->Scalars,
8169 [&](Value *V) { return Values.contains(V); }))
8170 return true;
8171 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8172 E->Scalars.end());
8173 return (
8174 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8175 })) {
8176 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8177 if (TryToFindDuplicates(S))
8178 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8179 ReuseShuffleIndices);
8180 return;
8181 }
8182 } else {
8183 // Record the reuse of the tree node. FIXME, currently this is only
8184 // used to properly draw the graph rather than for the actual
8185 // vectorization.
8186 E->UserTreeIndices.push_back(UserTreeIdx);
8187 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8188 << ".\n");
8189 return;
8190 }
8191 }
8192 }
8193
8194 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8195 // a load), in which case peek through to include it in the tree, without
8196 // ballooning over-budget.
8197 if (Depth >= RecursionMaxDepth &&
8198 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8199 (match(S.getMainOp(), m_Load(m_Value())) ||
8200 all_of(VL, [&S](const Value *I) {
8201 return match(I,
8202 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8203 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8204 })))) {
8205 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8206 if (TryToFindDuplicates(S))
8207 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8208 ReuseShuffleIndices);
8209 return;
8210 }
8211
8212 // Don't handle scalable vectors
8213 if (S && S.getOpcode() == Instruction::ExtractElement &&
8214 isa<ScalableVectorType>(
8215 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8216 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8217 if (TryToFindDuplicates(S))
8218 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8219 ReuseShuffleIndices);
8220 return;
8221 }
8222
8223 // Don't handle vectors.
8224 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8225 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8226 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8227 return;
8228 }
8229
8230 // If all of the operands are identical or constant we have a simple solution.
8231 // If we deal with insert/extract instructions, they all must have constant
8232 // indices, otherwise we should gather them, not try to vectorize.
8233 // If this is an alternate-op node with 2 elements and gathered operands, do
8234 // not vectorize.
8235 auto &&NotProfitableForVectorization = [&S, this,
8236 Depth](ArrayRef<Value *> VL) {
8237 if (!S || !S.isAltShuffle() || VL.size() > 2)
8238 return false;
8239 if (VectorizableTree.size() < MinTreeSize)
8240 return false;
8241 if (Depth >= RecursionMaxDepth - 1)
8242 return true;
8243 // Check if all operands are extracts, are part of a vector node, or can
8244 // build a regular vectorizable node.
8245 SmallVector<unsigned, 8> InstsCount;
8246 for (Value *V : VL) {
8247 auto *I = cast<Instruction>(V);
8248 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8249 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8250 }));
8251 }
8252 bool IsCommutative =
8253 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8254 if ((IsCommutative &&
8255 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8256 (!IsCommutative &&
8257 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8258 return true;
8259 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8260 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8261 auto *I1 = cast<Instruction>(VL.front());
8262 auto *I2 = cast<Instruction>(VL.back());
8263 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8264 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8265 I2->getOperand(Op));
8266 if (static_cast<unsigned>(count_if(
8267 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8269 })) >= S.getMainOp()->getNumOperands() / 2)
8270 return false;
8271 if (S.getMainOp()->getNumOperands() > 2)
8272 return true;
8273 if (IsCommutative) {
8274 // Check permuted operands.
8275 Candidates.clear();
8276 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8277 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8278 I2->getOperand((Op + 1) % E));
8279 if (any_of(
8280 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8282 }))
8283 return false;
8284 }
8285 return true;
8286 };
8287 SmallVector<unsigned> SortedIndices;
8288 BasicBlock *BB = nullptr;
8289 bool IsScatterVectorizeUserTE =
8290 UserTreeIdx.UserTE &&
8291 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8292 bool AreAllSameBlock = S && allSameBlock(VL);
8293 bool AreScatterAllGEPSameBlock =
8294 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8295 VL.size() > 2 &&
8296 all_of(VL,
8297 [&BB](Value *V) {
8298 auto *I = dyn_cast<GetElementPtrInst>(V);
8299 if (!I)
8300 return doesNotNeedToBeScheduled(V);
8301 if (!BB)
8302 BB = I->getParent();
8303 return BB == I->getParent() && I->getNumOperands() == 2;
8304 }) &&
8305 BB &&
8306 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8307 SortedIndices));
8308 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8309 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8310 (S &&
8311 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8312 S.getMainOp()) &&
8313 !all_of(VL, isVectorLikeInstWithConstOps)) ||
8314 NotProfitableForVectorization(VL)) {
8315 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8316 if (TryToFindDuplicates(S))
8317 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8318 ReuseShuffleIndices);
8319 return;
8320 }
8321
8322 // Don't vectorize ephemeral values.
8323 if (S && !EphValues.empty()) {
8324 for (Value *V : VL) {
8325 if (EphValues.count(V)) {
8326 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8327 << ") is ephemeral.\n");
8328 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8329 return;
8330 }
8331 }
8332 }
8333
8334 // We now know that this is a vector of instructions of the same type from
8335 // the same block.
8336
8337 // Check that none of the instructions in the bundle are already in the tree.
8338 for (Value *V : VL) {
8339 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8340 doesNotNeedToBeScheduled(V))
8341 continue;
8342 if (getTreeEntry(V)) {
8343 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8344 << ") is already in tree.\n");
8345 if (TryToFindDuplicates(S))
8346 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8347 ReuseShuffleIndices);
8348 return;
8349 }
8350 }
8351
8352 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8353 if (UserIgnoreList && !UserIgnoreList->empty()) {
8354 for (Value *V : VL) {
8355 if (UserIgnoreList->contains(V)) {
8356 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8357 if (TryToFindDuplicates(S))
8358 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8359 ReuseShuffleIndices);
8360 return;
8361 }
8362 }
8363 }
8364
8365 // Special processing for sorted pointers for ScatterVectorize node with
8366 // constant indices only.
8367 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8368 assert(VL.front()->getType()->isPointerTy() &&
8369 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8370 "Expected pointers only.");
8371 // Reset S to make it GetElementPtr kind of node.
8372 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8373 assert(It != VL.end() && "Expected at least one GEP.");
8374 S = getSameOpcode(*It, *TLI);
8375 }
8376
8377 // Check that all of the users of the scalars that we want to vectorize are
8378 // schedulable.
8379 Instruction *VL0 = S.getMainOp();
8380 BB = VL0->getParent();
8381
8382 if (S &&
8383 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8384 !DT->isReachableFromEntry(BB))) {
8385 // Don't go into unreachable blocks. They may contain instructions with
8386 // dependency cycles which confuse the final scheduling.
8387 // Do not vectorize EH and non-returning blocks, not profitable in most
8388 // cases.
8389 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8390 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8391 return;
8392 }
8393
8394 // Check that every instruction appears once in this bundle.
8395 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8396 return;
8397
8398 // Perform specific checks for each particular instruction kind.
8399 OrdersType CurrentOrder;
8400 SmallVector<Value *> PointerOps;
8401 TreeEntry::EntryState State = getScalarsVectorizationState(
8402 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8403 if (State == TreeEntry::NeedToGather) {
8404 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8405 ReuseShuffleIndices);
8406 return;
8407 }
8408
8409 auto &BSRef = BlocksSchedules[BB];
8410 if (!BSRef)
8411 BSRef = std::make_unique<BlockScheduling>(BB);
8412
8413 BlockScheduling &BS = *BSRef;
8414
8415 std::optional<ScheduleData *> Bundle =
8416 BS.tryScheduleBundle(UniqueValues, this, S);
8417#ifdef EXPENSIVE_CHECKS
8418 // Make sure we didn't break any internal invariants
8419 BS.verify();
8420#endif
8421 if (!Bundle) {
8422 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8423 assert((!BS.getScheduleData(VL0) ||
8424 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8425 "tryScheduleBundle should cancelScheduling on failure");
8426 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8427 ReuseShuffleIndices);
8428 NonScheduledFirst.insert(VL.front());
8429 if (S.getOpcode() == Instruction::Load &&
8430 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8431 registerNonVectorizableLoads(VL);
8432 return;
8433 }
8434 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8435
8436 unsigned ShuffleOrOp =
8437 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8438 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8439 // Postpone PHI nodes creation
8440 SmallVector<unsigned> PHIOps;
8441 for (unsigned I : seq<unsigned>(Operands.size())) {
8442 ArrayRef<Value *> Op = Operands[I];
8443 if (Op.empty())
8444 continue;
8445 InstructionsState S = getSameOpcode(Op, *TLI);
8446 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8447 buildTree_rec(Op, Depth + 1, {TE, I});
8448 else
8449 PHIOps.push_back(I);
8450 }
8451 for (unsigned I : PHIOps)
8452 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8453 };
8454 switch (ShuffleOrOp) {
8455 case Instruction::PHI: {
8456 auto *PH = cast<PHINode>(VL0);
8457
8458 TreeEntry *TE =
8459 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8460 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8461 TE->dump());
8462
8463 // Keeps the reordered operands to avoid code duplication.
8464 PHIHandler Handler(*DT, PH, VL);
8465 Handler.buildOperands();
8466 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8467 TE->setOperand(I, Handler.getOperands(I));
8468 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8469 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8470 Operands[I] = Handler.getOperands(I);
8471 CreateOperandNodes(TE, Operands);
8472 return;
8473 }
8474 case Instruction::ExtractValue:
8475 case Instruction::ExtractElement: {
8476 if (CurrentOrder.empty()) {
8477 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8478 } else {
8479 LLVM_DEBUG({
8480 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8481 "with order";
8482 for (unsigned Idx : CurrentOrder)
8483 dbgs() << " " << Idx;
8484 dbgs() << "\n";
8485 });
8486 fixupOrderingIndices(CurrentOrder);
8487 }
8488 // Insert new order with initial value 0, if it does not exist,
8489 // otherwise return the iterator to the existing one.
8490 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8491 ReuseShuffleIndices, CurrentOrder);
8492 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8493 "(ExtractValueInst/ExtractElementInst).\n";
8494 TE->dump());
8495 // This is a special case, as it does not gather, but at the same time
8496 // we are not extending buildTree_rec() towards the operands.
8497 TE->setOperand(*this);
8498 return;
8499 }
8500 case Instruction::InsertElement: {
8501 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8502
8503 auto OrdCompare = [](const std::pair<int, int> &P1,
8504 const std::pair<int, int> &P2) {
8505 return P1.first > P2.first;
8506 };
8507 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8508 decltype(OrdCompare)>
8509 Indices(OrdCompare);
8510 for (int I = 0, E = VL.size(); I < E; ++I) {
8511 unsigned Idx = *getElementIndex(VL[I]);
8512 Indices.emplace(Idx, I);
8513 }
8514 OrdersType CurrentOrder(VL.size(), VL.size());
8515 bool IsIdentity = true;
8516 for (int I = 0, E = VL.size(); I < E; ++I) {
8517 CurrentOrder[Indices.top().second] = I;
8518 IsIdentity &= Indices.top().second == I;
8519 Indices.pop();
8520 }
8521 if (IsIdentity)
8522 CurrentOrder.clear();
8523 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8524 {}, CurrentOrder);
8525 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8526 TE->dump());
8527
8528 TE->setOperand(*this);
8529 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8530 return;
8531 }
8532 case Instruction::Load: {
8533 // Check that a vectorized load would load the same memory as a scalar
8534 // load. For example, we don't want to vectorize loads that are smaller
8535 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8536 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8537 // from such a struct, we read/write packed bits disagreeing with the
8538 // unvectorized version.
8539 TreeEntry *TE = nullptr;
8540 fixupOrderingIndices(CurrentOrder);
8541 switch (State) {
8542 case TreeEntry::Vectorize:
8543 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8544 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8545 if (CurrentOrder.empty())
8546 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8547 TE->dump());
8548 else
8549 LLVM_DEBUG(dbgs()
8550 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8551 TE->dump());
8552 break;
8553 case TreeEntry::StridedVectorize:
8554 // Vectorizing non-consecutive loads as strided loads.
8555 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8556 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8557 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8558 TE->dump());
8559 break;
8560 case TreeEntry::ScatterVectorize:
8561 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8562 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8563 UserTreeIdx, ReuseShuffleIndices);
8564 LLVM_DEBUG(
8565 dbgs()
8566 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8567 TE->dump());
8568 break;
8569 case TreeEntry::CombinedVectorize:
8570 case TreeEntry::NeedToGather:
8571 llvm_unreachable("Unexpected loads state.");
8572 }
8573 TE->setOperand(*this);
8574 if (State == TreeEntry::ScatterVectorize)
8575 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8576 return;
8577 }
8578 case Instruction::ZExt:
8579 case Instruction::SExt:
8580 case Instruction::FPToUI:
8581 case Instruction::FPToSI:
8582 case Instruction::FPExt:
8583 case Instruction::PtrToInt:
8584 case Instruction::IntToPtr:
8585 case Instruction::SIToFP:
8586 case Instruction::UIToFP:
8587 case Instruction::Trunc:
8588 case Instruction::FPTrunc:
8589 case Instruction::BitCast: {
8590 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8591 std::make_pair(std::numeric_limits<unsigned>::min(),
8592 std::numeric_limits<unsigned>::max()));
8593 if (ShuffleOrOp == Instruction::ZExt ||
8594 ShuffleOrOp == Instruction::SExt) {
8595 CastMaxMinBWSizes = std::make_pair(
8596 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8597 PrevMaxBW),
8598 std::min<unsigned>(
8599 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8600 PrevMinBW));
8601 } else if (ShuffleOrOp == Instruction::Trunc) {
8602 CastMaxMinBWSizes = std::make_pair(
8603 std::max<unsigned>(
8604 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8605 PrevMaxBW),
8606 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8607 PrevMinBW));
8608 }
8609 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8610 ReuseShuffleIndices);
8611 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8612 TE->dump());
8613
8614 TE->setOperand(*this);
8615 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8616 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8617 if (ShuffleOrOp == Instruction::Trunc) {
8618 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8619 } else if (ShuffleOrOp == Instruction::SIToFP ||
8620 ShuffleOrOp == Instruction::UIToFP) {
8621 unsigned NumSignBits =
8622 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8623 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8624 APInt Mask = DB->getDemandedBits(OpI);
8625 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8626 }
8627 if (NumSignBits * 2 >=
8628 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8629 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8630 }
8631 return;
8632 }
8633 case Instruction::ICmp:
8634 case Instruction::FCmp: {
8635 // Check that all of the compares have the same predicate.
8636 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8637 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8638 ReuseShuffleIndices);
8639 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8640 TE->dump());
8641
8642 ValueList Left, Right;
8643 VLOperands Ops(VL, VL0, *this);
8644 if (cast<CmpInst>(VL0)->isCommutative()) {
8645 // Commutative predicate - collect + sort operands of the instructions
8646 // so that each side is more likely to have the same opcode.
8647 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8648 "Commutative Predicate mismatch");
8649 Ops.reorder();
8650 Left = Ops.getVL(0);
8651 Right = Ops.getVL(1);
8652 } else {
8653 // Collect operands - commute if it uses the swapped predicate.
8654 for (Value *V : VL) {
8655 if (isa<PoisonValue>(V)) {
8656 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8657 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8658 continue;
8659 }
8660 auto *Cmp = cast<CmpInst>(V);
8661 Value *LHS = Cmp->getOperand(0);
8662 Value *RHS = Cmp->getOperand(1);
8663 if (Cmp->getPredicate() != P0)
8664 std::swap(LHS, RHS);
8665 Left.push_back(LHS);
8666 Right.push_back(RHS);
8667 }
8668 }
8669 TE->setOperand(0, Left);
8670 TE->setOperand(1, Right);
8671 buildTree_rec(Left, Depth + 1, {TE, 0});
8672 buildTree_rec(Right, Depth + 1, {TE, 1});
8673 if (ShuffleOrOp == Instruction::ICmp) {
8674 unsigned NumSignBits0 =
8675 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8676 if (NumSignBits0 * 2 >=
8677 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8678 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8679 unsigned NumSignBits1 =
8680 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8681 if (NumSignBits1 * 2 >=
8682 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8683 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8684 }
8685 return;
8686 }
8687 case Instruction::Select:
8688 case Instruction::FNeg:
8689 case Instruction::Add:
8690 case Instruction::FAdd:
8691 case Instruction::Sub:
8692 case Instruction::FSub:
8693 case Instruction::Mul:
8694 case Instruction::FMul:
8695 case Instruction::UDiv:
8696 case Instruction::SDiv:
8697 case Instruction::FDiv:
8698 case Instruction::URem:
8699 case Instruction::SRem:
8700 case Instruction::FRem:
8701 case Instruction::Shl:
8702 case Instruction::LShr:
8703 case Instruction::AShr:
8704 case Instruction::And:
8705 case Instruction::Or:
8706 case Instruction::Xor:
8707 case Instruction::Freeze: {
8708 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8709 ReuseShuffleIndices);
8710 LLVM_DEBUG(
8711 dbgs() << "SLP: added a new TreeEntry "
8712 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8713 TE->dump());
8714
8715 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8716 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8717 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8718 return;
8719 }
8720 case Instruction::GetElementPtr: {
8721 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8722 ReuseShuffleIndices);
8723 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8724 TE->dump());
8725 SmallVector<ValueList, 2> Operands(2);
8726 // Prepare the operand vector for pointer operands.
8727 for (Value *V : VL) {
8728 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8729 if (!GEP) {
8730 Operands.front().push_back(V);
8731 continue;
8732 }
8733 Operands.front().push_back(GEP->getPointerOperand());
8734 }
8735 TE->setOperand(0, Operands.front());
8736 // Need to cast all indices to the same type before vectorization to
8737 // avoid a crash.
8738 // Required to be able to find correct matches between different gather
8739 // nodes and reuse the vectorized values rather than trying to gather them
8740 // again.
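// For example, if one GEP in the bundle uses an i32 constant index and
// another uses an i64 constant index, the constant indices are folded to a
// common index type below so that a single index vector can be built.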
8741 int IndexIdx = 1;
8742 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8743 Type *Ty = all_of(VL,
8744 [VL0Ty, IndexIdx](Value *V) {
8745 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8746 if (!GEP)
8747 return true;
8748 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8749 })
8750 ? VL0Ty
8751 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8752 ->getPointerOperandType()
8753 ->getScalarType());
8754 // Prepare the operand vector.
8755 for (Value *V : VL) {
8756 auto *I = dyn_cast<GetElementPtrInst>(V);
8757 if (!I) {
8758 Operands.back().push_back(
8759 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8760 continue;
8761 }
8762 auto *Op = I->getOperand(IndexIdx);
8763 auto *CI = dyn_cast<ConstantInt>(Op);
8764 if (!CI)
8765 Operands.back().push_back(Op);
8766 else
8767 Operands.back().push_back(ConstantFoldIntegerCast(
8768 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8769 }
8770 TE->setOperand(IndexIdx, Operands.back());
8771
8772 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8773 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8774 return;
8775 }
8776 case Instruction::Store: {
8777 bool Consecutive = CurrentOrder.empty();
8778 if (!Consecutive)
8779 fixupOrderingIndices(CurrentOrder);
8780 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8781 ReuseShuffleIndices, CurrentOrder);
8782 if (Consecutive)
8783 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8784 TE->dump());
8785 else
8786 LLVM_DEBUG(
8787 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8788 TE->dump());
8789 TE->setOperand(*this);
8790 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8791 return;
8792 }
8793 case Instruction::Call: {
8794 // Check if the calls are all to the same vectorizable intrinsic or
8795 // library function.
8796 CallInst *CI = cast<CallInst>(VL0);
8797 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8798
8799 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8800 ReuseShuffleIndices);
8801 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8802 TE->dump());
8803 TE->setOperand(*this, isCommutative(VL0));
8804 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8805 // For scalar operands there is no need to create an entry since there is
8806 // nothing to vectorize.
8807 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
8808 continue;
8809 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8810 }
8811 return;
8812 }
8813 case Instruction::ShuffleVector: {
8814 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8815 ReuseShuffleIndices);
8816 if (S.isAltShuffle()) {
8817 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8818 TE->dump());
8819 } else {
8820 assert(SLPReVec && "Only supported by REVEC.");
8821 LLVM_DEBUG(
8822 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8823 TE->dump());
8824 }
8825
8826 // Reorder operands if reordering would enable vectorization.
8827 auto *CI = dyn_cast<CmpInst>(VL0);
8828 if (CI && any_of(VL, [](Value *V) {
8829 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8830 })) {
8831 auto *MainCI = cast<CmpInst>(S.getMainOp());
8832 auto *AltCI = cast<CmpInst>(S.getAltOp());
8833 CmpInst::Predicate MainP = MainCI->getPredicate();
8834 CmpInst::Predicate AltP = AltCI->getPredicate();
8835 assert(MainP != AltP &&
8836 "Expected different main/alternate predicates.");
8838 // Collect operands - commute if it uses the swapped predicate or
8839 // alternate operation.
8840 for (Value *V : VL) {
8841 if (isa<PoisonValue>(V)) {
8842 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8843 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8844 continue;
8845 }
8846 auto *Cmp = cast<CmpInst>(V);
8847 Value *LHS = Cmp->getOperand(0);
8848 Value *RHS = Cmp->getOperand(1);
8849
8850 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8851 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8852 std::swap(LHS, RHS);
8853 } else {
8854 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8855 std::swap(LHS, RHS);
8856 }
8857 Left.push_back(LHS);
8858 Right.push_back(RHS);
8859 }
8860 TE->setOperand(0, Left);
8861 TE->setOperand(1, Right);
8862 buildTree_rec(Left, Depth + 1, {TE, 0});
8863 buildTree_rec(Right, Depth + 1, {TE, 1});
8864 return;
8865 }
8866
8867 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8868 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8869 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8870 return;
8871 }
8872 default:
8873 break;
8874 }
8875 llvm_unreachable("Unexpected vectorization of the instructions.");
8876}
8877
8878unsigned BoUpSLP::canMapToVector(Type *T) const {
8879 unsigned N = 1;
8880 Type *EltTy = T;
8881
8882 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8883 if (EltTy->isEmptyTy())
8884 return 0;
8885 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8886 // Check that struct is homogeneous.
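// E.g. a {i32, i32, i32, i32} struct contributes 4 elements, while a mixed
// {i32, float} struct cannot be mapped and the function returns 0.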
8887 for (const auto *Ty : ST->elements())
8888 if (Ty != *ST->element_begin())
8889 return 0;
8890 N *= ST->getNumElements();
8891 EltTy = *ST->element_begin();
8892 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8893 N *= AT->getNumElements();
8894 EltTy = AT->getElementType();
8895 } else {
8896 auto *VT = cast<FixedVectorType>(EltTy);
8897 N *= VT->getNumElements();
8898 EltTy = VT->getElementType();
8899 }
8900 }
8901
8902 if (!isValidElementType(EltTy))
8903 return 0;
8904 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8905 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8906 VTSize != DL->getTypeStoreSizeInBits(T))
8907 return 0;
8908 return N;
8909}
8910
8911bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
8912 SmallVectorImpl<unsigned> &CurrentOrder,
8913 bool ResizeAllowed) const {
8914 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8915 assert(It != VL.end() && "Expected at least one extract instruction.");
8916 auto *E0 = cast<Instruction>(*It);
8917 assert(
8918 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8919 "Invalid opcode");
8920 // Check if all of the extracts come from the same vector and from the
8921 // correct offset.
8922 Value *Vec = E0->getOperand(0);
8923
8924 CurrentOrder.clear();
8925
8926 // We have to extract from a vector/aggregate with the same number of elements.
8927 unsigned NElts;
8928 if (E0->getOpcode() == Instruction::ExtractValue) {
8929 NElts = canMapToVector(Vec->getType());
8930 if (!NElts)
8931 return false;
8932 // Check if load can be rewritten as load of vector.
8933 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8934 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8935 return false;
8936 } else {
8937 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8938 }
8939
8940 unsigned E = VL.size();
8941 if (!ResizeAllowed && NElts != E)
8942 return false;
8943 SmallVector<int> Indices(E, PoisonMaskElem);
8944 unsigned MinIdx = NElts, MaxIdx = 0;
8945 for (auto [I, V] : enumerate(VL)) {
8946 auto *Inst = dyn_cast<Instruction>(V);
8947 if (!Inst)
8948 continue;
8949 if (Inst->getOperand(0) != Vec)
8950 return false;
8951 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8952 if (isa<UndefValue>(EE->getIndexOperand()))
8953 continue;
8954 std::optional<unsigned> Idx = getExtractIndex(Inst);
8955 if (!Idx)
8956 return false;
8957 const unsigned ExtIdx = *Idx;
8958 if (ExtIdx >= NElts)
8959 continue;
8960 Indices[I] = ExtIdx;
8961 if (MinIdx > ExtIdx)
8962 MinIdx = ExtIdx;
8963 if (MaxIdx < ExtIdx)
8964 MaxIdx = ExtIdx;
8965 }
8966 if (MaxIdx - MinIdx + 1 > E)
8967 return false;
8968 if (MaxIdx + 1 <= E)
8969 MinIdx = 0;
8970
8971 // Check that all of the indices extract from the correct offset.
8972 bool ShouldKeepOrder = true;
8973 // Assign to all items the initial value E so we can check if the extract
8974 // instruction index was used already.
8975 // Also, later we can check that all the indices are used and we have a
8976 // consecutive access in the extract instructions, by checking that no
8977 // element of CurrentOrder still has value E.
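// For example, for extracts with indices 1, 0, 3, 2 from the same 4-element
// vector, CurrentOrder becomes {1, 0, 3, 2} and ShouldKeepOrder is false, so
// the caller may still vectorize the bundle by applying that reordering.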
8978 CurrentOrder.assign(E, E);
8979 for (unsigned I = 0; I < E; ++I) {
8980 if (Indices[I] == PoisonMaskElem)
8981 continue;
8982 const unsigned ExtIdx = Indices[I] - MinIdx;
8983 if (CurrentOrder[ExtIdx] != E) {
8984 CurrentOrder.clear();
8985 return false;
8986 }
8987 ShouldKeepOrder &= ExtIdx == I;
8988 CurrentOrder[ExtIdx] = I;
8989 }
8990 if (ShouldKeepOrder)
8991 CurrentOrder.clear();
8992
8993 return ShouldKeepOrder;
8994}
8995
8996bool BoUpSLP::areAllUsersVectorized(
8997 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
8998 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
8999 all_of(I->users(), [this](User *U) {
9000 return ScalarToTreeEntry.contains(U) ||
9001 isVectorLikeInstWithConstOps(U) ||
9002 (isa<ExtractElementInst>(U) && MustGather.contains(U));
9003 });
9004}
9005
9006static std::pair<InstructionCost, InstructionCost>
9007 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9008 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9009 ArrayRef<Type *> ArgTys) {
9010 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9011
9012 // Calculate the cost of the scalar and vector calls.
9013 FastMathFlags FMF;
9014 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9015 FMF = FPCI->getFastMathFlags();
9016 SmallVector<const Value *> Arguments(CI->args());
9017 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
9018 dyn_cast<IntrinsicInst>(CI));
9019 auto IntrinsicCost =
9020 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9021
9022 auto Shape = VFShape::get(CI->getFunctionType(),
9023 ElementCount::getFixed(VecTy->getNumElements()),
9024 false /*HasGlobalPred*/);
9025 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9026 auto LibCost = IntrinsicCost;
9027 if (!CI->isNoBuiltin() && VecFunc) {
9028 // Calculate the cost of the vector library call.
9029 // If the corresponding vector call is cheaper, return its cost.
9030 LibCost =
9031 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9032 }
9033 return {IntrinsicCost, LibCost};
9034}
9035
9036void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9037 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
9038 SmallVectorImpl<Value *> *OpScalars,
9039 SmallVectorImpl<Value *> *AltScalars) const {
9040 unsigned Sz = Scalars.size();
9041 Mask.assign(Sz, PoisonMaskElem);
9042 SmallVector<int> OrderMask;
9043 if (!ReorderIndices.empty())
9044 inversePermutation(ReorderIndices, OrderMask);
9045 for (unsigned I = 0; I < Sz; ++I) {
9046 unsigned Idx = I;
9047 if (!ReorderIndices.empty())
9048 Idx = OrderMask[I];
9049 if (isa<PoisonValue>(Scalars[Idx]))
9050 continue;
9051 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9052 if (IsAltOp(OpInst)) {
9053 Mask[I] = Sz + Idx;
9054 if (AltScalars)
9055 AltScalars->push_back(OpInst);
9056 } else {
9057 Mask[I] = Idx;
9058 if (OpScalars)
9059 OpScalars->push_back(OpInst);
9060 }
9061 }
9062 if (!ReuseShuffleIndices.empty()) {
9063 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9064 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9065 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9066 });
9067 Mask.swap(NewMask);
9068 }
9069}
9070
9071static bool isAlternateInstruction(const Instruction *I,
9072 const Instruction *MainOp,
9073 const Instruction *AltOp,
9074 const TargetLibraryInfo &TLI) {
9075 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9076 auto *AltCI = cast<CmpInst>(AltOp);
9077 CmpInst::Predicate MainP = MainCI->getPredicate();
9078 CmpInst::Predicate AltP = AltCI->getPredicate();
9079 assert(MainP != AltP && "Expected different main/alternate predicates.");
9080 auto *CI = cast<CmpInst>(I);
9081 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9082 return false;
9083 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9084 return true;
9085 CmpInst::Predicate P = CI->getPredicate();
9086 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9087
9088 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9089 "CmpInst expected to match either main or alternate predicate or "
9090 "their swap.");
9091 (void)AltP;
9092 return MainP != P && MainP != SwappedP;
9093 }
9094 return I->getOpcode() == AltOp->getOpcode();
9095}
9096
9097TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9098 assert(!Ops.empty());
9099 const auto *Op0 = Ops.front();
9100
9101 const bool IsConstant = all_of(Ops, [](Value *V) {
9102 // TODO: We should allow undef elements here
9103 return isConstant(V) && !isa<UndefValue>(V);
9104 });
9105 const bool IsUniform = all_of(Ops, [=](Value *V) {
9106 // TODO: We should allow undef elements here
9107 return V == Op0;
9108 });
9109 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9110 // TODO: We should allow undef elements here
9111 if (auto *CI = dyn_cast<ConstantInt>(V))
9112 return CI->getValue().isPowerOf2();
9113 return false;
9114 });
9115 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9116 // TODO: We should allow undef elements here
9117 if (auto *CI = dyn_cast<ConstantInt>(V))
9118 return CI->getValue().isNegatedPowerOf2();
9119 return false;
9120 });
9121
9122 TTI::OperandValueKind VK = TTI::OK_AnyValue;
9123 if (IsConstant && IsUniform)
9124 VK = TTI::OK_UniformConstantValue;
9125 else if (IsConstant)
9126 VK = TTI::OK_NonUniformConstantValue;
9127 else if (IsUniform)
9128 VK = TTI::OK_UniformValue;
9129
9130 TTI::OperandValueProperties VP = TTI::OP_None;
9131 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9132 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9133
9134 return {VK, VP};
9135}
9136
9137namespace {
9138/// The base class for shuffle instruction emission and shuffle cost estimation.
9139class BaseShuffleAnalysis {
9140protected:
9141 Type *ScalarTy = nullptr;
9142
9143 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9144
9145 /// V is expected to be a vectorized value.
9146 /// When REVEC is disabled, there is no difference between VF and
9147 /// VNumElements.
9148 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9149 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9150 /// of 8.
9151 unsigned getVF(Value *V) const {
9152 assert(V && "V cannot be nullptr");
9153 assert(isa<FixedVectorType>(V->getType()) &&
9154 "V does not have FixedVectorType");
9155 assert(ScalarTy && "ScalarTy cannot be nullptr");
9156 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9157 unsigned VNumElements =
9158 cast<FixedVectorType>(V->getType())->getNumElements();
9159 assert(VNumElements > ScalarTyNumElements &&
9160 "the number of elements of V is not large enough");
9161 assert(VNumElements % ScalarTyNumElements == 0 &&
9162 "the number of elements of V is not a vectorized value");
9163 return VNumElements / ScalarTyNumElements;
9164 }
9165
9166 /// Checks if the mask is an identity mask.
9167 /// \param IsStrict if true, the function returns false if the mask size does
9168 /// not match vector size.
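/// For example, for a <4 x Ty> vector the mask <0, 1, 2, 3> is an identity
/// mask in both modes, while the shorter mask <0, 1> (an extract of the
/// leading subvector) counts as identity only when \p IsStrict is false.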
9169 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9170 bool IsStrict) {
9171 int Limit = Mask.size();
9172 int VF = VecTy->getNumElements();
9173 int Index = -1;
9174 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9175 return true;
9176 if (!IsStrict) {
9177 // Consider extract subvector starting from index 0.
9178 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9179 Index == 0)
9180 return true;
9181 // All VF-size submasks are identity (e.g.
9182 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9183 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9184 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9185 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9186 ShuffleVectorInst::isIdentityMask(Slice, VF);
9187 }))
9188 return true;
9189 }
9190 return false;
9191 }
9192
9193 /// Tries to combine 2 different masks into a single one.
9194 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9195 /// change the size of the vector, \p LocalVF is the original size of the
9196 /// shuffled vector.
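/// For example, with \p LocalVF 4, Mask <1, 0, 3, 2> and \p ExtMask
/// <2, 3, poison, 0>, the combined mask becomes <3, 2, poison, 1>: each
/// defined element I of \p ExtMask selects Mask[ExtMask[I] % Mask.size()],
/// reduced modulo \p LocalVF.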
9197 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9198 ArrayRef<int> ExtMask) {
9199 unsigned VF = Mask.size();
9200 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9201 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9202 if (ExtMask[I] == PoisonMaskElem)
9203 continue;
9204 int MaskedIdx = Mask[ExtMask[I] % VF];
9205 NewMask[I] =
9206 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9207 }
9208 Mask.swap(NewMask);
9209 }
9210
9211 /// Looks through shuffles trying to reduce final number of shuffles in the
9212 /// code. The function looks through the previously emitted shuffle
9213 /// instructions and properly marks indices in the mask as undef.
9214 /// For example, given the code
9215 /// \code
9216 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9217 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9218 /// \endcode
9219 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9220 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9221 /// <0, 1, 2, 3> for the shuffle.
9222 /// If 2 operands are of different size, the smallest one will be resized and
9223 /// the mask recalculated properly.
9224 /// For example, given the code
9225 /// \code
9226 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9227 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9228 /// \endcode
9229 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9230 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9231 /// <0, 1, 2, 3> for the shuffle.
9232 /// So, it tries to transform permutations to simple vector merge, if
9233 /// possible.
9234 /// \param V The input vector which must be shuffled using the given \p Mask.
9235 /// If the better candidate is found, \p V is set to this best candidate
9236 /// vector.
9237 /// \param Mask The input mask for the shuffle. If the best candidate is found
9238 /// during looking-through-shuffles attempt, it is updated accordingly.
9239 /// \param SinglePermute true if the shuffle operation is originally a
9240 /// single-value-permutation. In this case the look-through-shuffles procedure
9241 /// may look for resizing shuffles as the best candidates.
9242 /// \return true if the shuffle results in the non-resizing identity shuffle
9243 /// (and thus can be ignored), false - otherwise.
9244 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9245 bool SinglePermute) {
9246 Value *Op = V;
9247 ShuffleVectorInst *IdentityOp = nullptr;
9248 SmallVector<int> IdentityMask;
9249 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9250 // Exit if not a fixed vector type or changing size shuffle.
9251 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9252 if (!SVTy)
9253 break;
9254 // Remember the identity or broadcast mask, if it is not a resizing
9255 // shuffle. If no better candidates are found, this Op and Mask will be
9256 // used in the final shuffle.
9257 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9258 if (!IdentityOp || !SinglePermute ||
9259 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9260 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9261 IdentityMask.size()))) {
9262 IdentityOp = SV;
9263 // Store the current mask in IdentityMask so that we do not lose this
9264 // info later if IdentityOp is selected as the best candidate for the
9265 // permutation.
9266 IdentityMask.assign(Mask);
9267 }
9268 }
9269 // Remember the broadcast mask. If no better candidates are found, this Op
9270 // and Mask will be used in the final shuffle.
9271 // Zero splat can be used as identity too, since it might be used with
9272 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9273 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9274 // expensive, and the analysis finds out that the source vector is just a
9275 // broadcast, this original mask can be transformed to the identity mask <0,
9276 // 1, 2, 3>.
9277 // \code
9278 // %0 = shuffle %v, poison, zeroinitializer
9279 // %res = shuffle %0, poison, <3, 1, 2, 0>
9280 // \endcode
9281 // may be transformed to
9282 // \code
9283 // %0 = shuffle %v, poison, zeroinitializer
9284 // %res = shuffle %0, poison, <0, 1, 2, 3>
9285 // \endcode
9286 if (SV->isZeroEltSplat()) {
9287 IdentityOp = SV;
9288 IdentityMask.assign(Mask);
9289 }
9290 int LocalVF = Mask.size();
9291 if (auto *SVOpTy =
9292 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9293 LocalVF = SVOpTy->getNumElements();
9294 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9295 for (auto [Idx, I] : enumerate(Mask)) {
9296 if (I == PoisonMaskElem ||
9297 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9298 continue;
9299 ExtMask[Idx] = SV->getMaskValue(I);
9300 }
9301 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9302 SV->getOperand(0),
9303 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9304 .all();
9305 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9306 SV->getOperand(1),
9307 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9308 .all();
9309 if (!IsOp1Undef && !IsOp2Undef) {
9310 // Update mask and mark undef elems.
9311 for (int &I : Mask) {
9312 if (I == PoisonMaskElem)
9313 continue;
9314 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9315 PoisonMaskElem)
9316 I = PoisonMaskElem;
9317 }
9318 break;
9319 }
9320 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9321 combineMasks(LocalVF, ShuffleMask, Mask);
9322 Mask.swap(ShuffleMask);
9323 if (IsOp2Undef)
9324 Op = SV->getOperand(0);
9325 else
9326 Op = SV->getOperand(1);
9327 }
9328 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9329 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9330 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9331 if (IdentityOp) {
9332 V = IdentityOp;
9333 assert(Mask.size() == IdentityMask.size() &&
9334 "Expected masks of same sizes.");
9335 // Clear known poison elements.
9336 for (auto [I, Idx] : enumerate(Mask))
9337 if (Idx == PoisonMaskElem)
9338 IdentityMask[I] = PoisonMaskElem;
9339 Mask.swap(IdentityMask);
9340 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9341 return SinglePermute &&
9342 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9343 /*IsStrict=*/true) ||
9344 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9345 Shuffle->isZeroEltSplat() &&
9346 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9347 }
9348 V = Op;
9349 return false;
9350 }
9351 V = Op;
9352 return true;
9353 }
9354
9355 /// Smart shuffle instruction emission, walks through shuffles trees and
9356 /// tries to find the best matching vector for the actual shuffle
9357 /// instruction.
9358 template <typename T, typename ShuffleBuilderTy>
9359 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9360 ShuffleBuilderTy &Builder) {
9361 assert(V1 && "Expected at least one vector value.");
9362 if (V2)
9363 Builder.resizeToMatch(V1, V2);
9364 int VF = Mask.size();
9365 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9366 VF = FTy->getNumElements();
9367 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9368 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9369 .all()) {
9370 // Peek through shuffles.
9371 Value *Op1 = V1;
9372 Value *Op2 = V2;
9373 int VF =
9374 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9375 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9376 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9377 for (int I = 0, E = Mask.size(); I < E; ++I) {
9378 if (Mask[I] < VF)
9379 CombinedMask1[I] = Mask[I];
9380 else
9381 CombinedMask2[I] = Mask[I] - VF;
9382 }
9383 Value *PrevOp1;
9384 Value *PrevOp2;
9385 do {
9386 PrevOp1 = Op1;
9387 PrevOp2 = Op2;
9388 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9389 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9390 // Check if we have 2 resizing shuffles - need to peek through operands
9391 // again.
9392 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9393 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9394 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9395 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9396 if (I == PoisonMaskElem)
9397 continue;
9398 ExtMask1[Idx] = SV1->getMaskValue(I);
9399 }
9400 SmallBitVector UseMask1 = buildUseMask(
9401 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9402 ->getNumElements(),
9403 ExtMask1, UseMask::SecondArg);
9404 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9405 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9406 if (I == PoisonMaskElem)
9407 continue;
9408 ExtMask2[Idx] = SV2->getMaskValue(I);
9409 }
9410 SmallBitVector UseMask2 = buildUseMask(
9411 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9412 ->getNumElements(),
9413 ExtMask2, UseMask::SecondArg);
9414 if (SV1->getOperand(0)->getType() ==
9415 SV2->getOperand(0)->getType() &&
9416 SV1->getOperand(0)->getType() != SV1->getType() &&
9417 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9418 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9419 Op1 = SV1->getOperand(0);
9420 Op2 = SV2->getOperand(0);
9421 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9422 int LocalVF = ShuffleMask1.size();
9423 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9424 LocalVF = FTy->getNumElements();
9425 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9426 CombinedMask1.swap(ShuffleMask1);
9427 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9428 LocalVF = ShuffleMask2.size();
9429 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9430 LocalVF = FTy->getNumElements();
9431 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9432 CombinedMask2.swap(ShuffleMask2);
9433 }
9434 }
9435 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9436 Builder.resizeToMatch(Op1, Op2);
9437 VF = std::max(cast<VectorType>(Op1->getType())
9438 ->getElementCount()
9439 .getKnownMinValue(),
9440 cast<VectorType>(Op2->getType())
9441 ->getElementCount()
9442 .getKnownMinValue());
9443 for (int I = 0, E = Mask.size(); I < E; ++I) {
9444 if (CombinedMask2[I] != PoisonMaskElem) {
9445 assert(CombinedMask1[I] == PoisonMaskElem &&
9446 "Expected undefined mask element");
9447 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9448 }
9449 }
9450 if (Op1 == Op2 &&
9451 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9452 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9453 isa<ShuffleVectorInst>(Op1) &&
9454 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9455 ArrayRef(CombinedMask1))))
9456 return Builder.createIdentity(Op1);
9457 return Builder.createShuffleVector(
9458 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9459 CombinedMask1);
9460 }
9461 if (isa<PoisonValue>(V1))
9462 return Builder.createPoison(
9463 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9464 SmallVector<int> NewMask(Mask);
9465 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9466 assert(V1 && "Expected non-null value after looking through shuffles.");
9467
9468 if (!IsIdentity)
9469 return Builder.createShuffleVector(V1, NewMask);
9470 return Builder.createIdentity(V1);
9471 }
9472};
9473} // namespace
9474
9475 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
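/// For the load/store case the scalar cost models the whole chain of pointers
/// feeding the scalar accesses, while the vector cost keeps only the pointers
/// expected to remain in the vectorized code (typically just \p BasePtr).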
9476static std::pair<InstructionCost, InstructionCost>
9477 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9478 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9479 Type *ScalarTy, VectorType *VecTy) {
9480 InstructionCost ScalarCost = 0;
9481 InstructionCost VecCost = 0;
9482 // Here we differentiate two cases: (1) when Ptrs represent a regular
9483 // vectorization tree node (as they are pointer arguments of scattered
9484 // loads) or (2) when Ptrs are the arguments of loads or stores being
9485 // vectorized as plain wide unit-stride load/store since all the
9486 // loads/stores are known to be from/to adjacent locations.
9487 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9488 // Case 2: estimate costs for pointer related costs when vectorizing to
9489 // a wide load/store.
9490 // Scalar cost is estimated as a set of pointers with known relationship
9491 // between them.
9492 // For vector code we will use BasePtr as argument for the wide load/store
9493 // but we also need to account all the instructions which are going to
9494 // stay in vectorized code due to uses outside of these scalar
9495 // loads/stores.
9496 ScalarCost = TTI.getPointersChainCost(
9497 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9498 CostKind);
9499
9500 SmallVector<const Value *> PtrsRetainedInVecCode;
9501 for (Value *V : Ptrs) {
9502 if (V == BasePtr) {
9503 PtrsRetainedInVecCode.push_back(V);
9504 continue;
9505 }
9506 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9507 // For simplicity assume Ptr to stay in vectorized code if it's not a
9508 // GEP instruction. We don't care since its cost is considered free.
9509 // TODO: We should check for any uses outside of vectorizable tree
9510 // rather than just single use.
9511 if (!Ptr || !Ptr->hasOneUse())
9512 PtrsRetainedInVecCode.push_back(V);
9513 }
9514
9515 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9516 // If all pointers stay in vectorized code then we don't have
9517 // any savings on that.
9518 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9519 }
9520 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9521 TTI::PointersChainInfo::getKnownStride(),
9522 VecTy, CostKind);
9523 } else {
9524 // Case 1: Ptrs are the arguments of loads that we are going to transform
9525 // into masked gather load intrinsic.
9526 // All the scalar GEPs will be removed as a result of vectorization.
9527 // For any external uses of some lanes, extract element instructions will
9528 // be generated (whose cost is estimated separately).
9529 TTI::PointersChainInfo PtrsInfo =
9530 all_of(Ptrs,
9531 [](const Value *V) {
9532 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9533 return Ptr && !Ptr->hasAllConstantIndices();
9534 })
9535 ? TTI::PointersChainInfo::getUnknownStride()
9536 : TTI::PointersChainInfo::getKnownStride();
9537
9538 ScalarCost =
9539 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9540 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9541 if (!BaseGEP) {
9542 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9543 if (It != Ptrs.end())
9544 BaseGEP = cast<GEPOperator>(*It);
9545 }
9546 if (BaseGEP) {
9547 SmallVector<const Value *> Indices(BaseGEP->indices());
9548 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9549 BaseGEP->getPointerOperand(), Indices, VecTy,
9550 CostKind);
9551 }
9552 }
9553
9554 return std::make_pair(ScalarCost, VecCost);
9555}
9556
9557void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9558 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9559 "Expected gather node without reordering.");
9560 SmallDenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
9561 SmallSet<size_t, 2> LoadKeyUsed;
9562
9563 // Do not reorder nodes if they are small (just 2 elements), all-constant, or
9564 // if all instructions already have the same opcode.
9565 if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
9566 all_of(TE.Scalars, isConstant))
9567 return;
9568
9569 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9570 return VectorizableTree[Idx]->isSame(TE.Scalars);
9571 }))
9572 return;
9573
9574 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9575 Key = hash_combine(hash_value(LI->getParent()), Key);
9576 Value *Ptr =
9577 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9578 if (LoadKeyUsed.contains(Key)) {
9579 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9580 if (LIt != LoadsMap.end()) {
9581 for (LoadInst *RLI : LIt->second) {
9582 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9583 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9584 /*StrictCheck=*/true))
9585 return hash_value(RLI->getPointerOperand());
9586 }
9587 for (LoadInst *RLI : LIt->second) {
9588 if (arePointersCompatible(RLI->getPointerOperand(),
9589 LI->getPointerOperand(), *TLI)) {
9590 hash_code SubKey = hash_value(RLI->getPointerOperand());
9591 return SubKey;
9592 }
9593 }
9594 if (LIt->second.size() > 2) {
9595 hash_code SubKey =
9596 hash_value(LIt->second.back()->getPointerOperand());
9597 return SubKey;
9598 }
9599 }
9600 }
9601 LoadKeyUsed.insert(Key);
9602 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9603 return hash_value(LI->getPointerOperand());
9604 };
9605 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9606 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
9607 bool IsOrdered = true;
9608 unsigned NumInstructions = 0;
9609 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9610 // nodes.
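// E.g. a gather like <x+y, c1, z+w, c2> may be reordered to
// <x+y, z+w, c1, c2> so that the two additions become adjacent and can
// later form an extra vectorizable sub-node (recorded in SubVectors).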
9611 for (auto [I, V] : enumerate(TE.Scalars)) {
9612 size_t Key = 1, Idx = 1;
9613 if (auto *Inst = dyn_cast<Instruction>(V);
9614 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9615 !isDeleted(Inst) && !isVectorized(V)) {
9616 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9617 /*AllowAlternate=*/false);
9618 ++NumInstructions;
9619 }
9620 auto &Container = SortedValues[Key];
9621 if (IsOrdered && !KeyToIndex.contains(V) &&
9622 !(isa<Constant, ExtractElementInst>(V) ||
9623 isVectorLikeInstWithConstOps(V)) &&
9624 ((Container.contains(Idx) &&
9625 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9626 (!Container.empty() && !Container.contains(Idx) &&
9627 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9628 IsOrdered = false;
9629 auto &KTI = KeyToIndex[V];
9630 if (KTI.empty())
9631 Container[Idx].push_back(V);
9632 KTI.push_back(I);
9633 }
9634 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9635 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9636 if (!IsOrdered && NumInstructions > 1) {
9637 unsigned Cnt = 0;
9638 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9639 for (const auto &D : SortedValues) {
9640 for (const auto &P : D.second) {
9641 unsigned Sz = 0;
9642 for (Value *V : P.second) {
9643 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9644 for (auto [K, Idx] : enumerate(Indices)) {
9645 TE.ReorderIndices[Cnt + K] = Idx;
9646 TE.Scalars[Cnt + K] = V;
9647 }
9648 Sz += Indices.size();
9649 Cnt += Indices.size();
9650 }
9651 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9652 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9653 *TTI, TE.Scalars.front()->getType(), Sz);
9654 SubVectors.emplace_back(Cnt - Sz, SubVF);
9655 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9656 DemandedElts.clearBit(I);
9657 } else if (!P.second.empty() && isConstant(P.second.front())) {
9658 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9659 DemandedElts.clearBit(I);
9660 }
9661 }
9662 }
9663 }
9664 // Reuses always require shuffles, so consider it as profitable.
9665 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9666 return;
9667 // Do simple cost estimation.
9668 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9669 InstructionCost Cost = 0;
9670 auto *ScalarTy = TE.Scalars.front()->getType();
9671 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9672 for (auto [Idx, Sz] : SubVectors) {
9673 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9674 Idx, getWidenedType(ScalarTy, Sz));
9675 }
9676 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9677 assert(SLPReVec && "Only supported by REVEC.");
9678 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9679 // of CreateInsertElement.
9680 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9681 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9682 if (DemandedElts[I])
9683 Cost +=
9684 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9685 CostKind, I * ScalarTyNumElements, FTy);
9686 } else {
9687 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9688 /*Extract=*/false, CostKind);
9689 }
9690 int Sz = TE.Scalars.size();
9691 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9692 TE.ReorderIndices.end());
9693 for (unsigned I : seq<unsigned>(Sz)) {
9694 Value *V = TE.getOrdered(I);
9695 if (isa<PoisonValue>(V)) {
9696 ReorderMask[I] = PoisonMaskElem;
9697 } else if (isConstant(V) || DemandedElts[I]) {
9698 ReorderMask[I] = I + TE.ReorderIndices.size();
9699 }
9700 }
9701 Cost += ::getShuffleCost(*TTI,
9702 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9703 ? TTI::SK_PermuteTwoSrc
9704 : TTI::SK_PermuteSingleSrc,
9705 VecTy, ReorderMask);
9706 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9707 ReorderMask.assign(Sz, PoisonMaskElem);
9708 for (unsigned I : seq<unsigned>(Sz)) {
9709 Value *V = TE.getOrdered(I);
9710 if (isConstant(V)) {
9711 DemandedElts.clearBit(I);
9712 if (!isa<PoisonValue>(V))
9713 ReorderMask[I] = I;
9714 } else {
9715 ReorderMask[I] = I + Sz;
9716 }
9717 }
9718 InstructionCost BVCost = TTI->getScalarizationOverhead(
9719 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9720 if (!DemandedElts.isAllOnes())
9721 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9722 if (Cost >= BVCost) {
9723 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9724 reorderScalars(TE.Scalars, Mask);
9725 TE.ReorderIndices.clear();
9726 }
9727}
9728
9729 void BoUpSLP::transformNodes() {
9730 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9731 BaseGraphSize = VectorizableTree.size();
9732 // Turn graph transforming mode on and off, when done.
9733 class GraphTransformModeRAAI {
9734 bool &SavedIsGraphTransformMode;
9735
9736 public:
9737 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9738 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9739 IsGraphTransformMode = true;
9740 }
9741 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9742 } TransformContext(IsGraphTransformMode);
9743 // Operands are profitable if they are:
9744 // 1. At least one constant
9745 // or
9746 // 2. Splats
9747 // or
9748 // 3. Results in good vectorization opportunity, i.e. may generate vector
9749 // nodes and reduce cost of the graph.
9750 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9751 const InstructionsState &S) {
9752 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9753 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9754 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9755 I2->getOperand(Op));
9756 return all_of(
9757 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9758 return all_of(Cand,
9759 [](const std::pair<Value *, Value *> &P) {
9760 return isa<Constant>(P.first) ||
9761 isa<Constant>(P.second) || P.first == P.second;
9762 }) ||
9763 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9764 });
9765 };
9766
9767 // Try to reorder gather nodes for better vectorization opportunities.
9768 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9769 TreeEntry &E = *VectorizableTree[Idx];
9770 if (E.isGather())
9771 reorderGatherNode(E);
9772 }
9773
9774 // The tree may grow here, so iterate over the nodes built before.
9775 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9776 TreeEntry &E = *VectorizableTree[Idx];
9777 if (E.isGather()) {
9778 ArrayRef<Value *> VL = E.Scalars;
9779 const unsigned Sz = getVectorElementSize(VL.front());
9780 unsigned MinVF = getMinVF(2 * Sz);
9781 // Do not try partial vectorization for small nodes (<= 2), nodes with the
9782 // same opcode and same parent block or all constants.
9783 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9784 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9785 E.isAltShuffle() || !allSameBlock(VL)) ||
9786 allConstant(VL) || isSplat(VL))
9787 continue;
9788 // Try to find vectorizable sequences and transform them into a series of
9789 // insertvector instructions.
9790 unsigned StartIdx = 0;
9791 unsigned End = VL.size();
9792 for (unsigned VF = getFloorFullVectorNumberOfElements(
9793 *TTI, VL.front()->getType(), VL.size() - 1);
9794 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9795 *TTI, VL.front()->getType(), VF - 1)) {
9796 if (StartIdx + VF > End)
9797 continue;
9798 SmallVector<std::pair<unsigned, unsigned>> Slices;
9799 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9800 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9801 // If any instruction is vectorized already - do not try again.
9802 // Reuse the existing node, if it fully matches the slice.
9803 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9804 SE || getTreeEntry(Slice.back())) {
9805 if (!SE)
9806 continue;
9807 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9808 continue;
9809 }
9810 // Constant already handled effectively - skip.
9811 if (allConstant(Slice))
9812 continue;
9813 // Do not try to vectorize small splats (less than vector register and
9814 // only with the single non-undef element).
9815 bool IsSplat = isSplat(Slice);
9816 if (Slices.empty() || !IsSplat ||
9817 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9818 Slice.front()->getType(), VF)),
9819 1U, VF - 1) !=
9820 std::clamp(TTI->getNumberOfParts(getWidenedType(
9821 Slice.front()->getType(), 2 * VF)),
9822 1U, 2 * VF)) ||
9823 count(Slice, Slice.front()) ==
9824 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9825 : 1)) {
9826 if (IsSplat)
9827 continue;
9828 InstructionsState S = getSameOpcode(Slice, *TLI);
9829 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9830 (S.getOpcode() == Instruction::Load &&
9831 areKnownNonVectorizableLoads(Slice)) ||
9832 (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9833 continue;
9834 if (VF == 2) {
9835 // Try to vectorize reduced values or if all users are vectorized.
9836 // For expensive instructions extra extracts might be profitable.
9837 if ((!UserIgnoreList || E.Idx != 0) &&
9838 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9839 TTI::TCC_Expensive &&
9840 !all_of(Slice, [&](Value *V) {
9841 if (isa<PoisonValue>(V))
9842 return true;
9843 return areAllUsersVectorized(cast<Instruction>(V),
9844 UserIgnoreList);
9845 }))
9846 continue;
9847 if (S.getOpcode() == Instruction::Load) {
9848 OrdersType Order;
9849 SmallVector<Value *> PointerOps;
9850 LoadsState Res =
9851 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9852 // Do not vectorize gathers.
9853 if (Res == LoadsState::ScatterVectorize ||
9854 Res == LoadsState::Gather) {
9855 if (Res == LoadsState::Gather) {
9856 registerNonVectorizableLoads(Slice);
9857 // If reductions and the scalars from the root node are
9858 // analyzed - mark as non-vectorizable reduction.
9859 if (UserIgnoreList && E.Idx == 0)
9860 analyzedReductionVals(Slice);
9861 }
9862 continue;
9863 }
9864 } else if (S.getOpcode() == Instruction::ExtractElement ||
9865 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9866 TTI::TCC_Expensive &&
9867 !CheckOperandsProfitability(
9868 S.getMainOp(),
9869 cast<Instruction>(*find_if(reverse(Slice),
9870 IsaPred<Instruction>)),
9871 S))) {
9872 // Do not vectorize extractelements (handled effectively
9873 // already). Do not vectorize non-profitable instructions (with
9874 // low cost and non-vectorizable operands).
9875 continue;
9876 }
9877 }
9878 }
9879 Slices.emplace_back(Cnt, Slice.size());
9880 }
9881 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9882 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9883 if (StartIdx == Cnt)
9884 StartIdx = Cnt + Sz;
9885 if (End == Cnt + Sz)
9886 End = Cnt;
9887 };
9888 for (auto [Cnt, Sz] : Slices) {
9889 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9890 // If any instruction is vectorized already - do not try again.
9891 if (TreeEntry *SE = getTreeEntry(Slice.front());
9892 SE || getTreeEntry(Slice.back())) {
9893 if (!SE)
9894 continue;
9895 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9896 continue;
9897 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9898 AddCombinedNode(SE->Idx, Cnt, Sz);
9899 continue;
9900 }
9901 unsigned PrevSize = VectorizableTree.size();
9902 [[maybe_unused]] unsigned PrevEntriesSize =
9903 LoadEntriesToVectorize.size();
9904 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9905 if (PrevSize + 1 == VectorizableTree.size() &&
9906 VectorizableTree[PrevSize]->isGather() &&
9907 VectorizableTree[PrevSize]->getOpcode() !=
9908 Instruction::ExtractElement &&
9909 !isSplat(Slice)) {
9910 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9911 analyzedReductionVals(Slice);
9912 VectorizableTree.pop_back();
9913 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9914 "LoadEntriesToVectorize expected to remain the same");
9915 continue;
9916 }
9917 AddCombinedNode(PrevSize, Cnt, Sz);
9918 }
9919 }
9920 // Restore ordering, if no extra vectorization happened.
9921 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9922 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9923 reorderScalars(E.Scalars, Mask);
9924 E.ReorderIndices.clear();
9925 }
9926 }
9927 switch (E.getOpcode()) {
9928 case Instruction::Load: {
9929 // No need to reorder masked gather loads, just reorder the scalar
9930 // operands.
9931 if (E.State != TreeEntry::Vectorize)
9932 break;
9933 Type *ScalarTy = E.getMainOp()->getType();
9934 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9935 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9936 // Check if profitable to represent consecutive load + reverse as strided
9937 // load with stride -1.
9938 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9939 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9940 SmallVector<int> Mask;
9941 inversePermutation(E.ReorderIndices, Mask);
9942 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9943 InstructionCost OriginalVecCost =
9944 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9949 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9950 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9951 if (StridedCost < OriginalVecCost)
9952 // Strided load is more profitable than consecutive load + reverse -
9953 // transform the node to strided load.
9954 E.State = TreeEntry::StridedVectorize;
9955 }
9956 break;
9957 }
9958 case Instruction::Store: {
9959 Type *ScalarTy =
9960 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9961 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9962 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9963 // Check if profitable to represent consecutive stores + reverse as a
9964 // strided store with stride -1.
9965 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9966 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9967 SmallVector<int> Mask;
9968 inversePermutation(E.ReorderIndices, Mask);
9969 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9970 InstructionCost OriginalVecCost =
9971 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9976 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9977 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
9978 if (StridedCost < OriginalVecCost)
9979 // Strided store is more profitable than reverse + consecutive store -
9980 // transform the node to strided store.
9981 E.State = TreeEntry::StridedVectorize;
9982 } else if (!E.ReorderIndices.empty()) {
9983 // Check for interleaved stores.
9984 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
9985 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9986 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
9987 if (Mask.size() < 4)
9988 return 0u;
9989 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9990 if (ShuffleVectorInst::isInterleaveMask(
9991 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
9992 TTI.isLegalInterleavedAccessType(
9993 VecTy, Factor, BaseSI->getAlign(),
9994 BaseSI->getPointerAddressSpace()))
9995 return Factor;
9996 }
9997
9998 return 0u;
9999 };
10000 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10001 unsigned InterleaveFactor = IsInterleaveMask(Mask);
10002 if (InterleaveFactor != 0)
10003 E.setInterleave(InterleaveFactor);
10004 }
10005 break;
10006 }
10007 case Instruction::Select: {
10008 if (E.State != TreeEntry::Vectorize)
10009 break;
10010 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
10011 if (MinMaxID == Intrinsic::not_intrinsic)
10012 break;
10013 // This node is a minmax node.
10014 E.CombinedOp = TreeEntry::MinMax;
10015 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10016 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10017 CondEntry->State == TreeEntry::Vectorize) {
10018 // The condition node is part of the combined minmax node.
10019 CondEntry->State = TreeEntry::CombinedVectorize;
10020 }
10021 break;
10022 }
10023 default:
10024 break;
10025 }
10026 }
10027
10028 if (LoadEntriesToVectorize.empty()) {
10029 // Single load node - exit.
10030 if (VectorizableTree.size() <= 1 &&
10031 VectorizableTree.front()->getOpcode() == Instruction::Load)
10032 return;
10033 // Small graph with small VF - exit.
10034 constexpr unsigned SmallTree = 3;
10035 constexpr unsigned SmallVF = 2;
10036 if ((VectorizableTree.size() <= SmallTree &&
10037 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10038 (VectorizableTree.size() <= 2 && UserIgnoreList))
10039 return;
10040
10041 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10042 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
10043 getCanonicalGraphSize() <= SmallTree &&
10044 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10045 [](const std::unique_ptr<TreeEntry> &TE) {
10046 return TE->isGather() &&
10047 TE->getOpcode() == Instruction::Load &&
10048 !allSameBlock(TE->Scalars);
10049 }) == 1)
10050 return;
10051 }
10052
10053 // A list of loads to be gathered during the vectorization process. We can
10054 // try to vectorize them at the end, if profitable.
10055 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
10056 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
10057 GatheredLoads;
10058
10059 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10060 TreeEntry &E = *TE;
10061 if (E.isGather() &&
10062 (E.getOpcode() == Instruction::Load ||
10063 (!E.getOpcode() && any_of(E.Scalars,
10064 [&](Value *V) {
10065 return isa<LoadInst>(V) &&
10066 !isVectorized(V) &&
10067 !isDeleted(cast<Instruction>(V));
10068 }))) &&
10069 !isSplat(E.Scalars)) {
10070 for (Value *V : E.Scalars) {
10071 auto *LI = dyn_cast<LoadInst>(V);
10072 if (!LI)
10073 continue;
10074 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10075 continue;
10076 gatherPossiblyVectorizableLoads(
10077 *this, V, *DL, *SE, *TTI,
10078 GatheredLoads[std::make_tuple(
10079 LI->getParent(),
10080 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
10081 LI->getType())]);
10082 }
10083 }
10084 }
10085 // Try to vectorize gathered loads if this is not just a gather of loads.
10086 if (!GatheredLoads.empty())
10087 tryToVectorizeGatheredLoads(GatheredLoads);
10088}
10089
10090/// Merges shuffle masks and emits final shuffle instruction, if required. It
10091 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
10092 /// where the actual shuffle instruction is generated only if it is actually
10093/// required. Otherwise, the shuffle instruction emission is delayed till the
10094/// end of the process, to reduce the number of emitted instructions and further
10095/// analysis/transformations.
10096class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10097 bool IsFinalized = false;
10098 SmallVector<int> CommonMask;
10099 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10100 const TargetTransformInfo &TTI;
10101 InstructionCost Cost = 0;
10102 SmallDenseSet<Value *> VectorizedVals;
10103 BoUpSLP &R;
10104 SmallPtrSetImpl<Value *> &CheckedExtracts;
10105 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10106 /// While set, we are still trying to estimate the cost for the same nodes and
10107 /// can delay the actual cost estimation (virtual shuffle instruction emission).
10108 /// This may help to better estimate the cost if the same nodes must be permuted
10109 /// and allows moving most of the long shuffle cost estimation to TTI.
10110 bool SameNodesEstimated = true;
10111
10112 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10113 if (Ty->getScalarType()->isPointerTy()) {
10114 Constant *Res = ConstantExpr::getIntToPtr(
10115 ConstantInt::getAllOnesValue(
10116 IntegerType::get(Ty->getContext(),
10117 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10118 Ty->getScalarType());
10119 if (auto *VTy = dyn_cast<VectorType>(Ty))
10120 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10121 return Res;
10122 }
10123 return Constant::getAllOnesValue(Ty);
10124 }
10125
10126 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10127 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10128 return TTI::TCC_Free;
10129 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10130 InstructionCost GatherCost = 0;
10131 SmallVector<Value *> Gathers(VL);
10132 if (!Root && isSplat(VL)) {
10133 // Found the broadcasting of the single scalar, calculate the cost as
10134 // the broadcast.
10135 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10136 assert(It != VL.end() && "Expected at least one non-undef value.");
10137 // Add broadcast for non-identity shuffle only.
10138 bool NeedShuffle =
10139 count(VL, *It) > 1 &&
10140 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
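// In effect, a single insertelement suffices when the non-undef scalar
// occurs only once; otherwise the insert is followed by a broadcast
// shuffle, costed below.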
10141 if (!NeedShuffle) {
10142 if (isa<FixedVectorType>(ScalarTy)) {
10143 assert(SLPReVec && "FixedVectorType is not expected.");
10144 return TTI.getShuffleCost(
10145 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10146 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10147 cast<FixedVectorType>(ScalarTy));
10148 }
10149 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10150 CostKind, std::distance(VL.begin(), It),
10151 PoisonValue::get(VecTy), *It);
10152 }
10153
10154 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10155 transform(VL, ShuffleMask.begin(), [](Value *V) {
10156 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10157 });
10158 InstructionCost InsertCost =
10159 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10160 PoisonValue::get(VecTy), *It);
10161 return InsertCost + ::getShuffleCost(TTI,
10162 TTI::SK_Broadcast,
10163 VecTy, ShuffleMask, CostKind,
10164 /*Index=*/0, /*SubTp=*/nullptr,
10165 /*Args=*/*It);
10166 }
10167 return GatherCost +
10168 (all_of(Gathers, IsaPred<UndefValue>)
10169 ? TTI::TCC_Free
10170 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10171 ScalarTy));
10172 };
10173
10174 /// Compute the cost of creating a vector containing the extracted values from
10175 /// \p VL.
10176 InstructionCost
10177 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10178 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10179 unsigned NumParts) {
10180 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10181 unsigned NumElts =
10182 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10183 auto *EE = dyn_cast<ExtractElementInst>(V);
10184 if (!EE)
10185 return Sz;
10186 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10187 if (!VecTy)
10188 return Sz;
10189 return std::max(Sz, VecTy->getNumElements());
10190 });
10191 // FIXME: this must be moved to TTI for better estimation.
10192 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
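// The helper below maps each mask element to a (source vector, register
// slice) id: it gives up (returns std::nullopt) if more than two distinct
// EltsPerVector-wide slices are referenced, otherwise it rewrites the mask
// as a one- or two-register shuffle.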
10193 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10194 SmallVectorImpl<unsigned> &Indices)
10195 -> std::optional<TTI::ShuffleKind> {
10196 if (NumElts <= EltsPerVector)
10197 return std::nullopt;
10198 int OffsetReg0 =
10199 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10200 [](int S, int I) {
10201 if (I == PoisonMaskElem)
10202 return S;
10203 return std::min(S, I);
10204 }),
10205 EltsPerVector);
10206 int OffsetReg1 = OffsetReg0;
10207 DenseSet<int> RegIndices;
10208 // Check if we are trying to permute the same single/2 input vectors.
10209 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10210 int FirstRegId = -1;
10211 Indices.assign(1, OffsetReg0);
10212 for (auto [Pos, I] : enumerate(Mask)) {
10213 if (I == PoisonMaskElem)
10214 continue;
10215 int Idx = I - OffsetReg0;
10216 int RegId =
10217 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10218 if (FirstRegId < 0)
10219 FirstRegId = RegId;
10220 RegIndices.insert(RegId);
10221 if (RegIndices.size() > 2)
10222 return std::nullopt;
10223 if (RegIndices.size() == 2) {
10224 ShuffleKind = TTI::SK_PermuteTwoSrc;
10225 if (Indices.size() == 1) {
10226 OffsetReg1 = alignDown(
10227 std::accumulate(
10228 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10229 [&](int S, int I) {
10230 if (I == PoisonMaskElem)
10231 return S;
10232 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10233 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10234 if (RegId == FirstRegId)
10235 return S;
10236 return std::min(S, I);
10237 }),
10238 EltsPerVector);
10239 Indices.push_back(OffsetReg1 % NumElts);
10240 }
10241 Idx = I - OffsetReg1;
10242 }
10243 I = (Idx % NumElts) % EltsPerVector +
10244 (RegId == FirstRegId ? 0 : EltsPerVector);
10245 }
10246 return ShuffleKind;
10247 };
10248 InstructionCost Cost = 0;
10249
10250 // Process extracts in blocks of EltsPerVector to check if the source vector
10251 // operand can be re-used directly. If not, add the cost of creating a
10252 // shuffle to extract the values into a vector register.
10253 for (unsigned Part : seq<unsigned>(NumParts)) {
10254 if (!ShuffleKinds[Part])
10255 continue;
10256 ArrayRef<int> MaskSlice = Mask.slice(
10257 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10258 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10259 copy(MaskSlice, SubMask.begin());
10260 SmallVector<unsigned, 2> Indices;
10261 std::optional<TTI::ShuffleKind> RegShuffleKind =
10262 CheckPerRegistersShuffle(SubMask, Indices);
10263 if (!RegShuffleKind) {
10264 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10265 !ShuffleVectorInst::isIdentityMask(
10266 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10267 Cost +=
10268 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10269 getWidenedType(ScalarTy, NumElts), MaskSlice);
10270 continue;
10271 }
10272 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10273 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10274 Cost +=
10275 ::getShuffleCost(TTI, *RegShuffleKind,
10276 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10277 }
10278 const unsigned BaseVF = getFullVectorNumberOfElements(
10279 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10280 for (unsigned Idx : Indices) {
10281 assert((Idx + EltsPerVector) <= BaseVF &&
10282 "SK_ExtractSubvector index out of range");
10283 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10284 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10285 Idx, getWidenedType(ScalarTy, EltsPerVector));
10286 }
10287 // Second attempt to check if just a permute is estimated to be cheaper
10288 // than a subvector extract.
10289 SubMask.assign(NumElts, PoisonMaskElem);
10290 copy(MaskSlice, SubMask.begin());
10291 InstructionCost OriginalCost = ::getShuffleCost(
10292 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10293 if (OriginalCost < Cost)
10294 Cost = OriginalCost;
10295 }
10296 return Cost;
10297 }
10298 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10299 /// shuffle emission.
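/// E.g. for \p Mask <2, poison, 0, 1>, lanes 0, 2 and 3 of \p CommonMask are
/// reset to 0, 2 and 3 respectively (identity references into the emitted
/// shuffle), while lane 1 keeps its previous value.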
10300 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10301 ArrayRef<int> Mask) {
10302 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10303 if (Mask[Idx] != PoisonMaskElem)
10304 CommonMask[Idx] = Idx;
10305 }
10306 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
10307 /// mask \p Mask and register number \p Part, which includes \p SliceSize
10308 /// elements.
10309 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10310 ArrayRef<int> Mask, unsigned Part,
10311 unsigned SliceSize) {
10312 if (SameNodesEstimated) {
10313 // Delay the cost estimation if the same nodes are being reshuffled.
10314 // If we already requested the cost of reshuffling of E1 and E2 before, no
10315 // need to estimate another cost with the sub-Mask, instead include this
10316 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10317 // estimation.
10318 if ((InVectors.size() == 2 &&
10319 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10320 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10321 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10322 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10323 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10324 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10325 "Expected all poisoned elements.");
10326 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10327 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10328 return;
10329 }
10330 // Found non-matching nodes - need to estimate the cost for the matched
10331 // and transform mask.
10332 Cost += createShuffle(InVectors.front(),
10333 InVectors.size() == 1 ? nullptr : InVectors.back(),
10334 CommonMask);
10335 transformMaskAfterShuffle(CommonMask, CommonMask);
10336 } else if (InVectors.size() == 2) {
10337 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10338 transformMaskAfterShuffle(CommonMask, CommonMask);
10339 }
10340 SameNodesEstimated = false;
10341 if (!E2 && InVectors.size() == 1) {
10342 unsigned VF = E1.getVectorFactor();
10343 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10344 VF = std::max(VF,
10345 cast<FixedVectorType>(V1->getType())->getNumElements());
10346 } else {
10347 const auto *E = cast<const TreeEntry *>(InVectors.front());
10348 VF = std::max(VF, E->getVectorFactor());
10349 }
10350 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10351 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10352 CommonMask[Idx] = Mask[Idx] + VF;
10353 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10354 transformMaskAfterShuffle(CommonMask, CommonMask);
10355 } else {
10356 auto P = InVectors.front();
10357 Cost += createShuffle(&E1, E2, Mask);
10358 unsigned VF = Mask.size();
10359 if (Value *V1 = P.dyn_cast<Value *>()) {
10360 VF = std::max(VF,
10361 getNumElements(V1->getType()));
10362 } else {
10363 const auto *E = cast<const TreeEntry *>(P);
10364 VF = std::max(VF, E->getVectorFactor());
10365 }
10366 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10367 if (Mask[Idx] != PoisonMaskElem)
10368 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10369 Cost += createShuffle(P, InVectors.front(), CommonMask);
10370 transformMaskAfterShuffle(CommonMask, CommonMask);
10371 }
10372 }
10373
10374 class ShuffleCostBuilder {
10375 const TargetTransformInfo &TTI;
10376
10377 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10378 int Index = -1;
10379 return Mask.empty() ||
10380 (VF == Mask.size() &&
10381 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10382 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10383 Index == 0);
10384 }
10385
10386 public:
10387 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10388 ~ShuffleCostBuilder() = default;
10389 InstructionCost createShuffleVector(Value *V1, Value *,
10390 ArrayRef<int> Mask) const {
10391 // Empty mask or identity mask are free.
10392 unsigned VF =
10393 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10394 if (isEmptyOrIdentity(Mask, VF))
10395 return TTI::TCC_Free;
10396 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10397 cast<VectorType>(V1->getType()), Mask);
10398 }
10399 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10400 // Empty mask or identity mask are free.
10401 unsigned VF =
10402 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10403 if (isEmptyOrIdentity(Mask, VF))
10404 return TTI::TCC_Free;
10405 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10406 cast<VectorType>(V1->getType()), Mask);
10407 }
10408 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10409 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10410 return TTI::TCC_Free;
10411 }
10412 void resizeToMatch(Value *&, Value *&) const {}
10413 };
10414
10415 /// Smart shuffle instruction emission, walks through shuffles trees and
10416 /// tries to find the best matching vector for the actual shuffle
10417 /// instruction.
10418 InstructionCost
10419 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10420 const PointerUnion<Value *, const TreeEntry *> &P2,
10421 ArrayRef<int> Mask) {
10422 ShuffleCostBuilder Builder(TTI);
10423 SmallVector<int> CommonMask(Mask);
10424 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10425 unsigned CommonVF = Mask.size();
10426 InstructionCost ExtraCost = 0;
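// The two helpers below account for the extra cast needed when a node's
// minimized bit width (MinBWs) or a vector value's element type differs from
// ScalarTy: e.g. an i8 node shuffled as i32 requires a sext/zext (or a trunc
// for narrowing) of the widened vector; that cast cost is added to ExtraCost.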
10427 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10428 unsigned VF) -> InstructionCost {
10429 if (E.isGather() && allConstant(E.Scalars))
10430 return TTI::TCC_Free;
10431 Type *EScalarTy = E.Scalars.front()->getType();
10432 bool IsSigned = true;
10433 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10434 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10435 IsSigned = It->second.second;
10436 }
10437 if (EScalarTy != ScalarTy) {
10438 unsigned CastOpcode = Instruction::Trunc;
10439 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10440 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10441 if (DstSz > SrcSz)
10442 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10443 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10444 getWidenedType(EScalarTy, VF),
10445 TTI::CastContextHint::None, CostKind);
10446 }
10447 return TTI::TCC_Free;
10448 };
10449 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10450 if (isa<Constant>(V))
10451 return TTI::TCC_Free;
10452 auto *VecTy = cast<VectorType>(V->getType());
10453 Type *EScalarTy = VecTy->getElementType();
10454 if (EScalarTy != ScalarTy) {
10455 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10456 unsigned CastOpcode = Instruction::Trunc;
10457 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10458 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10459 if (DstSz > SrcSz)
10460 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10461 return TTI.getCastInstrCost(
10462 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10463 VecTy, TTI::CastContextHint::None, CostKind);
10464 }
10465 return TTI::TCC_Free;
10466 };
10467 if (!V1 && !V2 && !P2.isNull()) {
10468 // Shuffle 2 entry nodes.
10469 const TreeEntry *E = cast<const TreeEntry *>(P1);
10470 unsigned VF = E->getVectorFactor();
10471 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10472 CommonVF = std::max(VF, E2->getVectorFactor());
10473 assert(all_of(Mask,
10474 [=](int Idx) {
10475 return Idx < 2 * static_cast<int>(CommonVF);
10476 }) &&
10477 "All elements in mask must be less than 2 * CommonVF.");
10478 if (E->Scalars.size() == E2->Scalars.size()) {
10479 SmallVector<int> EMask = E->getCommonMask();
10480 SmallVector<int> E2Mask = E2->getCommonMask();
10481 if (!EMask.empty() || !E2Mask.empty()) {
10482 for (int &Idx : CommonMask) {
10483 if (Idx == PoisonMaskElem)
10484 continue;
10485 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10486 Idx = EMask[Idx];
10487 else if (Idx >= static_cast<int>(CommonVF))
10488 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10489 E->Scalars.size();
10490 }
10491 }
10492 CommonVF = E->Scalars.size();
10493 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10494 GetNodeMinBWAffectedCost(*E2, CommonVF);
10495 } else {
10496 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10497 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10498 }
10499 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10500 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10501 } else if (!V1 && P2.isNull()) {
10502 // Shuffle single entry node.
10503 const TreeEntry *E = cast<const TreeEntry *>(P1);
10504 unsigned VF = E->getVectorFactor();
10505 CommonVF = VF;
10506 assert(
10507 all_of(Mask,
10508 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10509 "All elements in mask must be less than CommonVF.");
10510 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10511 SmallVector<int> EMask = E->getCommonMask();
10512 assert(!EMask.empty() && "Expected non-empty common mask.");
10513 for (int &Idx : CommonMask) {
10514 if (Idx != PoisonMaskElem)
10515 Idx = EMask[Idx];
10516 }
10517 CommonVF = E->Scalars.size();
10518 } else if (unsigned Factor = E->getInterleaveFactor();
10519 Factor > 0 && E->Scalars.size() != Mask.size() &&
10520 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10521 Factor)) {
10522 // Deinterleaved nodes are free.
10523 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10524 }
10525 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10526 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10527 // Not identity/broadcast? Try to see if the original vector is better.
10528 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10529 CommonVF == CommonMask.size() &&
10530 any_of(enumerate(CommonMask),
10531 [](const auto &&P) {
10532 return P.value() != PoisonMaskElem &&
10533 static_cast<unsigned>(P.value()) != P.index();
10534 }) &&
10535 any_of(CommonMask,
10536 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10537 SmallVector<int> ReorderMask;
10538 inversePermutation(E->ReorderIndices, ReorderMask);
10539 ::addMask(CommonMask, ReorderMask);
10540 }
10541 } else if (V1 && P2.isNull()) {
10542 // Shuffle single vector.
10543 ExtraCost += GetValueMinBWAffectedCost(V1);
10544 CommonVF = getVF(V1);
10545 assert(
10546 all_of(Mask,
10547 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10548 "All elements in mask must be less than CommonVF.");
10549 } else if (V1 && !V2) {
10550 // Shuffle vector and tree node.
10551 unsigned VF = getVF(V1);
10552 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10553 CommonVF = std::max(VF, E2->getVectorFactor());
10554 assert(all_of(Mask,
10555 [=](int Idx) {
10556 return Idx < 2 * static_cast<int>(CommonVF);
10557 }) &&
10558 "All elements in mask must be less than 2 * CommonVF.");
10559 if (E2->Scalars.size() == VF && VF != CommonVF) {
10560 SmallVector<int> E2Mask = E2->getCommonMask();
10561 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10562 for (int &Idx : CommonMask) {
10563 if (Idx == PoisonMaskElem)
10564 continue;
10565 if (Idx >= static_cast<int>(CommonVF))
10566 Idx = E2Mask[Idx - CommonVF] + VF;
10567 }
10568 CommonVF = VF;
10569 }
10570 ExtraCost += GetValueMinBWAffectedCost(V1);
10571 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10572 ExtraCost += GetNodeMinBWAffectedCost(
10573 *E2, std::min(CommonVF, E2->getVectorFactor()));
10574 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10575 } else if (!V1 && V2) {
10576 // Shuffle vector and tree node.
10577 unsigned VF = getVF(V2);
10578 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10579 CommonVF = std::max(VF, E1->getVectorFactor());
10580 assert(all_of(Mask,
10581 [=](int Idx) {
10582 return Idx < 2 * static_cast<int>(CommonVF);
10583 }) &&
10584 "All elements in mask must be less than 2 * CommonVF.");
10585 if (E1->Scalars.size() == VF && VF != CommonVF) {
10586 SmallVector<int> E1Mask = E1->getCommonMask();
10587 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10588 for (int &Idx : CommonMask) {
10589 if (Idx == PoisonMaskElem)
10590 continue;
10591 if (Idx >= static_cast<int>(CommonVF))
10592 Idx = E1Mask[Idx - CommonVF] + VF;
10593 else
10594 Idx = E1Mask[Idx];
10595 }
10596 CommonVF = VF;
10597 }
10598 ExtraCost += GetNodeMinBWAffectedCost(
10599 *E1, std::min(CommonVF, E1->getVectorFactor()));
10600 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10601 ExtraCost += GetValueMinBWAffectedCost(V2);
10602 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10603 } else {
10604 assert(V1 && V2 && "Expected both vectors.");
10605 unsigned VF = getVF(V1);
10606 CommonVF = std::max(VF, getVF(V2));
10607 assert(all_of(Mask,
10608 [=](int Idx) {
10609 return Idx < 2 * static_cast<int>(CommonVF);
10610 }) &&
10611 "All elements in mask must be less than 2 * CommonVF.");
10612 ExtraCost +=
10613 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10614 if (V1->getType() != V2->getType()) {
10615 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10616 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10617 } else {
10618 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10619 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10620 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10621 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10622 }
10623 }
10624 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10625 assert(SLPReVec && "FixedVectorType is not expected.");
10627 CommonMask);
10628 }
10629 InVectors.front() =
10630 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10631 if (InVectors.size() == 2)
10632 InVectors.pop_back();
10633 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10634 V1, V2, CommonMask, Builder);
10635 }
10636
10637public:
10638 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10639 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10640 SmallPtrSetImpl<Value *> &CheckedExtracts)
10641 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10642 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10643 CheckedExtracts(CheckedExtracts) {}
10644 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10645 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10646 unsigned NumParts, bool &UseVecBaseAsInput) {
10647 UseVecBaseAsInput = false;
10648 if (Mask.empty())
10649 return nullptr;
10650 Value *VecBase = nullptr;
10651 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10652 if (!E->ReorderIndices.empty()) {
10653 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10654 E->ReorderIndices.end());
10655 reorderScalars(VL, ReorderMask);
10656 }
10657 // Check if the extracts can be considered reused, i.e. the same
10658 // extractelements were already vectorized in an earlier tree entry.
10659 bool PrevNodeFound = any_of(
10660 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10661 [&](const std::unique_ptr<TreeEntry> &TE) {
10662 return ((!TE->isAltShuffle() &&
10663 TE->getOpcode() == Instruction::ExtractElement) ||
10664 TE->isGather()) &&
10665 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10666 return VL.size() > Data.index() &&
10667 (Mask[Data.index()] == PoisonMaskElem ||
10668 isa<UndefValue>(VL[Data.index()]) ||
10669 Data.value() == VL[Data.index()]);
10670 });
10671 });
10672 SmallPtrSet<Value *, 4> UniqueBases;
10673 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10674 for (unsigned Part : seq<unsigned>(NumParts)) {
10675 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10676 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10677 for (auto [I, V] :
10678 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10679 // Ignore non-extractelement scalars.
10680 if (isa<UndefValue>(V) ||
10681 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10682 continue;
10683 // If all users of the instruction are going to be vectorized and this
10684 // instruction itself is not going to be vectorized, consider this
10685 // instruction as dead and remove its cost from the final cost of the
10686 // vectorized tree.
10687 // Also, avoid adjusting the cost for extractelements with multiple uses
10688 // in different graph entries.
10689 auto *EE = cast<ExtractElementInst>(V);
10690 VecBase = EE->getVectorOperand();
10691 UniqueBases.insert(VecBase);
10692 const TreeEntry *VE = R.getTreeEntry(V);
10693 if (!CheckedExtracts.insert(V).second ||
10694 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10695 any_of(EE->users(),
10696 [&](User *U) {
10697 return isa<GetElementPtrInst>(U) &&
10698 !R.areAllUsersVectorized(cast<Instruction>(U),
10699 &VectorizedVals);
10700 }) ||
10701 (VE && VE != E))
10702 continue;
10703 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10704 if (!EEIdx)
10705 continue;
10706 unsigned Idx = *EEIdx;
10707 // Take credit for the instruction that will become dead.
10708 if (EE->hasOneUse() || !PrevNodeFound) {
10709 Instruction *Ext = EE->user_back();
10710 if (isa<SExtInst, ZExtInst>(Ext) &&
10711 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10712 // Use getExtractWithExtendCost() to calculate the cost of
10713 // extractelement/ext pair.
10714 Cost -=
10715 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10716 EE->getVectorOperandType(), Idx);
10717 // Add back the cost of s|zext which is subtracted separately.
10718 Cost += TTI.getCastInstrCost(
10719 Ext->getOpcode(), Ext->getType(), EE->getType(),
10720 TTI::getCastContextHint(Ext), CostKind, Ext);
10721 continue;
10722 }
10723 }
10724 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10725 CostKind, Idx);
10726 }
10727 }
10728 // Check that the gather of extractelements can be represented as just a
10729 // shuffle of one or two vectors from which the scalars are extracted.
10730 // We have found a bunch of extractelement instructions that must be
10731 // gathered into a vector and can be represented as a permutation of the
10732 // elements of one or two input vectors.
10733 // This is skipped if the same extractelements were already vectorized.
10734 if (!PrevNodeFound)
10735 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10736 InVectors.assign(1, E);
10737 CommonMask.assign(Mask.begin(), Mask.end());
10738 transformMaskAfterShuffle(CommonMask, CommonMask);
10739 SameNodesEstimated = false;
10740 if (NumParts != 1 && UniqueBases.size() != 1) {
10741 UseVecBaseAsInput = true;
10742 VecBase =
10743 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10744 }
10745 return VecBase;
10746 }
10747 /// Checks if the specified entry \p E needs to be delayed because of its
10748 /// dependency nodes.
10749 std::optional<InstructionCost>
10750 needToDelay(const TreeEntry *,
10752 // No need to delay the cost estimation during analysis.
10753 return std::nullopt;
10754 }
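/// Adds two source tree entries and the common mask for shuffling them into
/// a single vector. Mask indices follow the usual two-source convention:
/// values below the common vector factor select lanes from \p E1 and values
/// at or above it select lanes from \p E2 (e.g. with VF == 4, the mask
/// <0,5,2,7> takes lanes 0 and 2 from E1 and lanes 1 and 3 from E2).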
10755 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10756 if (&E1 == &E2) {
10757 assert(all_of(Mask,
10758 [&](int Idx) {
10759 return Idx < static_cast<int>(E1.getVectorFactor());
10760 }) &&
10761 "Expected single vector shuffle mask.");
10762 add(E1, Mask);
10763 return;
10764 }
10765 if (InVectors.empty()) {
10766 CommonMask.assign(Mask.begin(), Mask.end());
10767 InVectors.assign({&E1, &E2});
10768 return;
10769 }
10770 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10771 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10772 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10773 if (NumParts == 0 || NumParts >= Mask.size() ||
10774 MaskVecTy->getNumElements() % NumParts != 0 ||
10775 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10776 MaskVecTy->getNumElements() / NumParts))
10777 NumParts = 1;
10778 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10779 const auto *It =
10780 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10781 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10782 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10783 }
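/// Adds a single source tree entry and the mask describing its shuffle.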
10784 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10785 if (InVectors.empty()) {
10786 CommonMask.assign(Mask.begin(), Mask.end());
10787 InVectors.assign(1, &E1);
10788 return;
10789 }
10790 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10791 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10792 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10793 if (NumParts == 0 || NumParts >= Mask.size() ||
10794 MaskVecTy->getNumElements() % NumParts != 0 ||
10795 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10796 MaskVecTy->getNumElements() / NumParts))
10797 NumParts = 1;
10798 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10799 const auto *It =
10800 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10801 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10802 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10803 if (!SameNodesEstimated && InVectors.size() == 1)
10804 InVectors.emplace_back(&E1);
10805 }
10806 /// Adds 2 input vectors and the mask for their shuffling.
10807 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10808 // This is reached only when shuffling 2 vectors of extractelements, which
10809 // were already handled in adjustExtracts.
10810 assert(InVectors.size() == 1 &&
10811 all_of(enumerate(CommonMask),
10812 [&](auto P) {
10813 if (P.value() == PoisonMaskElem)
10814 return Mask[P.index()] == PoisonMaskElem;
10815 auto *EI = cast<ExtractElementInst>(
10816 cast<const TreeEntry *>(InVectors.front())
10817 ->getOrdered(P.index()));
10818 return EI->getVectorOperand() == V1 ||
10819 EI->getVectorOperand() == V2;
10820 }) &&
10821 "Expected extractelement vectors.");
10822 }
10823 /// Adds one more input vector and the mask for the shuffling.
10824 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10825 if (InVectors.empty()) {
10826 assert(CommonMask.empty() && !ForExtracts &&
10827 "Expected empty input mask/vectors.");
10828 CommonMask.assign(Mask.begin(), Mask.end());
10829 InVectors.assign(1, V1);
10830 return;
10831 }
10832 if (ForExtracts) {
10833 // No need to add vectors here, already handled them in adjustExtracts.
10834 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10835 !CommonMask.empty() &&
10836 all_of(enumerate(CommonMask),
10837 [&](auto P) {
10838 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10839 ->getOrdered(P.index());
10840 if (P.value() == PoisonMaskElem)
10841 return P.value() == Mask[P.index()] ||
10842 isa<UndefValue>(Scalar);
10843 if (isa<Constant>(V1))
10844 return true;
10845 auto *EI = cast<ExtractElementInst>(Scalar);
10846 return EI->getVectorOperand() == V1;
10847 }) &&
10848 "Expected only tree entry for extractelement vectors.");
10849 return;
10850 }
10851 assert(!InVectors.empty() && !CommonMask.empty() &&
10852 "Expected only tree entries from extracts/reused buildvectors.");
10853 unsigned VF = getVF(V1);
10854 if (InVectors.size() == 2) {
10855 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10856 transformMaskAfterShuffle(CommonMask, CommonMask);
10857 VF = std::max<unsigned>(VF, CommonMask.size());
10858 } else if (const auto *InTE =
10859 InVectors.front().dyn_cast<const TreeEntry *>()) {
10860 VF = std::max(VF, InTE->getVectorFactor());
10861 } else {
10862 VF = std::max(
10863 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10864 ->getNumElements());
10865 }
10866 InVectors.push_back(V1);
10867 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10868 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10869 CommonMask[Idx] = Mask[Idx] + VF;
10870 }
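/// Accounts for the cost of building a vector from the scalars in \p VL (see
/// getBuildVectorCost) and returns a constant placeholder that stands in for
/// the gathered vector during cost estimation: null/undef lanes when no
/// \p Root is given, otherwise a splat placeholder derived from \p Root.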
10871 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10872 Value *Root = nullptr) {
10873 Cost += getBuildVectorCost(VL, Root);
10874 if (!Root) {
10875 // FIXME: Need to find a way to avoid use of getNullValue here.
10876 SmallVector<Constant *> Vals;
10877 unsigned VF = VL.size();
10878 if (MaskVF != 0)
10879 VF = std::min(VF, MaskVF);
10880 for (Value *V : VL.take_front(VF)) {
10881 if (isa<UndefValue>(V)) {
10882 Vals.push_back(cast<Constant>(V));
10883 continue;
10884 }
10885 Vals.push_back(Constant::getNullValue(V->getType()));
10886 }
10887 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10888 assert(SLPReVec && "FixedVectorType is not expected.");
10889 // When REVEC is enabled, we need to expand vector types into scalar
10890 // types.
10891 unsigned VecTyNumElements = VecTy->getNumElements();
10892 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10893 for (auto [I, V] : enumerate(Vals)) {
10894 Type *ScalarTy = V->getType()->getScalarType();
10895 Constant *NewVal;
10896 if (isa<PoisonValue>(V))
10897 NewVal = PoisonValue::get(ScalarTy);
10898 else if (isa<UndefValue>(V))
10899 NewVal = UndefValue::get(ScalarTy);
10900 else
10901 NewVal = Constant::getNullValue(ScalarTy);
10902 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10903 NewVal);
10904 }
10905 Vals.swap(NewVals);
10906 }
10907 return ConstantVector::get(Vals);
10908 }
10909 return ConstantVector::getSplat(
10910 ElementCount::getFixed(
10911 cast<FixedVectorType>(Root->getType())->getNumElements()),
10912 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10913 }
10915 /// Finalize emission of the shuffles.
10916 InstructionCost
10917 finalize(ArrayRef<int> ExtMask,
10918 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10919 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10920 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10921 IsFinalized = true;
10922 if (Action) {
10923 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10924 if (InVectors.size() == 2)
10925 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10926 else
10927 Cost += createShuffle(Vec, nullptr, CommonMask);
10928 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10929 if (CommonMask[Idx] != PoisonMaskElem)
10930 CommonMask[Idx] = Idx;
10931 assert(VF > 0 &&
10932 "Expected vector length for the final value before action.");
10933 Value *V = cast<Value *>(Vec);
10934 Action(V, CommonMask);
10935 InVectors.front() = V;
10936 }
10937 if (!SubVectors.empty()) {
10938 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10939 if (InVectors.size() == 2)
10940 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10941 else
10942 Cost += createShuffle(Vec, nullptr, CommonMask);
10943 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10944 if (CommonMask[Idx] != PoisonMaskElem)
10945 CommonMask[Idx] = Idx;
10946 // Add subvectors permutation cost.
10947 if (!SubVectorsMask.empty()) {
10948 assert(SubVectorsMask.size() <= CommonMask.size() &&
10949 "Expected same size of masks for subvectors and common mask.");
10950 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10951 copy(SubVectorsMask, SVMask.begin());
10952 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10953 if (I2 != PoisonMaskElem) {
10954 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10955 I1 = I2 + CommonMask.size();
10956 }
10957 }
10959 getWidenedType(ScalarTy, CommonMask.size()),
10960 SVMask, CostKind);
10961 }
10962 for (auto [E, Idx] : SubVectors) {
10963 Type *EScalarTy = E->Scalars.front()->getType();
10964 bool IsSigned = true;
10965 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10966 EScalarTy =
10967 IntegerType::get(EScalarTy->getContext(), It->second.first);
10968 IsSigned = It->second.second;
10969 }
10970 if (ScalarTy != EScalarTy) {
10971 unsigned CastOpcode = Instruction::Trunc;
10972 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10973 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10974 if (DstSz > SrcSz)
10975 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10977 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10978 getWidenedType(EScalarTy, E->getVectorFactor()),
10980 }
10983 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10984 getWidenedType(ScalarTy, E->getVectorFactor()));
10985 if (!CommonMask.empty()) {
10986 std::iota(std::next(CommonMask.begin(), Idx),
10987 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10988 Idx);
10989 }
10990 }
10991 }
10992
10993 if (!ExtMask.empty()) {
10994 if (CommonMask.empty()) {
10995 CommonMask.assign(ExtMask.begin(), ExtMask.end());
10996 } else {
10997 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
10998 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
10999 if (ExtMask[I] == PoisonMaskElem)
11000 continue;
11001 NewMask[I] = CommonMask[ExtMask[I]];
11002 }
11003 CommonMask.swap(NewMask);
11004 }
11005 }
11006 if (CommonMask.empty()) {
11007 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11008 return Cost;
11009 }
11010 return Cost +
11011 createShuffle(InVectors.front(),
11012 InVectors.size() == 2 ? InVectors.back() : nullptr,
11013 CommonMask);
11014 }
11015
11016 ~ShuffleCostEstimator() {
11017 assert((IsFinalized || CommonMask.empty()) &&
11018 "Shuffle construction must be finalized.");
11019 }
11020};
11021
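/// Returns the tree entry that represents operand \p Idx of entry \p E:
/// either the matching vectorized node or the gather node feeding this
/// operand.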
11022const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11023 unsigned Idx) const {
11024 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
11025 return VE;
11026 const auto *It =
11027 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11028 return TE->isGather() &&
11029 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11030 return EI.EdgeIdx == Idx && EI.UserTE == E;
11031 }) != TE->UserTreeIndices.end();
11032 });
11033 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
11034 return It->get();
11035}
11036
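/// Maps the vectorization state of \p TE to a cast context hint for TTI:
/// gather/scatter for masked or strided accesses, reversed for loads with a
/// reverse reorder mask, normal for plain vectorized loads, none otherwise.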
11037TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
11038 if (TE.State == TreeEntry::ScatterVectorize ||
11039 TE.State == TreeEntry::StridedVectorize)
11040 return TTI::CastContextHint::GatherScatter;
11041 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11042 !TE.isAltShuffle()) {
11043 if (TE.ReorderIndices.empty())
11044 return TTI::CastContextHint::Normal;
11045 SmallVector<int> Mask;
11046 inversePermutation(TE.ReorderIndices, Mask);
11047 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11048 return TTI::CastContextHint::Reversed;
11049 }
11050 return TTI::CastContextHint::None;
11051}
11052
11053/// Builds the arguments types vector for the given call instruction with the
11054/// given \p ID for the specified vector factor.
11057 const unsigned VF, unsigned MinBW,
11058 const TargetTransformInfo *TTI) {
11059 SmallVector<Type *> ArgTys;
11060 for (auto [Idx, Arg] : enumerate(CI->args())) {
11063 ArgTys.push_back(Arg->getType());
11064 continue;
11065 }
11066 if (MinBW > 0) {
11067 ArgTys.push_back(
11068 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11069 continue;
11070 }
11071 }
11072 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11073 }
11074 return ArgTys;
11075}
11076
11077InstructionCost
11078BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11079 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11080 ArrayRef<Value *> VL = E->Scalars;
11081
11082 Type *ScalarTy = getValueType(VL[0]);
11083 if (!isValidElementType(ScalarTy))
11086
11087 // If we have computed a smaller type for the expression, update VecTy so
11088 // that the costs will be accurate.
11089 auto It = MinBWs.find(E);
11090 Type *OrigScalarTy = ScalarTy;
11091 if (It != MinBWs.end()) {
11092 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11093 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11094 if (VecTy)
11095 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11096 }
11097 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11098 unsigned EntryVF = E->getVectorFactor();
11099 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11100
11101 if (E->isGather()) {
11102 if (allConstant(VL))
11103 return 0;
11104 if (isa<InsertElementInst>(VL[0]))
11105 return InstructionCost::getInvalid();
11106 if (isa<CmpInst>(VL.front()))
11107 ScalarTy = VL.front()->getType();
11108 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11109 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11110 }
11111 InstructionCost CommonCost = 0;
11112 SmallVector<int> Mask;
11113 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11114 !isReverseOrder(E->ReorderIndices))) {
11115 SmallVector<int> NewMask;
11116 if (E->getOpcode() == Instruction::Store) {
11117 // For stores the order is actually a mask.
11118 NewMask.resize(E->ReorderIndices.size());
11119 copy(E->ReorderIndices, NewMask.begin());
11120 } else {
11121 inversePermutation(E->ReorderIndices, NewMask);
11122 }
11123 ::addMask(Mask, NewMask);
11124 }
11125 if (!E->ReuseShuffleIndices.empty())
11126 ::addMask(Mask, E->ReuseShuffleIndices);
11127 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11128 CommonCost =
11129 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11130 assert((E->State == TreeEntry::Vectorize ||
11131 E->State == TreeEntry::ScatterVectorize ||
11132 E->State == TreeEntry::StridedVectorize) &&
11133 "Unhandled state");
11134 assert(E->getOpcode() &&
11135 ((allSameType(VL) && allSameBlock(VL)) ||
11136 (E->getOpcode() == Instruction::GetElementPtr &&
11137 E->getMainOp()->getType()->isPointerTy())) &&
11138 "Invalid VL");
11139 Instruction *VL0 = E->getMainOp();
11140 unsigned ShuffleOrOp =
11141 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11142 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11143 ShuffleOrOp = E->CombinedOp;
11144 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11145 const unsigned Sz = UniqueValues.size();
11146 SmallBitVector UsedScalars(Sz, false);
11147 for (unsigned I = 0; I < Sz; ++I) {
11148 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11149 continue;
11150 UsedScalars.set(I);
11151 }
11152 auto GetCastContextHint = [&](Value *V) {
11153 if (const TreeEntry *OpTE = getTreeEntry(V))
11154 return getCastContextHint(*OpTE);
11155 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11156 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11157 !SrcState.isAltShuffle())
11158 return TTI::CastContextHint::GatherScatter;
11159 return TTI::CastContextHint::None;
11160 };
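// Computes the cost difference (vector cost - scalar cost) for this node,
// given callbacks that return the cost of one scalar instruction and the
// cost of the whole vectorized form. A negative result means vectorizing
// this node is profitable.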
11161 auto GetCostDiff =
11162 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11163 function_ref<InstructionCost(InstructionCost)> VectorCost) {
11164 // Calculate the cost of this instruction.
11165 InstructionCost ScalarCost = 0;
11166 if (isa<CastInst, CallInst>(VL0)) {
11168 // For some instructions there is no need to calculate the cost for each
11169 // particular instance; we can use the cost of a single instruction
11170 // multiplied by the total number of scalar instructions.
11170 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11171 } else {
11172 for (unsigned I = 0; I < Sz; ++I) {
11173 if (UsedScalars.test(I))
11174 continue;
11175 ScalarCost += ScalarEltCost(I);
11176 }
11177 }
11178
11179 InstructionCost VecCost = VectorCost(CommonCost);
11180 // Check if the current node must be resized when the parent node was not
11181 // resized, e.g. it was narrowed to i8 while its user still expects i16.
11182 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11183 E->Idx != 0 &&
11184 (E->getOpcode() != Instruction::Load ||
11185 !E->UserTreeIndices.empty())) {
11186 const EdgeInfo &EI =
11187 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11188 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11189 });
11190 if (EI.UserTE->getOpcode() != Instruction::Select ||
11191 EI.EdgeIdx != 0) {
11192 auto UserBWIt = MinBWs.find(EI.UserTE);
11193 Type *UserScalarTy =
11194 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11195 if (UserBWIt != MinBWs.end())
11196 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11197 UserBWIt->second.first);
11198 if (ScalarTy != UserScalarTy) {
11199 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11200 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11201 unsigned VecOpcode;
11202 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11203 if (BWSz > SrcBWSz)
11204 VecOpcode = Instruction::Trunc;
11205 else
11206 VecOpcode =
11207 It->second.second ? Instruction::SExt : Instruction::ZExt;
11208 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11209 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11210 CostKind);
11211 }
11212 }
11213 }
11214 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11215 ScalarCost, "Calculated costs for Tree"));
11216 return VecCost - ScalarCost;
11217 };
11218 // Calculate cost difference from vectorizing set of GEPs.
11219 // Negative value means vectorizing is profitable.
11220 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11221 assert((E->State == TreeEntry::Vectorize ||
11222 E->State == TreeEntry::StridedVectorize) &&
11223 "Entry state expected to be Vectorize or StridedVectorize here.");
11224 InstructionCost ScalarCost = 0;
11225 InstructionCost VecCost = 0;
11226 std::tie(ScalarCost, VecCost) = getGEPCosts(
11227 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11228 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11229 "Calculated GEPs cost for Tree"));
11230
11231 return VecCost - ScalarCost;
11232 };
11233
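// Estimates the cost of the min/max intrinsic that the cmp+select pattern
// (for \p VI, or for the whole bundle) can be converted to, discounting the
// compare when the select is its only user; an invalid cost means no such
// intrinsic applies.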
11234 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11235 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11236 if (MinMaxID == Intrinsic::not_intrinsic)
11237 return InstructionCost::getInvalid();
11238 Type *CanonicalType = Ty;
11239 if (CanonicalType->isPtrOrPtrVectorTy())
11240 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11241 CanonicalType->getContext(),
11242 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11243
11244 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11245 {CanonicalType, CanonicalType});
11246 InstructionCost IntrinsicCost =
11247 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11248 // If the selects are the only users of the compares, the compares will
11249 // be dead and we can simply subtract their cost.
11250 if (VI && SelectOnly) {
11251 assert((!Ty->isVectorTy() || SLPReVec) &&
11252 "Expected only for scalar type.");
11253 auto *CI = cast<CmpInst>(VI->getOperand(0));
11254 IntrinsicCost -= TTI->getCmpSelInstrCost(
11255 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11256 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11257 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11258 }
11259 return IntrinsicCost;
11260 };
11261 switch (ShuffleOrOp) {
11262 case Instruction::PHI: {
11263 // Count reused scalars.
11264 InstructionCost ScalarCost = 0;
11266 for (Value *V : UniqueValues) {
11267 auto *PHI = dyn_cast<PHINode>(V);
11268 if (!PHI)
11269 continue;
11270
11271 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11272 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11273 Value *Op = PHI->getIncomingValue(I);
11274 Operands[I] = Op;
11275 }
11276 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11277 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11278 if (!OpTE->ReuseShuffleIndices.empty())
11279 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11280 OpTE->Scalars.size());
11281 }
11282
11283 return CommonCost - ScalarCost;
11284 }
11285 case Instruction::ExtractValue:
11286 case Instruction::ExtractElement: {
11287 auto GetScalarCost = [&](unsigned Idx) {
11288 if (isa<PoisonValue>(UniqueValues[Idx]))
11290 return InstructionCost(TTI::TCC_Free);
11291 auto *I = cast<Instruction>(UniqueValues[Idx]);
11292 VectorType *SrcVecTy;
11293 if (ShuffleOrOp == Instruction::ExtractElement) {
11294 auto *EE = cast<ExtractElementInst>(I);
11295 SrcVecTy = EE->getVectorOperandType();
11296 } else {
11297 auto *EV = cast<ExtractValueInst>(I);
11298 Type *AggregateTy = EV->getAggregateOperand()->getType();
11299 unsigned NumElts;
11300 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11301 NumElts = ATy->getNumElements();
11302 else
11303 NumElts = AggregateTy->getStructNumElements();
11304 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11305 }
11306 if (I->hasOneUse()) {
11307 Instruction *Ext = I->user_back();
11308 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11309 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11310 // Use getExtractWithExtendCost() to calculate the cost of
11311 // extractelement/ext pair.
11312 InstructionCost Cost = TTI->getExtractWithExtendCost(
11313 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11314 // Subtract the cost of s|zext which is subtracted separately.
11315 Cost -= TTI->getCastInstrCost(
11316 Ext->getOpcode(), Ext->getType(), I->getType(),
11317 TTI::getCastContextHint(Ext), CostKind, Ext);
11318 return Cost;
11319 }
11320 }
11321 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11322 CostKind, *getExtractIndex(I));
11323 };
11324 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11325 return GetCostDiff(GetScalarCost, GetVectorCost);
11326 }
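// For insertelement sequences, compute the range of affected lanes and a
// mask mapping the scalars into that range. Illustrative example: inserting
// 4 scalars into lanes 2..5 of an 8-wide vector gives OffsetBeg = 2,
// OffsetEnd = 5 and InsertMask = <poison, poison, 0, 1, 2, 3, poison, poison>.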
11327 case Instruction::InsertElement: {
11328 assert(E->ReuseShuffleIndices.empty() &&
11329 "Unique insertelements only are expected.");
11330 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11331 unsigned const NumElts = SrcVecTy->getNumElements();
11332 unsigned const NumScalars = VL.size();
11333
11334 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11335
11336 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11337 unsigned OffsetBeg = *getElementIndex(VL.front());
11338 unsigned OffsetEnd = OffsetBeg;
11339 InsertMask[OffsetBeg] = 0;
11340 for (auto [I, V] : enumerate(VL.drop_front())) {
11341 unsigned Idx = *getElementIndex(V);
11342 if (OffsetBeg > Idx)
11343 OffsetBeg = Idx;
11344 else if (OffsetEnd < Idx)
11345 OffsetEnd = Idx;
11346 InsertMask[Idx] = I + 1;
11347 }
11348 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11349 if (NumOfParts > 0 && NumOfParts < NumElts)
11350 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11351 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11352 VecScalarsSz;
11353 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11354 unsigned InsertVecSz = std::min<unsigned>(
11355 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11356 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11357 bool IsWholeSubvector =
11358 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11359 // Check if we can safely insert a subvector. If it is not possible, just
11360 // generate a whole-sized vector and shuffle the source vector and the new
11361 // subvector.
11362 if (OffsetBeg + InsertVecSz > VecSz) {
11363 // Align OffsetBeg to generate correct mask.
11364 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11365 InsertVecSz = VecSz;
11366 }
11367
11368 APInt DemandedElts = APInt::getZero(NumElts);
11369 // TODO: Add support for Instruction::InsertValue.
11370 SmallVector<int> Mask;
11371 if (!E->ReorderIndices.empty()) {
11372 inversePermutation(E->ReorderIndices, Mask);
11373 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11374 } else {
11375 Mask.assign(VecSz, PoisonMaskElem);
11376 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11377 }
11378 bool IsIdentity = true;
11379 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11380 Mask.swap(PrevMask);
11381 for (unsigned I = 0; I < NumScalars; ++I) {
11382 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11383 DemandedElts.setBit(InsertIdx);
11384 IsIdentity &= InsertIdx - OffsetBeg == I;
11385 Mask[InsertIdx - OffsetBeg] = I;
11386 }
11387 assert(Offset < NumElts && "Failed to find vector index offset");
11388
11390 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11391 /*Insert*/ true, /*Extract*/ false,
11392 CostKind);
11393
11394 // First cost - resize to actual vector size if not identity shuffle or
11395 // need to shift the vector.
11396 // Do not calculate the cost if the actual size is the register size and
11397 // we can merge this shuffle with the following SK_Select.
11398 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11399 if (!IsIdentity)
11401 InsertVecTy, Mask);
11402 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11403 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11404 }));
11405 // Second cost - permutation with subvector, if some elements are from the
11406 // initial vector or inserting a subvector.
11407 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11408 // subvector of ActualVecTy.
11409 SmallBitVector InMask =
11410 isUndefVector(FirstInsert->getOperand(0),
11411 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11412 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11413 if (InsertVecSz != VecSz) {
11414 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11415 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11416 CostKind, OffsetBeg - Offset, InsertVecTy);
11417 } else {
11418 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11419 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11420 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11421 I <= End; ++I)
11422 if (Mask[I] != PoisonMaskElem)
11423 Mask[I] = I + VecSz;
11424 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11425 Mask[I] =
11426 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11427 Cost +=
11428 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11429 }
11430 }
11431 return Cost;
11432 }
11433 case Instruction::ZExt:
11434 case Instruction::SExt:
11435 case Instruction::FPToUI:
11436 case Instruction::FPToSI:
11437 case Instruction::FPExt:
11438 case Instruction::PtrToInt:
11439 case Instruction::IntToPtr:
11440 case Instruction::SIToFP:
11441 case Instruction::UIToFP:
11442 case Instruction::Trunc:
11443 case Instruction::FPTrunc:
11444 case Instruction::BitCast: {
11445 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11446 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11447 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11448 unsigned Opcode = ShuffleOrOp;
11449 unsigned VecOpcode = Opcode;
11450 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11451 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11452 // Check if the values are candidates to demote.
11453 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11454 if (SrcIt != MinBWs.end()) {
11455 SrcBWSz = SrcIt->second.first;
11456 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11457 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11458 SrcVecTy =
11459 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11460 }
11461 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11462 if (BWSz == SrcBWSz) {
11463 VecOpcode = Instruction::BitCast;
11464 } else if (BWSz < SrcBWSz) {
11465 VecOpcode = Instruction::Trunc;
11466 } else if (It != MinBWs.end()) {
11467 assert(BWSz > SrcBWSz && "Invalid cast!");
11468 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11469 } else if (SrcIt != MinBWs.end()) {
11470 assert(BWSz > SrcBWSz && "Invalid cast!");
11471 VecOpcode =
11472 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11473 }
11474 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11475 !SrcIt->second.second) {
11476 VecOpcode = Instruction::UIToFP;
11477 }
11478 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11479 assert(Idx == 0 && "Expected 0 index only");
11480 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11481 VL0->getOperand(0)->getType(),
11483 };
11484 auto GetVectorCost = [=](InstructionCost CommonCost) {
11485 // Do not count cost here if minimum bitwidth is in effect and it is just
11486 // a bitcast (here it is just a noop).
11487 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11488 return CommonCost;
11489 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11490 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11491
11492 bool IsArithmeticExtendedReduction =
11493 E->Idx == 0 && UserIgnoreList &&
11494 all_of(*UserIgnoreList, [](Value *V) {
11495 auto *I = cast<Instruction>(V);
11496 return is_contained({Instruction::Add, Instruction::FAdd,
11497 Instruction::Mul, Instruction::FMul,
11498 Instruction::And, Instruction::Or,
11499 Instruction::Xor},
11500 I->getOpcode());
11501 });
11502 if (IsArithmeticExtendedReduction &&
11503 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11504 return CommonCost;
11505 return CommonCost +
11506 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11507 VecOpcode == Opcode ? VI : nullptr);
11508 };
11509 return GetCostDiff(GetScalarCost, GetVectorCost);
11510 }
11511 case Instruction::FCmp:
11512 case Instruction::ICmp:
11513 case Instruction::Select: {
11514 CmpPredicate VecPred, SwappedVecPred;
11515 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11516 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11517 match(VL0, MatchCmp))
11518 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11519 else
11520 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11523 auto GetScalarCost = [&](unsigned Idx) {
11524 if (isa<PoisonValue>(UniqueValues[Idx]))
11526 return InstructionCost(TTI::TCC_Free);
11527 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11528 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11531 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11532 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11533 !match(VI, MatchCmp)) ||
11534 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11535 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11536 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11539
11541 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11542 CostKind, getOperandInfo(VI->getOperand(0)),
11543 getOperandInfo(VI->getOperand(1)), VI);
11544 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11545 if (IntrinsicCost.isValid())
11546 ScalarCost = IntrinsicCost;
11547
11548 return ScalarCost;
11549 };
11550 auto GetVectorCost = [&](InstructionCost CommonCost) {
11551 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11552
11553 InstructionCost VecCost =
11554 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11555 CostKind, getOperandInfo(E->getOperand(0)),
11556 getOperandInfo(E->getOperand(1)), VL0);
11557 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11558 auto *CondType =
11559 getWidenedType(SI->getCondition()->getType(), VL.size());
11560 unsigned CondNumElements = CondType->getNumElements();
11561 unsigned VecTyNumElements = getNumElements(VecTy);
11562 assert(VecTyNumElements >= CondNumElements &&
11563 VecTyNumElements % CondNumElements == 0 &&
11564 "Cannot vectorize Instruction::Select");
11565 if (CondNumElements != VecTyNumElements) {
11566 // When the return type is i1 but the source is a fixed vector type, we
11567 // need to duplicate the condition value.
11568 VecCost += ::getShuffleCost(
11569 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11570 createReplicatedMask(VecTyNumElements / CondNumElements,
11571 CondNumElements));
11572 }
11573 }
11574 return VecCost + CommonCost;
11575 };
11576 return GetCostDiff(GetScalarCost, GetVectorCost);
11577 }
11578 case TreeEntry::MinMax: {
11579 auto GetScalarCost = [&](unsigned Idx) {
11580 return GetMinMaxCost(OrigScalarTy);
11581 };
11582 auto GetVectorCost = [&](InstructionCost CommonCost) {
11583 InstructionCost VecCost = GetMinMaxCost(VecTy);
11584 return VecCost + CommonCost;
11585 };
11586 return GetCostDiff(GetScalarCost, GetVectorCost);
11587 }
11588 case Instruction::FNeg:
11589 case Instruction::Add:
11590 case Instruction::FAdd:
11591 case Instruction::Sub:
11592 case Instruction::FSub:
11593 case Instruction::Mul:
11594 case Instruction::FMul:
11595 case Instruction::UDiv:
11596 case Instruction::SDiv:
11597 case Instruction::FDiv:
11598 case Instruction::URem:
11599 case Instruction::SRem:
11600 case Instruction::FRem:
11601 case Instruction::Shl:
11602 case Instruction::LShr:
11603 case Instruction::AShr:
11604 case Instruction::And:
11605 case Instruction::Or:
11606 case Instruction::Xor: {
11607 auto GetScalarCost = [&](unsigned Idx) {
11608 if (isa<PoisonValue>(UniqueValues[Idx]))
11610 return InstructionCost(TTI::TCC_Free);
11611 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11612 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11613 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11614 TTI::OperandValueInfo Op2Info =
11615 TTI::getOperandInfo(VI->getOperand(OpIdx));
11616 SmallVector<const Value *> Operands(VI->operand_values());
11617 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11618 Op1Info, Op2Info, Operands, VI);
11619 };
11620 auto GetVectorCost = [=](InstructionCost CommonCost) {
11621 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11622 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11623 ArrayRef<Value *> Ops = E->getOperand(I);
11624 if (all_of(Ops, [&](Value *Op) {
11625 auto *CI = dyn_cast<ConstantInt>(Op);
11626 return CI && CI->getValue().countr_one() >= It->second.first;
11627 }))
11628 return CommonCost;
11629 }
11630 }
11631 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11632 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11633 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11634 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11635 Op2Info, {}, nullptr, TLI) +
11636 CommonCost;
11637 };
11638 return GetCostDiff(GetScalarCost, GetVectorCost);
11639 }
11640 case Instruction::GetElementPtr: {
11641 return CommonCost + GetGEPCostDiff(VL, VL0);
11642 }
11643 case Instruction::Load: {
11644 auto GetScalarCost = [&](unsigned Idx) {
11645 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11646 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11647 VI->getAlign(), VI->getPointerAddressSpace(),
11649 };
11650 auto *LI0 = cast<LoadInst>(VL0);
11651 auto GetVectorCost = [&](InstructionCost CommonCost) {
11652 InstructionCost VecLdCost;
11653 switch (E->State) {
11654 case TreeEntry::Vectorize:
11655 if (unsigned Factor = E->getInterleaveFactor()) {
11656 VecLdCost = TTI->getInterleavedMemoryOpCost(
11657 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11658 LI0->getPointerAddressSpace(), CostKind);
11659
11660 } else {
11661 VecLdCost = TTI->getMemoryOpCost(
11662 Instruction::Load, VecTy, LI0->getAlign(),
11663 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11664 }
11665 break;
11666 case TreeEntry::StridedVectorize: {
11667 Align CommonAlignment =
11668 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11669 VecLdCost = TTI->getStridedMemoryOpCost(
11670 Instruction::Load, VecTy, LI0->getPointerOperand(),
11671 /*VariableMask=*/false, CommonAlignment, CostKind);
11672 break;
11673 }
11674 case TreeEntry::ScatterVectorize: {
11675 Align CommonAlignment =
11676 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11677 VecLdCost = TTI->getGatherScatterOpCost(
11678 Instruction::Load, VecTy, LI0->getPointerOperand(),
11679 /*VariableMask=*/false, CommonAlignment, CostKind);
11680 break;
11681 }
11682 case TreeEntry::CombinedVectorize:
11683 case TreeEntry::NeedToGather:
11684 llvm_unreachable("Unexpected vectorization state.");
11685 }
11686 return VecLdCost + CommonCost;
11687 };
11688
11689 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11690 // If this node generates a masked gather load, then it is not a terminal
11691 // node, so the cost of the address operand is estimated separately.
11692 if (E->State == TreeEntry::ScatterVectorize)
11693 return Cost;
11694
11695 // Estimate the cost of the GEPs since this tree node is a terminal node.
11696 SmallVector<Value *> PointerOps(VL.size());
11697 for (auto [I, V] : enumerate(VL))
11698 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11699 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11700 }
11701 case Instruction::Store: {
11702 bool IsReorder = !E->ReorderIndices.empty();
11703 auto GetScalarCost = [=](unsigned Idx) {
11704 auto *VI = cast<StoreInst>(VL[Idx]);
11705 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11706 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11707 VI->getAlign(), VI->getPointerAddressSpace(),
11708 CostKind, OpInfo, VI);
11709 };
11710 auto *BaseSI =
11711 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11712 auto GetVectorCost = [=](InstructionCost CommonCost) {
11713 // We know that we can merge the stores. Calculate the cost.
11714 InstructionCost VecStCost;
11715 if (E->State == TreeEntry::StridedVectorize) {
11716 Align CommonAlignment =
11717 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11718 VecStCost = TTI->getStridedMemoryOpCost(
11719 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11720 /*VariableMask=*/false, CommonAlignment, CostKind);
11721 } else {
11722 assert(E->State == TreeEntry::Vectorize &&
11723 "Expected either strided or consecutive stores.");
11724 if (unsigned Factor = E->getInterleaveFactor()) {
11725 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11726 "No reused shuffles expected");
11727 CommonCost = 0;
11728 VecStCost = TTI->getInterleavedMemoryOpCost(
11729 Instruction::Store, VecTy, Factor, std::nullopt,
11730 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11731 } else {
11732 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11733 VecStCost = TTI->getMemoryOpCost(
11734 Instruction::Store, VecTy, BaseSI->getAlign(),
11735 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11736 }
11737 }
11738 return VecStCost + CommonCost;
11739 };
11740 SmallVector<Value *> PointerOps(VL.size());
11741 for (auto [I, V] : enumerate(VL)) {
11742 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11743 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11744 }
11745
11746 return GetCostDiff(GetScalarCost, GetVectorCost) +
11747 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11748 }
11749 case Instruction::Call: {
11750 auto GetScalarCost = [&](unsigned Idx) {
11751 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11754 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11755 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11756 }
11759 CI->getFunctionType()->params(), CostKind);
11760 };
11761 auto GetVectorCost = [=](InstructionCost CommonCost) {
11762 auto *CI = cast<CallInst>(VL0);
11765 CI, ID, VecTy->getNumElements(),
11766 It != MinBWs.end() ? It->second.first : 0, TTI);
11767 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11768 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11769 };
11770 return GetCostDiff(GetScalarCost, GetVectorCost);
11771 }
11772 case Instruction::ShuffleVector: {
11773 if (!SLPReVec || E->isAltShuffle())
11774 assert(E->isAltShuffle() &&
11775 ((Instruction::isBinaryOp(E->getOpcode()) &&
11776 Instruction::isBinaryOp(E->getAltOpcode())) ||
11777 (Instruction::isCast(E->getOpcode()) &&
11778 Instruction::isCast(E->getAltOpcode())) ||
11779 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11780 "Invalid Shuffle Vector Operand");
11781 // Try to find the previous shuffle node with the same operands and same
11782 // main/alternate ops.
11783 auto TryFindNodeWithEqualOperands = [=]() {
11784 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11785 if (TE.get() == E)
11786 break;
11787 if (TE->isAltShuffle() &&
11788 ((TE->getOpcode() == E->getOpcode() &&
11789 TE->getAltOpcode() == E->getAltOpcode()) ||
11790 (TE->getOpcode() == E->getAltOpcode() &&
11791 TE->getAltOpcode() == E->getOpcode())) &&
11792 TE->hasEqualOperands(*E))
11793 return true;
11794 }
11795 return false;
11796 };
11797 auto GetScalarCost = [&](unsigned Idx) {
11798 if (isa<PoisonValue>(UniqueValues[Idx]))
11800 return InstructionCost(TTI::TCC_Free);
11801 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11802 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11803 (void)E;
11804 return TTI->getInstructionCost(VI, CostKind);
11805 };
11806 // Need to clear CommonCost since the final shuffle cost is included in
11807 // the vector cost.
11808 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11809 // VecCost is equal to sum of the cost of creating 2 vectors
11810 // and the cost of creating shuffle.
11811 InstructionCost VecCost = 0;
11812 if (TryFindNodeWithEqualOperands()) {
11813 LLVM_DEBUG({
11814 dbgs() << "SLP: diamond match for alternate node found.\n";
11815 E->dump();
11816 });
11817 // No need to add new vector costs here since we're going to reuse
11818 // same main/alternate vector ops, just do different shuffling.
11819 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11820 VecCost =
11821 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11822 VecCost +=
11823 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11824 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11825 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11826 VecCost = TTIRef.getCmpSelInstrCost(
11827 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11828 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11829 VL0);
11830 VecCost += TTIRef.getCmpSelInstrCost(
11831 E->getOpcode(), VecTy, MaskTy,
11832 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11833 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11834 E->getAltOp());
11835 } else {
11836 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11837 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11838 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11839 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11840 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11841 unsigned SrcBWSz =
11842 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11843 if (SrcIt != MinBWs.end()) {
11844 SrcBWSz = SrcIt->second.first;
11845 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11846 SrcTy = getWidenedType(SrcSclTy, VL.size());
11847 }
11848 if (BWSz <= SrcBWSz) {
11849 if (BWSz < SrcBWSz)
11850 VecCost =
11851 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11853 LLVM_DEBUG({
11854 dbgs()
11855 << "SLP: alternate extension, which should be truncated.\n";
11856 E->dump();
11857 });
11858 return VecCost;
11859 }
11860 }
11861 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11863 VecCost +=
11864 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11866 }
11867 SmallVector<int> Mask;
11868 E->buildAltOpShuffleMask(
11869 [&](Instruction *I) {
11870 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11871 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11872 *TLI);
11873 },
11874 Mask);
11876 FinalVecTy, Mask, CostKind);
11877 // Patterns like [fadd,fsub] can be combined into a single instruction
11878 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11879 // need to take into account their order when looking for the most used
11880 // order.
11881 unsigned Opcode0 = E->getOpcode();
11882 unsigned Opcode1 = E->getAltOpcode();
11883 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11884 // If this pattern is supported by the target then we consider the
11885 // order.
11886 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11887 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11888 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11889 return AltVecCost < VecCost ? AltVecCost : VecCost;
11890 }
11891 // TODO: Check the reverse order too.
11892 return VecCost;
11893 };
11894 if (SLPReVec && !E->isAltShuffle())
11895 return GetCostDiff(
11896 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11897 // If a group uses the mask in order, the shufflevector can be
11898 // eliminated by instcombine, and the cost is then 0.
11899 assert(isa<ShuffleVectorInst>(VL.front()) &&
11900 "Not supported shufflevector usage.");
11901 auto *SV = cast<ShuffleVectorInst>(VL.front());
11902 unsigned SVNumElements =
11903 cast<FixedVectorType>(SV->getOperand(0)->getType())
11904 ->getNumElements();
11905 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11906 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11907 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11908 int NextIndex = 0;
11909 if (!all_of(Group, [&](Value *V) {
11910 assert(isa<ShuffleVectorInst>(V) &&
11911 "Not supported shufflevector usage.");
11912 auto *SV = cast<ShuffleVectorInst>(V);
11913 int Index;
11914 [[maybe_unused]] bool IsExtractSubvectorMask =
11915 SV->isExtractSubvectorMask(Index);
11916 assert(IsExtractSubvectorMask &&
11917 "Not supported shufflevector usage.");
11918 if (NextIndex != Index)
11919 return false;
11920 NextIndex += SV->getShuffleMask().size();
11921 return true;
11922 }))
11923 return ::getShuffleCost(
11925 calculateShufflevectorMask(E->Scalars));
11926 }
11927 return TTI::TCC_Free;
11928 });
11929 return GetCostDiff(GetScalarCost, GetVectorCost);
11930 }
11931 case Instruction::Freeze:
11932 return CommonCost;
11933 default:
11934 llvm_unreachable("Unknown instruction");
11935 }
11936}
11937
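/// Returns true if a tiny tree (of height 1 or 2) is still fully
/// vectorizable, i.e. its gather nodes are cheap to materialize: splats,
/// constants, shuffles of extractelements, or loads.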
11938bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11939 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11940 << VectorizableTree.size() << " is fully vectorizable.\n");
11941
11942 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11943 SmallVector<int> Mask;
11944 return TE->isGather() &&
11945 !any_of(TE->Scalars,
11946 [this](Value *V) { return EphValues.contains(V); }) &&
11947 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11948 TE->Scalars.size() < Limit ||
11949 ((TE->getOpcode() == Instruction::ExtractElement ||
11950 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11951 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11952 (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
11953 any_of(TE->Scalars, IsaPred<LoadInst>));
11954 };
11955
11956 // We only handle trees of heights 1 and 2.
11957 if (VectorizableTree.size() == 1 &&
11958 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11959 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11960 (ForReduction &&
11961 AreVectorizableGathers(VectorizableTree[0].get(),
11962 VectorizableTree[0]->Scalars.size()) &&
11963 VectorizableTree[0]->getVectorFactor() > 2)))
11964 return true;
11965
11966 if (VectorizableTree.size() != 2)
11967 return false;
11968
11969 // Handle splat and all-constant stores. Also try to vectorize tiny trees
11970 // whose second node is a gather with fewer scalar operands than the
11971 // initial tree element (it may be profitable to shuffle the second gather),
11972 // or whose scalars are extractelements that form a shuffle.
11974 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11975 AreVectorizableGathers(VectorizableTree[1].get(),
11976 VectorizableTree[0]->Scalars.size()))
11977 return true;
11978
11979 // Gathering cost would be too much for tiny trees.
11980 if (VectorizableTree[0]->isGather() ||
11981 (VectorizableTree[1]->isGather() &&
11982 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11983 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11984 return false;
11985
11986 return true;
11987}
11988
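// Load combining targets byte-assembly patterns such as (illustrative IR):
//   %b0 = load i8, ptr %p0
//   %b1 = load i8, ptr %p1
//   %z0 = zext i8 %b0 to i32
//   %z1 = zext i8 %b1 to i32
//   %s1 = shl i32 %z1, 8
//   %or = or i32 %z0, %s1
// which the backend is expected to fold into a single wider load, so SLP
// vectorization of such a sequence is usually not profitable.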
11989static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11990 TargetTransformInfo *TTI,
11991 bool MustMatchOrInst) {
11992 // Look past the root to find a source value. Arbitrarily follow the
11993 // path through operand 0 of any 'or'. Also, peek through optional
11994 // shift-left-by-multiple-of-8-bits.
11995 Value *ZextLoad = Root;
11996 const APInt *ShAmtC;
11997 bool FoundOr = false;
11998 while (!isa<ConstantExpr>(ZextLoad) &&
11999 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
12000 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
12001 ShAmtC->urem(8) == 0))) {
12002 auto *BinOp = cast<BinaryOperator>(ZextLoad);
12003 ZextLoad = BinOp->getOperand(0);
12004 if (BinOp->getOpcode() == Instruction::Or)
12005 FoundOr = true;
12006 }
12007 // Check if the input is an extended load of the required or/shift expression.
12008 Value *Load;
12009 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12010 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12011 return false;
12012
12013 // Require that the total load bit width is a legal integer type.
12014 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12015 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12016 Type *SrcTy = Load->getType();
12017 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12018 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12019 return false;
12020
12021 // Everything matched - assume that we can fold the whole sequence using
12022 // load combining.
12023 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
12024 << *(cast<Instruction>(Root)) << "\n");
12025
12026 return true;
12027}
12028
12029bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
12030 if (RdxKind != RecurKind::Or)
12031 return false;
12032
12033 unsigned NumElts = VectorizableTree[0]->Scalars.size();
12034 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12035 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
12036 /* MatchOr */ false);
12037}
12038
12039bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
12040 // Peek through a final sequence of stores and check if all operations are
12041 // likely to be load-combined.
12042 unsigned NumElts = Stores.size();
12043 for (Value *Scalar : Stores) {
12044 Value *X;
12045 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
12046 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
12047 return false;
12048 }
12049 return true;
12050}
12051
12052bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
12053 if (!DebugCounter::shouldExecute(VectorizedGraphs))
12054 return true;
12055
12056 // Graph is empty - do nothing.
12057 if (VectorizableTree.empty()) {
12058 assert(ExternalUses.empty() && "We shouldn't have any external users");
12059
12060 return true;
12061 }
12062
12063 // No need to vectorize inserts of gathered values.
12064 if (VectorizableTree.size() == 2 &&
12065 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12066 VectorizableTree[1]->isGather() &&
12067 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12068 !(isSplat(VectorizableTree[1]->Scalars) ||
12069 allConstant(VectorizableTree[1]->Scalars))))
12070 return true;
12071
12072 // If the graph includes only PHI nodes and gathers, it is definitely not
12073 // profitable for vectorization; we can skip it if the cost threshold is the
12074 // default. The cost of vectorized PHI nodes is almost always 0, plus the
12075 // cost of the gathers/buildvectors.
12076 constexpr int Limit = 4;
12077 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12078 !VectorizableTree.empty() &&
12079 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12080 return (TE->isGather() &&
12081 TE->getOpcode() != Instruction::ExtractElement &&
12082 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12083 TE->getOpcode() == Instruction::PHI;
12084 }))
12085 return true;
12086
12087 // We can vectorize the tree if its size is greater than or equal to the
12088 // minimum size specified by the MinTreeSize command line option.
12089 if (VectorizableTree.size() >= MinTreeSize)
12090 return false;
12091
12092 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12093 // can vectorize it if we can prove it fully vectorizable.
12094 if (isFullyVectorizableTinyTree(ForReduction))
12095 return false;
12096
12097 // Check if any of the gather nodes forms an insertelement buildvector
12098 // somewhere.
12099 bool IsAllowedSingleBVNode =
12100 VectorizableTree.size() > 1 ||
12101 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
12102 !VectorizableTree.front()->isAltShuffle() &&
12103 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12104 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12105 allSameBlock(VectorizableTree.front()->Scalars));
12106 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12107 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12108 return isa<ExtractElementInst, UndefValue>(V) ||
12109 (IsAllowedSingleBVNode &&
12110 !V->hasNUsesOrMore(UsesLimit) &&
12111 any_of(V->users(), IsaPred<InsertElementInst>));
12112 });
12113 }))
12114 return false;
12115
12116 if (VectorizableTree.back()->isGather() &&
12117 VectorizableTree.back()->isAltShuffle() &&
12118 VectorizableTree.back()->getVectorFactor() > 2 &&
12119 allSameBlock(VectorizableTree.back()->Scalars) &&
12120 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12121      TTI->getScalarizationOverhead(
12122 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12123 VectorizableTree.back()->getVectorFactor()),
12124 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12125 /*Insert=*/true, /*Extract=*/false,
12126          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
12127 return false;
12128
12129 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12130 // vectorizable.
12131 return true;
12132}
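// Illustrative note (not part of the original file): the function above
// answers "is this tree too small to be worth vectorizing?". Returning true
// rejects the tree (e.g. an empty graph, inserts of gathered values, or a
// graph made up only of PHIs and gathers under the default cost threshold);
// returning false keeps it, either because the tree reaches MinTreeSize or
// because a tiny tree was proven fully vectorizable.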
12133
12134bool BoUpSLP::isTreeNotExtendable() const {
12135 if (getCanonicalGraphSize() != getTreeSize()) {
12136 constexpr unsigned SmallTree = 3;
12137 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12138 getCanonicalGraphSize() <= SmallTree &&
12139 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12140 [](const std::unique_ptr<TreeEntry> &TE) {
12141 return TE->isGather() &&
12142 TE->getOpcode() == Instruction::Load &&
12143 !allSameBlock(TE->Scalars);
12144 }) == 1)
12145 return true;
12146 return false;
12147 }
12148 bool Res = false;
12149 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12150 TreeEntry &E = *VectorizableTree[Idx];
12151 if (!E.isGather())
12152 continue;
12153 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12154 return false;
12155 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12156 continue;
12157 Res = true;
12158 }
12159 return Res;
12160}
12161
12162InstructionCost BoUpSLP::getSpillCost() const {
12163 // Walk from the bottom of the tree to the top, tracking which values are
12164 // live. When we see a call instruction that is not part of our tree,
12165 // query TTI to see if there is a cost to keeping values live over it
12166 // (for example, if spills and fills are required).
12167 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12168 InstructionCost Cost = 0;
12169
12170 SmallPtrSet<Instruction *, 4> LiveValues;
12171 Instruction *PrevInst = nullptr;
12172
12173 // The entries in VectorizableTree are not necessarily ordered by their
12174 // position in basic blocks. Collect them and order them by dominance so later
12175 // instructions are guaranteed to be visited first. For instructions in
12176 // different basic blocks, we only scan to the beginning of the block, so
12177 // their order does not matter, as long as all instructions in a basic block
12178 // are grouped together. Using dominance ensures a deterministic order.
12179 SmallVector<Instruction *, 16> OrderedScalars;
12180 for (const auto &TEPtr : VectorizableTree) {
12181 if (TEPtr->State != TreeEntry::Vectorize)
12182 continue;
12183 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12184 if (!Inst)
12185 continue;
12186 OrderedScalars.push_back(Inst);
12187 }
12188 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12189 auto *NodeA = DT->getNode(A->getParent());
12190 auto *NodeB = DT->getNode(B->getParent());
12191 assert(NodeA && "Should only process reachable instructions");
12192 assert(NodeB && "Should only process reachable instructions");
12193 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12194 "Different nodes should have different DFS numbers");
12195 if (NodeA != NodeB)
12196 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12197 return B->comesBefore(A);
12198 });
12199
12200 for (Instruction *Inst : OrderedScalars) {
12201 if (!PrevInst) {
12202 PrevInst = Inst;
12203 continue;
12204 }
12205
12206 // Update LiveValues.
12207 LiveValues.erase(PrevInst);
12208 for (auto &J : PrevInst->operands()) {
12209 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12210 LiveValues.insert(cast<Instruction>(&*J));
12211 }
12212
12213 LLVM_DEBUG({
12214 dbgs() << "SLP: #LV: " << LiveValues.size();
12215 for (auto *X : LiveValues)
12216 dbgs() << " " << X->getName();
12217 dbgs() << ", Looking at ";
12218 Inst->dump();
12219 });
12220
12221 // Now find the sequence of instructions between PrevInst and Inst.
12222 unsigned NumCalls = 0;
12223 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12224 PrevInstIt =
12225 PrevInst->getIterator().getReverse();
12226 while (InstIt != PrevInstIt) {
12227 if (PrevInstIt == PrevInst->getParent()->rend()) {
12228 PrevInstIt = Inst->getParent()->rbegin();
12229 continue;
12230 }
12231
12232 auto NoCallIntrinsic = [this](Instruction *I) {
12233 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12234 if (II->isAssumeLikeIntrinsic())
12235 return true;
12236 FastMathFlags FMF;
12237          SmallVector<Type *, 4> Tys;
12238 for (auto &ArgOp : II->args())
12239 Tys.push_back(ArgOp->getType());
12240 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12241 FMF = FPMO->getFastMathFlags();
12242 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12243 FMF);
12244 InstructionCost IntrCost =
12245              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12246          InstructionCost CallCost = TTI->getCallInstrCost(
12247 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12248 if (IntrCost < CallCost)
12249 return true;
12250 }
12251 return false;
12252 };
12253
12254 // Debug information does not impact spill cost.
12255 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12256 &*PrevInstIt != PrevInst)
12257 NumCalls++;
12258
12259 ++PrevInstIt;
12260 }
12261
12262 if (NumCalls) {
12263      SmallVector<Type *, 4> V;
12264 for (auto *II : LiveValues) {
12265 auto *ScalarTy = II->getType();
12266 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12267 ScalarTy = VectorTy->getElementType();
12268 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12269 }
12270 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12271 }
12272
12273 PrevInst = Inst;
12274 }
12275
12276 return Cost;
12277}
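// Illustrative note (not part of the original file): as a rough example of the
// loop above, if three tree values are live across a single non-intrinsic call
// between two vectorized bundles, the returned cost grows by
// 1 * TTI->getCostOfKeepingLiveOverCall(<the three widened types>), modelling
// the spills and fills the backend may have to insert around that call.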
12278
12279/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the
12280/// buildvector sequence.
12281static bool isFirstInsertElement(const InsertElementInst *IE1,
12282 const InsertElementInst *IE2) {
12283 if (IE1 == IE2)
12284 return false;
12285 const auto *I1 = IE1;
12286 const auto *I2 = IE2;
12287 const InsertElementInst *PrevI1;
12288 const InsertElementInst *PrevI2;
12289 unsigned Idx1 = *getElementIndex(IE1);
12290 unsigned Idx2 = *getElementIndex(IE2);
12291 do {
12292 if (I2 == IE1)
12293 return true;
12294 if (I1 == IE2)
12295 return false;
12296 PrevI1 = I1;
12297 PrevI2 = I2;
12298 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12299 getElementIndex(I1).value_or(Idx2) != Idx2)
12300 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12301 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12302 getElementIndex(I2).value_or(Idx1) != Idx1)
12303 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12304 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12305 llvm_unreachable("Two different buildvectors not expected.");
12306}
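// Illustrative note (not part of the original file): for a hypothetical
// buildvector chain
//   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <4 x i32> %v0,    i32 %b, i32 1
// isFirstInsertElement(%v0, %v1) returns true, because walking %v1's operand-0
// chain reaches %v0, i.e. %v0 is the earlier insert in the sequence.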
12307
12308namespace {
12309/// Returns incoming Value *, if the requested type is Value * too, or a default
12310/// value, otherwise.
12311struct ValueSelect {
12312 template <typename U>
12313 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12314 return V;
12315 }
12316 template <typename U>
12317 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12318 return U();
12319 }
12320};
12321} // namespace
12322
12323/// Does the analysis of the provided shuffle masks and performs the requested
12324/// actions on the vectors with the given shuffle masks. It tries to do it in
12325/// several steps.
12326/// 1. If the Base vector is not an undef vector, resize the very first mask to
12327/// have a common VF and perform the action for 2 input vectors (including the
12328/// non-undef Base). Other shuffle masks are combined with the result of the
12329/// first stage and processed as a shuffle of 2 elements.
12330/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12331/// the action only for 1 vector with the given mask, if it is not the identity
12332/// mask.
12333/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12334/// vectors, combining the masks properly between the steps.
12335template <typename T>
12336static T *performExtractsShuffleAction(
12337 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12338 function_ref<unsigned(T *)> GetVF,
12339 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12340    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
12341 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12342 SmallVector<int> Mask(ShuffleMask.begin()->second);
12343 auto VMIt = std::next(ShuffleMask.begin());
12344 T *Prev = nullptr;
12345 SmallBitVector UseMask =
12346 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12347 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12348 if (!IsBaseUndef.all()) {
12349 // Base is not undef, need to combine it with the next subvectors.
12350 std::pair<T *, bool> Res =
12351 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12352 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12353 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12354 if (Mask[Idx] == PoisonMaskElem)
12355 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12356 else
12357 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12358 }
12359 auto *V = ValueSelect::get<T *>(Base);
12360 (void)V;
12361 assert((!V || GetVF(V) == Mask.size()) &&
12362 "Expected base vector of VF number of elements.");
12363 Prev = Action(Mask, {nullptr, Res.first});
12364 } else if (ShuffleMask.size() == 1) {
12365 // Base is undef and only 1 vector is shuffled - perform the action only for
12366 // a single vector, if the mask is not the identity mask.
12367 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12368 /*ForSingleMask=*/true);
12369 if (Res.second)
12370 // Identity mask is found.
12371 Prev = Res.first;
12372 else
12373 Prev = Action(Mask, {ShuffleMask.begin()->first});
12374 } else {
12375 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
12376 // shuffles step by step, combining the shuffles between the steps.
12377 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12378 unsigned Vec2VF = GetVF(VMIt->first);
12379 if (Vec1VF == Vec2VF) {
12380 // No need to resize the input vectors since they are of the same size, we
12381 // can shuffle them directly.
12382 ArrayRef<int> SecMask = VMIt->second;
12383 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12384 if (SecMask[I] != PoisonMaskElem) {
12385 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12386 Mask[I] = SecMask[I] + Vec1VF;
12387 }
12388 }
12389 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12390 } else {
12391 // Vectors of different sizes - resize and reshuffle.
12392 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12393 /*ForSingleMask=*/false);
12394 std::pair<T *, bool> Res2 =
12395 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12396 ArrayRef<int> SecMask = VMIt->second;
12397 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12398 if (Mask[I] != PoisonMaskElem) {
12399 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12400 if (Res1.second)
12401 Mask[I] = I;
12402 } else if (SecMask[I] != PoisonMaskElem) {
12403 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12404 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12405 }
12406 }
12407 Prev = Action(Mask, {Res1.first, Res2.first});
12408 }
12409 VMIt = std::next(VMIt);
12410 }
12411 bool IsBaseNotUndef = !IsBaseUndef.all();
12412 (void)IsBaseNotUndef;
12413 // Perform requested actions for the remaining masks/vectors.
12414 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12415 // Shuffle other input vectors, if any.
12416 std::pair<T *, bool> Res =
12417 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12418 ArrayRef<int> SecMask = VMIt->second;
12419 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12420 if (SecMask[I] != PoisonMaskElem) {
12421 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12422 "Multiple uses of scalars.");
12423 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12424 } else if (Mask[I] != PoisonMaskElem) {
12425 Mask[I] = I;
12426 }
12427 }
12428 Prev = Action(Mask, {Prev, Res.first});
12429 }
12430 return Prev;
12431}
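// Illustrative note (not part of the original file): a small example of the
// mask combining performed above. With an undef Base, VF = 4 and two shuffle
// masks {0,1,-1,-1} and {-1,-1,2,3} over vectors of the same VF, the lanes
// taken from the second vector are offset by VF, so Action() is invoked with
// the combined two-source mask {0,1,6,7}.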
12432
12433namespace {
12434/// Data type for handling buildvector sequences with the reused scalars from
12435/// other tree entries.
12436template <typename T> struct ShuffledInsertData {
12437 /// List of insertelements to be replaced by shuffles.
12438 SmallVector<InsertElementInst *> InsertElements;
12439 /// The parent vectors and shuffle mask for the given list of inserts.
12440 MapVector<T, SmallVector<int>> ValueMasks;
12441};
12442} // namespace
12443
12444InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12445 InstructionCost Cost = 0;
12446 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12447 << VectorizableTree.size() << ".\n");
12448
12449 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12450
12451 SmallPtrSet<Value *, 4> CheckedExtracts;
12452 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12453 TreeEntry &TE = *VectorizableTree[I];
12454 // No need to count the cost for combined entries, they are combined and
12455 // just skip their cost.
12456 if (TE.State == TreeEntry::CombinedVectorize) {
12457 LLVM_DEBUG(
12458 dbgs() << "SLP: Skipping cost for combined node that starts with "
12459 << *TE.Scalars[0] << ".\n";
12460 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12461 continue;
12462 }
12463 if (TE.isGather()) {
12464 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12465 E && E->getVectorFactor() == TE.getVectorFactor() &&
12466 E->isSame(TE.Scalars)) {
12467 // Some gather nodes might be absolutely the same as some vectorizable
12468 // nodes after reordering, need to handle it.
12469 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12470 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12471 << "SLP: Current total cost = " << Cost << "\n");
12472 continue;
12473 }
12474 }
12475
12476 // Exclude cost of gather load nodes which are not used. These nodes were
12477 // built as part of the final attempt to vectorize gathered loads.
12478 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12479 "Expected gather nodes with users only.");
12480
12481 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12482 Cost += C;
12483 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12484 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12485 << "SLP: Current total cost = " << Cost << "\n");
12486 }
12487
12488 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12489 InstructionCost ExtractCost = 0;
12490 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
12491 SmallVector<APInt> DemandedElts;
12492 SmallDenseSet<Value *, 4> UsedInserts;
12493 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
12494 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12495 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
12496 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12497 // Keep track {Scalar, Index, User} tuple.
12498 // On AArch64, this helps in fusing a mov instruction, associated with
12499 // extractelement, with fmul in the backend so that extractelement is free.
12500 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
12501 for (ExternalUser &EU : ExternalUses) {
12502 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12503 }
12504 for (ExternalUser &EU : ExternalUses) {
12505 // Uses by ephemeral values are free (because the ephemeral value will be
12506 // removed prior to code generation, and so the extraction will be
12507 // removed as well).
12508 if (EphValues.count(EU.User))
12509 continue;
12510
12511 // Used in unreachable blocks or in EH pads (rarely executed) or is
12512 // terminated with unreachable instruction.
12513 if (BasicBlock *UserParent =
12514 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12515 UserParent &&
12516 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12517 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12518 continue;
12519
12520 // We only add extract cost once for the same scalar.
12521 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12522 !ExtractCostCalculated.insert(EU.Scalar).second)
12523 continue;
12524
12525 // No extract cost for vector "scalar"
12526 if (isa<FixedVectorType>(EU.Scalar->getType()))
12527 continue;
12528
12529 // If found user is an insertelement, do not calculate extract cost but try
12530 // to detect it as a final shuffled/identity match.
12531 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12532 VU && VU->getOperand(1) == EU.Scalar) {
12533 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12534 if (!UsedInserts.insert(VU).second)
12535 continue;
12536 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12537 if (InsertIdx) {
12538 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12539 auto *It = find_if(
12540 ShuffledInserts,
12541 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12542 // Checks if 2 insertelements are from the same buildvector.
12543 InsertElementInst *VecInsert = Data.InsertElements.front();
12544              return areTwoInsertFromSameBuildVector(
12545 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12546 Value *Op0 = II->getOperand(0);
12547 if (getTreeEntry(II) && !getTreeEntry(Op0))
12548 return nullptr;
12549 return Op0;
12550 });
12551 });
12552 int VecId = -1;
12553 if (It == ShuffledInserts.end()) {
12554 auto &Data = ShuffledInserts.emplace_back();
12555 Data.InsertElements.emplace_back(VU);
12556 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12557 VecId = ShuffledInserts.size() - 1;
12558 auto It = MinBWs.find(ScalarTE);
12559 if (It != MinBWs.end() &&
12560 VectorCasts
12561 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12562 .second) {
12563 unsigned BWSz = It->second.first;
12564 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12565 unsigned VecOpcode;
12566 if (DstBWSz < BWSz)
12567 VecOpcode = Instruction::Trunc;
12568 else
12569 VecOpcode =
12570 It->second.second ? Instruction::SExt : Instruction::ZExt;
12573 VecOpcode, FTy,
12574 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12575 FTy->getNumElements()),
12577 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12578 << " for extending externally used vector with "
12579 "non-equal minimum bitwidth.\n");
12580 Cost += C;
12581 }
12582 } else {
12583 if (isFirstInsertElement(VU, It->InsertElements.front()))
12584 It->InsertElements.front() = VU;
12585 VecId = std::distance(ShuffledInserts.begin(), It);
12586 }
12587 int InIdx = *InsertIdx;
12588 SmallVectorImpl<int> &Mask =
12589 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12590 if (Mask.empty())
12591 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12592 Mask[InIdx] = EU.Lane;
12593 DemandedElts[VecId].setBit(InIdx);
12594 continue;
12595 }
12596 }
12597 }
12598
12599    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12600 // If we plan to rewrite the tree in a smaller type, we will need to sign
12601 // extend the extracted value back to the original type. Here, we account
12602 // for the extract and the added cost of the sign extend if needed.
12603 InstructionCost ExtraCost = TTI::TCC_Free;
12604 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12605 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12606 auto It = MinBWs.find(Entry);
12607 if (It != MinBWs.end()) {
12608 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12609 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12610 ? Instruction::ZExt
12611 : Instruction::SExt;
12612 VecTy = getWidenedType(MinTy, BundleWidth);
12613 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12614 VecTy, EU.Lane);
12615 } else {
12616 ExtraCost =
12617 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12618 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12619 }
12620 // Leave the scalar instructions as is if they are cheaper than extracts.
12621 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12622 Entry->getOpcode() == Instruction::Load) {
12623 // Checks if the user of the external scalar is phi in loop body.
12624 auto IsPhiInLoop = [&](const ExternalUser &U) {
12625 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12626 auto *I = cast<Instruction>(U.Scalar);
12627 const Loop *L = LI->getLoopFor(Phi->getParent());
12628 return L && (Phi->getParent() == I->getParent() ||
12629 L == LI->getLoopFor(I->getParent()));
12630 }
12631 return false;
12632 };
12633 if (!ValueToExtUses) {
12634 ValueToExtUses.emplace();
12635 for_each(enumerate(ExternalUses), [&](const auto &P) {
12636 // Ignore phis in loops.
12637 if (IsPhiInLoop(P.value()))
12638 return;
12639
12640 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12641 });
12642 }
12643 // Can use original instruction, if no operands vectorized or they are
12644 // marked as externally used already.
12645 auto *Inst = cast<Instruction>(EU.Scalar);
12646 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12647 auto OperandIsScalar = [&](Value *V) {
12648 if (!getTreeEntry(V)) {
12649 // Some extractelements might be not vectorized, but
12650 // transformed into shuffle and removed from the function,
12651 // consider it here.
12652 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12653 return !EE->hasOneUse() || !MustGather.contains(EE);
12654 return true;
12655 }
12656 return ValueToExtUses->contains(V);
12657 };
12658 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12659 bool CanBeUsedAsScalarCast = false;
12660 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12661 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12662 Op && all_of(Op->operands(), OperandIsScalar)) {
12663 InstructionCost OpCost =
12664 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12665                  ? TTI->getInstructionCost(Op, CostKind)
12666 : 0;
12667 if (ScalarCost + OpCost <= ExtraCost) {
12668 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12669 ScalarCost += OpCost;
12670 }
12671 }
12672 }
12673 if (CanBeUsedAsScalar) {
12674 bool KeepScalar = ScalarCost <= ExtraCost;
12675 // Try to keep the original scalar if the user is a phi node from the same
12676 // block as the root phis, currently vectorized. This keeps
12677 // better ordering info for the PHIs currently being vectorized.
12678 bool IsProfitablePHIUser =
12679 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12680 VectorizableTree.front()->Scalars.size() > 2)) &&
12681 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12682 !Inst->hasNUsesOrMore(UsesLimit) &&
12683 none_of(Inst->users(),
12684 [&](User *U) {
12685 auto *PHIUser = dyn_cast<PHINode>(U);
12686 return (!PHIUser ||
12687 PHIUser->getParent() !=
12688 cast<Instruction>(
12689 VectorizableTree.front()->getMainOp())
12690 ->getParent()) &&
12691 !getTreeEntry(U);
12692 }) &&
12693 count_if(Entry->Scalars, [&](Value *V) {
12694 return ValueToExtUses->contains(V);
12695 }) <= 2;
12696 if (IsProfitablePHIUser) {
12697 KeepScalar = true;
12698 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12699 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12700 (!GatheredLoadsEntriesFirst.has_value() ||
12701 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12702 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12703 return ValueToExtUses->contains(V);
12704 });
12705 auto It = ExtractsCount.find(Entry);
12706 if (It != ExtractsCount.end()) {
12707 assert(ScalarUsesCount >= It->getSecond().size() &&
12708 "Expected total number of external uses not less than "
12709 "number of scalar uses.");
12710 ScalarUsesCount -= It->getSecond().size();
12711 }
12712 // Keep the original scalar if the number of externally used instructions in
12713 // the same entry is not a power of 2. It may help to do some extra
12714 // vectorization for now.
12715 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12716 }
12717 if (KeepScalar) {
12718 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12719 for_each(Inst->operands(), [&](Value *V) {
12720 auto It = ValueToExtUses->find(V);
12721 if (It != ValueToExtUses->end()) {
12722 // Replace all uses to avoid compiler crash.
12723 ExternalUses[It->second].User = nullptr;
12724 }
12725 });
12726 ExtraCost = ScalarCost;
12727 if (!IsPhiInLoop(EU))
12728 ExtractsCount[Entry].insert(Inst);
12729 if (CanBeUsedAsScalarCast) {
12730 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12731 // Update the users of the operands of the cast operand to avoid
12732 // compiler crash.
12733 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12734 for_each(IOp->operands(), [&](Value *V) {
12735 auto It = ValueToExtUses->find(V);
12736 if (It != ValueToExtUses->end()) {
12737 // Replace all uses to avoid compiler crash.
12738 ExternalUses[It->second].User = nullptr;
12739 }
12740 });
12741 }
12742 }
12743 }
12744 }
12745 }
12746
12747 ExtractCost += ExtraCost;
12748 }
12749 // Insert external uses for the operands of casts that are emitted as scalars
12750 // instead of extractelement instructions.
12751 for (Value *V : ScalarOpsFromCasts) {
12752 ExternalUsesAsOriginalScalar.insert(V);
12753 if (const TreeEntry *E = getTreeEntry(V)) {
12754 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12755 }
12756 }
12757 // Add reduced value cost, if resized.
12758 if (!VectorizedVals.empty()) {
12759 const TreeEntry &Root = *VectorizableTree.front();
12760 auto BWIt = MinBWs.find(&Root);
12761 if (BWIt != MinBWs.end()) {
12762 Type *DstTy = Root.Scalars.front()->getType();
12763 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12764 unsigned SrcSz =
12765 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12766 if (OriginalSz != SrcSz) {
12767 unsigned Opcode = Instruction::Trunc;
12768 if (OriginalSz > SrcSz)
12769 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12770 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12771 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12772 assert(SLPReVec && "Only supported by REVEC.");
12773 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12774 }
12775 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12778 }
12779 }
12780 }
12781
12782 InstructionCost SpillCost = getSpillCost();
12783 Cost += SpillCost + ExtractCost;
12784 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12785 bool) {
12786 InstructionCost C = 0;
12787 unsigned VF = Mask.size();
12788 unsigned VecVF = TE->getVectorFactor();
12789 if (VF != VecVF &&
12790 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12791         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12792 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12793 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12794 OrigMask.begin());
12795      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12796 getWidenedType(TE->getMainOp()->getType(), VecVF),
12797 OrigMask);
12798 LLVM_DEBUG(
12799 dbgs() << "SLP: Adding cost " << C
12800 << " for final shuffle of insertelement external users.\n";
12801 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12802 Cost += C;
12803 return std::make_pair(TE, true);
12804 }
12805 return std::make_pair(TE, false);
12806 };
12807 // Calculate the cost of the reshuffled vectors, if any.
12808 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12809 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12810 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12811 unsigned VF = 0;
12812 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12813                                  ArrayRef<const TreeEntry *> TEs) {
12814 assert((TEs.size() == 1 || TEs.size() == 2) &&
12815 "Expected exactly 1 or 2 tree entries.");
12816 if (TEs.size() == 1) {
12817 if (VF == 0)
12818 VF = TEs.front()->getVectorFactor();
12819 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12820 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12821 !all_of(enumerate(Mask), [=](const auto &Data) {
12822 return Data.value() == PoisonMaskElem ||
12823 (Data.index() < VF &&
12824 static_cast<int>(Data.index()) == Data.value());
12825 })) {
12828 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12829 << " for final shuffle of insertelement "
12830 "external users.\n";
12831 TEs.front()->dump();
12832 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12833 Cost += C;
12834 }
12835 } else {
12836 if (VF == 0) {
12837 if (TEs.front() &&
12838 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12839 VF = TEs.front()->getVectorFactor();
12840 else
12841 VF = Mask.size();
12842 }
12843 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12846 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12847 << " for final shuffle of vector node and external "
12848 "insertelement users.\n";
12849 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12850 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12851 Cost += C;
12852 }
12853 VF = Mask.size();
12854 return TEs.back();
12855 };
12856 (void)performExtractsShuffleAction<const TreeEntry>(
12857 MutableArrayRef(Vector.data(), Vector.size()), Base,
12858 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12859 EstimateShufflesCost);
12860    InstructionCost InsertCost = TTI->getScalarizationOverhead(
12861 cast<FixedVectorType>(
12862 ShuffledInserts[I].InsertElements.front()->getType()),
12863 DemandedElts[I],
12864 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12865 Cost -= InsertCost;
12866 }
12867
12868 // Add the cost for reduced value resize (if required).
12869 if (ReductionBitWidth != 0) {
12870 assert(UserIgnoreList && "Expected reduction tree.");
12871 const TreeEntry &E = *VectorizableTree.front();
12872 auto It = MinBWs.find(&E);
12873 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12874 unsigned SrcSize = It->second.first;
12875 unsigned DstSize = ReductionBitWidth;
12876 unsigned Opcode = Instruction::Trunc;
12877 if (SrcSize < DstSize) {
12878 bool IsArithmeticExtendedReduction =
12879 all_of(*UserIgnoreList, [](Value *V) {
12880 auto *I = cast<Instruction>(V);
12881 return is_contained({Instruction::Add, Instruction::FAdd,
12882 Instruction::Mul, Instruction::FMul,
12883 Instruction::And, Instruction::Or,
12884 Instruction::Xor},
12885 I->getOpcode());
12886 });
12887 if (IsArithmeticExtendedReduction)
12888 Opcode =
12889 Instruction::BitCast; // Handle it by getExtendedReductionCost
12890 else
12891 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12892 }
12893 if (Opcode != Instruction::BitCast) {
12894 auto *SrcVecTy =
12895 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12896 auto *DstVecTy =
12897 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12898 TTI::CastContextHint CCH = getCastContextHint(E);
12899 InstructionCost CastCost;
12900 switch (E.getOpcode()) {
12901 case Instruction::SExt:
12902 case Instruction::ZExt:
12903 case Instruction::Trunc: {
12904 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12905 CCH = getCastContextHint(*OpTE);
12906 break;
12907 }
12908 default:
12909 break;
12910 }
12911 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12912                                      TTI::TCK_RecipThroughput);
12913 Cost += CastCost;
12914 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12915 << " for final resize for reduction from " << SrcVecTy
12916 << " to " << DstVecTy << "\n";
12917 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12918 }
12919 }
12920 }
12921
12922#ifndef NDEBUG
12923 SmallString<256> Str;
12924 {
12925    raw_svector_ostream OS(Str);
12926 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12927 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12928 << "SLP: Total Cost = " << Cost << ".\n";
12929 }
12930 LLVM_DEBUG(dbgs() << Str);
12931 if (ViewSLPTree)
12932 ViewGraph(this, "SLP" + F->getName(), false, Str);
12933#endif
12934
12935 return Cost;
12936}
12937
12938/// Tries to find extractelement instructions with constant indices from a fixed
12939/// vector type and gathers such instructions into a group, which can likely
12940/// be matched as a shuffle of 1 or 2 input vectors. If this attempt is
12941/// successful, the matched scalars are replaced by poison values in \p VL for
12942/// future analysis.
12943std::optional<TTI::ShuffleKind>
12944BoUpSLP::tryToGatherSingleRegisterExtractElements(
12945    SmallVectorImpl<Value *> &VL, SmallVectorImpl<int> &Mask) const {
12946 // Scan list of gathered scalars for extractelements that can be represented
12947 // as shuffles.
12948  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
12949 SmallVector<int> UndefVectorExtracts;
12950 for (int I = 0, E = VL.size(); I < E; ++I) {
12951 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12952 if (!EI) {
12953 if (isa<UndefValue>(VL[I]))
12954 UndefVectorExtracts.push_back(I);
12955 continue;
12956 }
12957 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12958 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12959 continue;
12960 std::optional<unsigned> Idx = getExtractIndex(EI);
12961 // Undefined index.
12962 if (!Idx) {
12963 UndefVectorExtracts.push_back(I);
12964 continue;
12965 }
12966 if (Idx >= VecTy->getNumElements()) {
12967 UndefVectorExtracts.push_back(I);
12968 continue;
12969 }
12970 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12971 ExtractMask.reset(*Idx);
12972 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12973 UndefVectorExtracts.push_back(I);
12974 continue;
12975 }
12976 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12977 }
12978 // Sort the vector operands by the maximum number of uses in extractelements.
12979  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
12980 VectorOpToIdx.takeVector();
12981 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12982 return P1.second.size() > P2.second.size();
12983 });
12984 // Find the best pair of the vectors or a single vector.
12985 const int UndefSz = UndefVectorExtracts.size();
12986 unsigned SingleMax = 0;
12987 unsigned PairMax = 0;
12988 if (!Vectors.empty()) {
12989 SingleMax = Vectors.front().second.size() + UndefSz;
12990 if (Vectors.size() > 1) {
12991 auto *ItNext = std::next(Vectors.begin());
12992 PairMax = SingleMax + ItNext->second.size();
12993 }
12994 }
12995 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12996 return std::nullopt;
12997 // Check whether it is better to perform a shuffle of 2 vectors or just of a
12998 // single vector.
12999 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
13000 SmallVector<Value *> GatheredExtracts(
13001 VL.size(), PoisonValue::get(VL.front()->getType()));
13002 if (SingleMax >= PairMax && SingleMax) {
13003 for (int Idx : Vectors.front().second)
13004 std::swap(GatheredExtracts[Idx], VL[Idx]);
13005 } else if (!Vectors.empty()) {
13006 for (unsigned Idx : {0, 1})
13007 for (int Idx : Vectors[Idx].second)
13008 std::swap(GatheredExtracts[Idx], VL[Idx]);
13009 }
13010 // Add extracts from undefs too.
13011 for (int Idx : UndefVectorExtracts)
13012 std::swap(GatheredExtracts[Idx], VL[Idx]);
13013 // Check that the gather of extractelements can be represented as just a
13014 // shuffle of one or two vectors from which the scalars are extracted.
13015 std::optional<TTI::ShuffleKind> Res =
13016 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13017 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
13018 // TODO: try to check other subsets if possible.
13019 // Restore the original VL if attempt was not successful.
13020 copy(SavedVL, VL.begin());
13021 return std::nullopt;
13022 }
13023 // Restore unused scalars from mask, if some of the extractelements were not
13024 // selected for shuffle.
13025 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
13026 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13027 isa<UndefValue>(GatheredExtracts[I])) {
13028 std::swap(VL[I], GatheredExtracts[I]);
13029 continue;
13030 }
13031 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13032 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13033 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13034 is_contained(UndefVectorExtracts, I))
13035 continue;
13036 }
13037 return Res;
13038}
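// Illustrative note (not part of the original file): as an example of the
// routine above, a gather list such as
//   { extractelement %v, 0, extractelement %v, 2, undef, extractelement %v, 3 }
// taken from a single fixed vector %v can be represented as a single-source
// shuffle with mask {0, 2, -1, 3}; the matched scalars are then replaced by
// poison in VL and the detected TTI::ShuffleKind is returned.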
13039
13040/// Tries to find extractelement instructions with constant indices from a fixed
13041/// vector type and gathers such instructions into a group, which can likely
13042/// be matched as a shuffle of 1 or 2 input vectors. If this attempt is
13043/// successful, the matched scalars are replaced by poison values in \p VL for
13044/// future analysis.
13045SmallVector<std::optional<TTI::ShuffleKind>>
13046BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13047                                    SmallVectorImpl<int> &Mask,
13048 unsigned NumParts) const {
13049 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
13050 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13051 Mask.assign(VL.size(), PoisonMaskElem);
13052 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13053 for (unsigned Part : seq<unsigned>(NumParts)) {
13054 // Scan list of gathered scalars for extractelements that can be represented
13055 // as shuffles.
13056    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
13057 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13058 SmallVector<int> SubMask;
13059 std::optional<TTI::ShuffleKind> Res =
13060 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13061 ShufflesRes[Part] = Res;
13062 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13063 }
13064 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13065 return Res.has_value();
13066 }))
13067 ShufflesRes.clear();
13068 return ShufflesRes;
13069}
13070
13071std::optional<TargetTransformInfo::ShuffleKind>
13072BoUpSLP::isGatherShuffledSingleRegisterEntry(
13073 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13074 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13075 Entries.clear();
13076 // TODO: currently checking only for Scalars in the tree entry, need to count
13077 // reused elements too for better cost estimation.
13078 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13079 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13080 : TE->UserTreeIndices.front();
13081 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13082 const BasicBlock *TEInsertBlock = nullptr;
13083 // Main node of PHI entries keeps the correct order of operands/incoming
13084 // blocks.
13085 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13086 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13087 TEInsertPt = TEInsertBlock->getTerminator();
13088 } else {
13089 TEInsertBlock = TEInsertPt->getParent();
13090 }
13091 if (!DT->isReachableFromEntry(TEInsertBlock))
13092 return std::nullopt;
13093 auto *NodeUI = DT->getNode(TEInsertBlock);
13094 assert(NodeUI && "Should only process reachable instructions");
13095 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13096 auto CheckOrdering = [&](const Instruction *InsertPt) {
13097 // Argument InsertPt is an instruction where vector code for some other
13098 // tree entry (one that shares one or more scalars with TE) is going to be
13099 // generated. This lambda returns true if insertion point of vector code
13100 // for the TE dominates that point (otherwise dependency is the other way
13101 // around). The other node is not limited to be of a gather kind. Gather
13102 // nodes are not scheduled and their vector code is inserted before their
13103 // first user. If user is PHI, that is supposed to be at the end of a
13104 // predecessor block. Otherwise it is the last instruction among scalars of
13105 // the user node. So, instead of checking dependency between instructions
13106 // themselves, we check dependency between their insertion points for vector
13107 // code (since each scalar instruction ends up as a lane of a vector
13108 // instruction).
13109 const BasicBlock *InsertBlock = InsertPt->getParent();
13110 auto *NodeEUI = DT->getNode(InsertBlock);
13111 if (!NodeEUI)
13112 return false;
13113 assert((NodeUI == NodeEUI) ==
13114 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13115 "Different nodes should have different DFS numbers");
13116 // Check the order of the gather nodes users.
13117 if (TEInsertPt->getParent() != InsertBlock &&
13118 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13119 return false;
13120 if (TEInsertPt->getParent() == InsertBlock &&
13121 TEInsertPt->comesBefore(InsertPt))
13122 return false;
13123 return true;
13124 };
13125 // Find all tree entries used by the gathered values. If no common entries
13126 // found - not a shuffle.
13127 // Here we build a set of tree nodes for each gathered value and try to
13128 // find the intersection between these sets. If we have at least one common
13129 // tree node for each gathered value - we have just a permutation of a
13130 // single vector. If we have 2 different sets, we are in a situation where we
13131 // have a permutation of 2 input vectors.
13132  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
13133 DenseMap<Value *, int> UsedValuesEntry;
13134 for (Value *V : VL) {
13135 if (isConstant(V))
13136 continue;
13137 // Build a list of tree entries where V is used.
13138    SmallPtrSet<const TreeEntry *, 4> VToTEs;
13139 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13140 if (TEPtr == TE || TEPtr->Idx == 0)
13141 continue;
13142 assert(any_of(TEPtr->Scalars,
13143 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13144 "Must contain at least single gathered value.");
13145 assert(TEPtr->UserTreeIndices.size() == 1 &&
13146 "Expected only single user of a gather node.");
13147 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13148
13149 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13150 const Instruction *InsertPt =
13151 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13152 : &getLastInstructionInBundle(UseEI.UserTE);
13153 if (TEInsertPt == InsertPt) {
13154 // If 2 gathers are operands of the same entry (regardless of whether
13155 // user is PHI or else), compare operands indices, use the earlier one
13156 // as the base.
13157 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13158 continue;
13159 // If the user instruction is used for some reason in different
13160 // vectorized nodes - make it depend on index.
13161 if (TEUseEI.UserTE != UseEI.UserTE &&
13162 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13163 continue;
13164 }
13165
13166 // Check if the user node of the TE comes after user node of TEPtr,
13167 // otherwise TEPtr depends on TE.
13168 if ((TEInsertBlock != InsertPt->getParent() ||
13169 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13170 !CheckOrdering(InsertPt))
13171 continue;
13172 VToTEs.insert(TEPtr);
13173 }
13174 if (const TreeEntry *VTE = getTreeEntry(V)) {
13175 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13176 if (VTE->State != TreeEntry::Vectorize) {
13177 auto It = MultiNodeScalars.find(V);
13178 if (It == MultiNodeScalars.end())
13179 continue;
13180 VTE = *It->getSecond().begin();
13181 // Iterate through all vectorized nodes.
13182 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13183 return MTE->State == TreeEntry::Vectorize;
13184 });
13185 if (MIt == It->getSecond().end())
13186 continue;
13187 VTE = *MIt;
13188 }
13189 }
13190 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13191 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13192 continue;
13193 VToTEs.insert(VTE);
13194 }
13195 if (VToTEs.empty())
13196 continue;
13197 if (UsedTEs.empty()) {
13198 // The first iteration, just insert the list of nodes to vector.
13199 UsedTEs.push_back(VToTEs);
13200 UsedValuesEntry.try_emplace(V, 0);
13201 } else {
13202 // Need to check if there are any previously used tree nodes which use V.
13203 // If there are no such nodes, consider that we have another input
13204 // vector.
13205 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13206 unsigned Idx = 0;
13207 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13208 // Do we have a non-empty intersection of previously listed tree entries
13209 // and tree entries using current V?
13210 set_intersect(VToTEs, Set);
13211 if (!VToTEs.empty()) {
13212 // Yes, write the new subset and continue analysis for the next
13213 // scalar.
13214 Set.swap(VToTEs);
13215 break;
13216 }
13217 VToTEs = SavedVToTEs;
13218 ++Idx;
13219 }
13220 // No non-empty intersection found - need to add a second set of possible
13221 // source vectors.
13222 if (Idx == UsedTEs.size()) {
13223 // If the number of input vectors is greater than 2 - not a permutation,
13224 // fallback to the regular gather.
13225 // TODO: support multiple reshuffled nodes.
13226 if (UsedTEs.size() == 2)
13227 continue;
13228 UsedTEs.push_back(SavedVToTEs);
13229 Idx = UsedTEs.size() - 1;
13230 }
13231 UsedValuesEntry.try_emplace(V, Idx);
13232 }
13233 }
13234
13235 if (UsedTEs.empty()) {
13236 Entries.clear();
13237 return std::nullopt;
13238 }
13239
13240 unsigned VF = 0;
13241 if (UsedTEs.size() == 1) {
13242 // Keep the order to avoid non-determinism.
13243 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13244 UsedTEs.front().end());
13245 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13246 return TE1->Idx < TE2->Idx;
13247 });
13248 // Try to find the perfect match in another gather node at first.
13249 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13250 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13251 });
13252 if (It != FirstEntries.end() &&
13253 ((*It)->getVectorFactor() == VL.size() ||
13254 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13255 TE->ReuseShuffleIndices.size() == VL.size() &&
13256 (*It)->isSame(TE->Scalars)))) {
13257 Entries.push_back(*It);
13258 if ((*It)->getVectorFactor() == VL.size()) {
13259 std::iota(std::next(Mask.begin(), Part * VL.size()),
13260 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13261 } else {
13262 SmallVector<int> CommonMask = TE->getCommonMask();
13263 copy(CommonMask, Mask.begin());
13264 }
13265 // Clear undef scalars.
13266 for (unsigned I : seq<unsigned>(VL.size()))
13267 if (isa<PoisonValue>(VL[I]))
13268 Mask[Part * VL.size() + I] = PoisonMaskElem;
13269      return TargetTransformInfo::SK_PermuteSingleSrc;
13270 }
13271 // No perfect match, just shuffle, so choose the first tree node from the
13272 // tree.
13273 Entries.push_back(FirstEntries.front());
13274 VF = FirstEntries.front()->getVectorFactor();
13275 } else {
13276 // Try to find nodes with the same vector factor.
13277 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13278 // Keep the order of tree nodes to avoid non-determinism.
13279    DenseMap<unsigned, const TreeEntry *> VFToTE;
13280 for (const TreeEntry *TE : UsedTEs.front()) {
13281 unsigned VF = TE->getVectorFactor();
13282 auto It = VFToTE.find(VF);
13283 if (It != VFToTE.end()) {
13284 if (It->second->Idx > TE->Idx)
13285 It->getSecond() = TE;
13286 continue;
13287 }
13288 VFToTE.try_emplace(VF, TE);
13289 }
13290 // Same, keep the order to avoid non-determinism.
13291 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13292 UsedTEs.back().end());
13293 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13294 return TE1->Idx < TE2->Idx;
13295 });
13296 for (const TreeEntry *TE : SecondEntries) {
13297 auto It = VFToTE.find(TE->getVectorFactor());
13298 if (It != VFToTE.end()) {
13299 VF = It->first;
13300 Entries.push_back(It->second);
13301 Entries.push_back(TE);
13302 break;
13303 }
13304 }
13305 // No 2 source vectors with the same vector factor - just choose 2 with max
13306 // index.
13307 if (Entries.empty()) {
13308 Entries.push_back(*llvm::max_element(
13309 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13310 return TE1->Idx < TE2->Idx;
13311 }));
13312 Entries.push_back(SecondEntries.front());
13313 VF = std::max(Entries.front()->getVectorFactor(),
13314 Entries.back()->getVectorFactor());
13315 } else {
13316 VF = Entries.front()->getVectorFactor();
13317 }
13318 }
13319
13320 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13321 // Checks if the 2 PHIs are compatible, i.e. have a high likelihood of being
13322 // vectorized together.
13323 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13324 auto *PHI = cast<PHINode>(V);
13325 auto *PHI1 = cast<PHINode>(V1);
13326 // Check that all incoming values are compatible/from same parent (if they
13327 // are instructions).
13328 // The incoming values are compatible if they all are constants, or
13329 // instruction with the same/alternate opcodes from the same basic block.
13330 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13331 Value *In = PHI->getIncomingValue(I);
13332 Value *In1 = PHI1->getIncomingValue(I);
13333 if (isConstant(In) && isConstant(In1))
13334 continue;
13335 if (!getSameOpcode({In, In1}, *TLI))
13336 return false;
13337 if (cast<Instruction>(In)->getParent() !=
13338 cast<Instruction>(In1)->getParent())
13339 return false;
13340 }
13341 return true;
13342 };
13343 // Check if the value can be ignored during analysis for shuffled gathers.
13344 // We assume it is better to ignore instructions that do not form splats,
13345 // are not vectorized and are not extractelements (those will be handled
13346 // by the extractelement processing), or that may form a vector node in the future.
13347 auto MightBeIgnored = [=](Value *V) {
13348 auto *I = dyn_cast<Instruction>(V);
13349 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13351 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13352 };
13353 // Check that the neighbor instruction may form a full vector node with the
13354 // current instruction V. It is possible, if they have same/alternate opcode
13355 // and same parent basic block.
13356 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13357 Value *V1 = VL[Idx];
13358 bool UsedInSameVTE = false;
13359 auto It = UsedValuesEntry.find(V1);
13360 if (It != UsedValuesEntry.end())
13361 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13362 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13363 getSameOpcode({V, V1}, *TLI) &&
13364 cast<Instruction>(V)->getParent() ==
13365 cast<Instruction>(V1)->getParent() &&
13366 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13367 };
13368 // Build a shuffle mask for better cost estimation and vector emission.
13369 SmallBitVector UsedIdxs(Entries.size());
13370  SmallVector<std::pair<unsigned, int>> EntryLanes;
13371 for (int I = 0, E = VL.size(); I < E; ++I) {
13372 Value *V = VL[I];
13373 auto It = UsedValuesEntry.find(V);
13374 if (It == UsedValuesEntry.end())
13375 continue;
13376 // Do not try to shuffle scalars if they are constants or instructions
13377 // that can be vectorized as a result of the subsequent buildvector
13378 // vectorization.
13379 if (isConstant(V) || (MightBeIgnored(V) &&
13380 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13381 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13382 continue;
13383 unsigned Idx = It->second;
13384 EntryLanes.emplace_back(Idx, I);
13385 UsedIdxs.set(Idx);
13386 }
13387 // Iterate through all shuffled scalars and select entries, which can be used
13388 // for final shuffle.
13389  SmallVector<const TreeEntry *> TempEntries;
13390 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13391 if (!UsedIdxs.test(I))
13392 continue;
13393 // Fix the entry number for the given scalar. If it is the first entry, set
13394 // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
13395 // These indices are used when calculating final shuffle mask as the vector
13396 // offset.
13397 for (std::pair<unsigned, int> &Pair : EntryLanes)
13398 if (Pair.first == I)
13399 Pair.first = TempEntries.size();
13400 TempEntries.push_back(Entries[I]);
13401 }
13402 Entries.swap(TempEntries);
13403 if (EntryLanes.size() == Entries.size() &&
13404 !VL.equals(ArrayRef(TE->Scalars)
13405 .slice(Part * VL.size(),
13406 std::min<int>(VL.size(), TE->Scalars.size())))) {
13407 // We may have here 1 or 2 entries only. If the number of scalars is equal
13408 // to the number of entries, no need to do the analysis, it is not very
13409 // profitable. Since VL is not the same as TE->Scalars, it means we already
13410 // have some shuffles before. Cut off this unprofitable case.
13411 Entries.clear();
13412 return std::nullopt;
13413 }
13414 // Build the final mask, check for the identity shuffle, if possible.
13415 bool IsIdentity = Entries.size() == 1;
13416 // Pair.first is the offset to the vector, while Pair.second is the index of
13417 // scalar in the list.
13418 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13419 unsigned Idx = Part * VL.size() + Pair.second;
13420 Mask[Idx] =
13421 Pair.first * VF +
13422 (ForOrder ? std::distance(
13423 Entries[Pair.first]->Scalars.begin(),
13424 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13425 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13426 IsIdentity &= Mask[Idx] == Pair.second;
13427 }
13428 if (ForOrder || IsIdentity || Entries.empty()) {
13429 switch (Entries.size()) {
13430 case 1:
13431 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13432      return TargetTransformInfo::SK_PermuteSingleSrc;
13433 break;
13434 case 2:
13435 if (EntryLanes.size() > 2 || VL.size() <= 2)
13436      return TargetTransformInfo::SK_PermuteTwoSrc;
13437 break;
13438 default:
13439 break;
13440 }
13441 } else if (!isa<VectorType>(VL.front()->getType()) &&
13442 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13443 // Do the cost estimation if a shuffle is more beneficial than a buildvector.
13444 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13445 std::next(Mask.begin(), (Part + 1) * VL.size()));
13446 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13447 for (int Idx : SubMask) {
13448 if (Idx == PoisonMaskElem)
13449 continue;
13450 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13451 MinElement = Idx;
13452 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13453 MaxElement = Idx;
13454 }
13455 assert(MaxElement >= 0 && MinElement >= 0 &&
13456 MaxElement % VF >= MinElement % VF &&
13457 "Expected at least single element.");
13458 unsigned NewVF = std::max<unsigned>(
13459 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13460 (MaxElement % VF) -
13461 (MinElement % VF) + 1));
13462 if (NewVF < VF) {
13463 for_each(SubMask, [&](int &Idx) {
13464 if (Idx == PoisonMaskElem)
13465 return;
13466 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13467 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13468 });
13469 } else {
13470 NewVF = VF;
13471 }
13472
13473    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13474 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
13475 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13476 auto GetShuffleCost = [&,
13479 VectorType *VecTy) -> InstructionCost {
13480 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13482 Mask, Entries.front()->getInterleaveFactor()))
13483 return TTI::TCC_Free;
13484 return ::getShuffleCost(TTI,
13485 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13486                                                 : TTI::SK_PermuteSingleSrc,
13487 VecTy, Mask, CostKind);
13488 };
13489 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13490 InstructionCost FirstShuffleCost = 0;
13491 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13492 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13493 FirstShuffleCost = ShuffleCost;
13494 } else {
13495 // Transform mask to include only first entry.
13496 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13497 bool IsIdentity = true;
13498 for (auto [I, Idx] : enumerate(FirstMask)) {
13499 if (Idx >= static_cast<int>(NewVF)) {
13500          Idx = PoisonMaskElem;
13501 } else {
13502 DemandedElts.clearBit(I);
13503 if (Idx != PoisonMaskElem)
13504 IsIdentity &= static_cast<int>(I) == Idx;
13505 }
13506 }
13507 if (!IsIdentity)
13508 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13509 FirstShuffleCost += TTI->getScalarizationOverhead(
13510 MaskVecTy, DemandedElts, /*Insert=*/true,
13511 /*Extract=*/false, CostKind);
13512 }
13513 InstructionCost SecondShuffleCost = 0;
13514 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13515 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13516 SecondShuffleCost = ShuffleCost;
13517 } else {
13518 // Transform mask to include only the second entry.
13519 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13520 bool IsIdentity = true;
13521 for (auto [I, Idx] : enumerate(SecondMask)) {
13522 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
13523 Idx = PoisonMaskElem;
13524 } else {
13525 DemandedElts.clearBit(I);
13526 if (Idx != PoisonMaskElem) {
13527 Idx -= NewVF;
13528 IsIdentity &= static_cast<int>(I) == Idx;
13529 }
13530 }
13531 }
13532 if (!IsIdentity)
13533 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13534 SecondShuffleCost += TTI->getScalarizationOverhead(
13535 MaskVecTy, DemandedElts, /*Insert=*/true,
13536 /*Extract=*/false, CostKind);
13537 }
13538 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13539 for (auto [I, Idx] : enumerate(SubMask))
13540 if (Idx == PoisonMaskElem)
13541 DemandedElts.clearBit(I);
13542 InstructionCost BuildVectorCost =
13543 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13544 /*Extract=*/false, CostKind);
13545 const TreeEntry *BestEntry = nullptr;
13546 if (FirstShuffleCost < ShuffleCost) {
13547 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13548 std::next(Mask.begin(), (Part + 1) * VL.size()),
13549 [&](int &Idx) {
13550 if (Idx >= static_cast<int>(VF))
13551 Idx = PoisonMaskElem;
13552 });
13553 BestEntry = Entries.front();
13554 ShuffleCost = FirstShuffleCost;
13555 }
13556 if (SecondShuffleCost < ShuffleCost) {
13557 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13558 std::next(Mask.begin(), (Part + 1) * VL.size()),
13559 [&](int &Idx) {
13560 if (Idx < static_cast<int>(VF))
13561 Idx = PoisonMaskElem;
13562 else
13563 Idx -= VF;
13564 });
13565 BestEntry = Entries[1];
13566 ShuffleCost = SecondShuffleCost;
13567 }
13568 if (BuildVectorCost >= ShuffleCost) {
13569 if (BestEntry) {
13570 Entries.clear();
13571 Entries.push_back(BestEntry);
13572 }
13573 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13574 : TargetTransformInfo::SK_PermuteSingleSrc;
13575 }
13576 }
13577 Entries.clear();
13578 // Clear the corresponding mask elements.
13579 std::fill(std::next(Mask.begin(), Part * VL.size()),
13580 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13581 return std::nullopt;
13582}
13583
13584 SmallVector<std::optional<TTI::ShuffleKind>>
13585 BoUpSLP::isGatherShuffledEntry(
13586 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13587 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13588 bool ForOrder) {
13589 assert(NumParts > 0 && NumParts < VL.size() &&
13590 "Expected positive number of registers.");
13591 Entries.clear();
13592 // No need to check for the topmost gather node.
13593 if (TE == VectorizableTree.front().get() &&
13594 (!GatheredLoadsEntriesFirst.has_value() ||
13595 none_of(ArrayRef(VectorizableTree).drop_front(),
13596 [](const std::unique_ptr<TreeEntry> &TE) {
13597 return !TE->isGather();
13598 })))
13599 return {};
13600 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13601 if (TE->isNonPowOf2Vec())
13602 return {};
13603 Mask.assign(VL.size(), PoisonMaskElem);
13604 assert((TE->UserTreeIndices.size() == 1 ||
13605 TE == VectorizableTree.front().get()) &&
13606 "Expected only single user of the gather node.");
13607 assert(VL.size() % NumParts == 0 &&
13608 "Number of scalars must be divisible by NumParts.");
13609 if (!TE->UserTreeIndices.empty() &&
13610 TE->UserTreeIndices.front().UserTE->isGather() &&
13611 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13612 assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
13613 isSplat(TE->Scalars)) &&
13614 "Expected splat or extractelements only node.");
13615 return {};
13616 }
13617 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13618 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13619 for (unsigned Part : seq<unsigned>(NumParts)) {
13620 ArrayRef<Value *> SubVL =
13621 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13622 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13623 std::optional<TTI::ShuffleKind> SubRes =
13624 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13625 ForOrder);
13626 if (!SubRes)
13627 SubEntries.clear();
13628 Res.push_back(SubRes);
13629 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13630 SubEntries.front()->getVectorFactor() == VL.size() &&
13631 (SubEntries.front()->isSame(TE->Scalars) ||
13632 SubEntries.front()->isSame(VL))) {
13633 SmallVector<const TreeEntry *> LocalSubEntries;
13634 LocalSubEntries.swap(SubEntries);
13635 Entries.clear();
13636 Res.clear();
13637 std::iota(Mask.begin(), Mask.end(), 0);
13638 // Clear undef scalars.
13639 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13640 if (isa<PoisonValue>(VL[I]))
13641 Mask[I] = PoisonMaskElem;
13642 Entries.emplace_back(1, LocalSubEntries.front());
13643 Res.push_back(TTI::SK_PermuteSingleSrc);
13644 return Res;
13645 }
13646 }
13647 if (all_of(Res,
13648 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13649 Entries.clear();
13650 return {};
13651 }
13652 return Res;
13653}
13654
13655InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13656 Type *ScalarTy) const {
13657 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13658 bool DuplicateNonConst = false;
13659 // Find the cost of inserting/extracting values from the vector.
13660 // Check if the same elements are inserted several times and count them as
13661 // shuffle candidates.
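// E.g., for VL = {a, b, a, c} only the unique scalars {a, b, c} are costed
// as inserts; the repeated 'a' is folded into the final permute whose cost
// is added at the end of this function.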
13662 APInt ShuffledElements = APInt::getZero(VL.size());
13663 DenseMap<Value *, unsigned> UniqueElements;
13664 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13665 InstructionCost Cost;
13666 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13667 if (V->getType() != ScalarTy) {
13668 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13669 TTI::CastContextHint::None, CostKind);
13670 V = nullptr;
13671 }
13672 if (!ForPoisonSrc)
13673 Cost +=
13674 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13675 I, Constant::getNullValue(VecTy), V);
13676 };
13677 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13678 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13679 Value *V = VL[I];
13680 // No need to shuffle duplicates for constants.
13681 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13682 ShuffledElements.setBit(I);
13683 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13684 continue;
13685 }
13686
13687 auto Res = UniqueElements.try_emplace(V, I);
13688 if (Res.second) {
13689 EstimateInsertCost(I, V);
13690 ShuffleMask[I] = I;
13691 continue;
13692 }
13693
13694 DuplicateNonConst = true;
13695 ShuffledElements.setBit(I);
13696 ShuffleMask[I] = Res.first->second;
13697 }
13698 if (ForPoisonSrc) {
13699 if (isa<FixedVectorType>(ScalarTy)) {
13700 assert(SLPReVec && "Only supported by REVEC.");
13701 // We don't need to insert elements one by one. Instead, we can insert the
13702 // entire vector into the destination.
13703 Cost = 0;
13704 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13705 for (unsigned I : seq<unsigned>(VL.size()))
13706 if (!ShuffledElements[I])
13707 Cost += TTI->getShuffleCost(
13708 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13709 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13710 } else {
13711 Cost = TTI->getScalarizationOverhead(VecTy,
13712 /*DemandedElts*/ ~ShuffledElements,
13713 /*Insert*/ true,
13714 /*Extract*/ false, CostKind, VL);
13715 }
13716 }
13717 if (DuplicateNonConst)
13718 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13719 VecTy, ShuffleMask);
13720 return Cost;
13721}
13722
13723Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13724 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13725 if (Res)
13726 return *Res;
13727 // Get the basic block this bundle is in. All instructions in the bundle
13728 // should be in this block (except for extractelement-like instructions with
13729 // constant indices or gathered loads).
13730 auto *Front = E->getMainOp();
13731 auto *BB = Front->getParent();
13732 assert(((GatheredLoadsEntriesFirst.has_value() &&
13733 E->getOpcode() == Instruction::Load && E->isGather() &&
13734 E->Idx < *GatheredLoadsEntriesFirst) ||
13735 all_of(E->Scalars,
13736 [=](Value *V) -> bool {
13737 if (E->getOpcode() == Instruction::GetElementPtr &&
13738 !isa<GetElementPtrInst>(V))
13739 return true;
13740 auto *I = dyn_cast<Instruction>(V);
13741 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13742 isVectorLikeInstWithConstOps(I);
13743 })) &&
13744 "Expected gathered loads or GEPs or instructions from same basic "
13745 "block.");
13746
13747 auto FindLastInst = [&]() {
13748 Instruction *LastInst = Front;
13749 for (Value *V : E->Scalars) {
13750 auto *I = dyn_cast<Instruction>(V);
13751 if (!I)
13752 continue;
13753 if (LastInst->getParent() == I->getParent()) {
13754 if (LastInst->comesBefore(I))
13755 LastInst = I;
13756 continue;
13757 }
13758 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13759 !isa<GetElementPtrInst>(I)) ||
13760 (isVectorLikeInstWithConstOps(LastInst) &&
13761 isVectorLikeInstWithConstOps(I)) ||
13762 (GatheredLoadsEntriesFirst.has_value() &&
13763 E->getOpcode() == Instruction::Load && E->isGather() &&
13764 E->Idx < *GatheredLoadsEntriesFirst)) &&
13765 "Expected vector-like or non-GEP in GEP node insts only.");
13766 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13767 LastInst = I;
13768 continue;
13769 }
13770 if (!DT->isReachableFromEntry(I->getParent()))
13771 continue;
13772 auto *NodeA = DT->getNode(LastInst->getParent());
13773 auto *NodeB = DT->getNode(I->getParent());
13774 assert(NodeA && "Should only process reachable instructions");
13775 assert(NodeB && "Should only process reachable instructions");
13776 assert((NodeA == NodeB) ==
13777 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13778 "Different nodes should have different DFS numbers");
13779 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13780 LastInst = I;
13781 }
13782 BB = LastInst->getParent();
13783 return LastInst;
13784 };
13785
13786 auto FindFirstInst = [&]() {
13787 Instruction *FirstInst = Front;
13788 for (Value *V : E->Scalars) {
13789 auto *I = dyn_cast<Instruction>(V);
13790 if (!I)
13791 continue;
13792 if (FirstInst->getParent() == I->getParent()) {
13793 if (I->comesBefore(FirstInst))
13794 FirstInst = I;
13795 continue;
13796 }
13797 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13798 !isa<GetElementPtrInst>(I)) ||
13799 (isVectorLikeInstWithConstOps(FirstInst) &&
13800 isVectorLikeInstWithConstOps(I))) &&
13801 "Expected vector-like or non-GEP in GEP node insts only.");
13802 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13803 FirstInst = I;
13804 continue;
13805 }
13806 if (!DT->isReachableFromEntry(I->getParent()))
13807 continue;
13808 auto *NodeA = DT->getNode(FirstInst->getParent());
13809 auto *NodeB = DT->getNode(I->getParent());
13810 assert(NodeA && "Should only process reachable instructions");
13811 assert(NodeB && "Should only process reachable instructions");
13812 assert((NodeA == NodeB) ==
13813 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13814 "Different nodes should have different DFS numbers");
13815 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13816 FirstInst = I;
13817 }
13818 return FirstInst;
13819 };
13820
13821 // Set insertpoint for gathered loads to the very first load.
13822 if (GatheredLoadsEntriesFirst.has_value() &&
13823 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13824 E->getOpcode() == Instruction::Load) {
13825 Res = FindFirstInst();
13826 return *Res;
13827 }
13828
13829 // Set the insert point to the beginning of the basic block if the entry
13830 // should not be scheduled.
13831 if (doesNotNeedToSchedule(E->Scalars) ||
13832 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13833 if ((E->getOpcode() == Instruction::GetElementPtr &&
13834 any_of(E->Scalars,
13835 [](Value *V) {
13836 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13837 })) ||
13838 all_of(E->Scalars,
13839 [](Value *V) {
13840 return isa<PoisonValue>(V) ||
13841 (!isVectorLikeInstWithConstOps(V) &&
13842 isUsedOutsideBlock(V));
13843 }) ||
13844 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13845 return isa<ExtractElementInst, UndefValue>(V) ||
13846 areAllOperandsNonInsts(V);
13847 })))
13848 Res = FindLastInst();
13849 else
13850 Res = FindFirstInst();
13851 return *Res;
13852 }
13853
13854 // Find the last instruction. The common case should be that BB has been
13855 // scheduled, and the last instruction is VL.back(). So we start with
13856 // VL.back() and iterate over schedule data until we reach the end of the
13857 // bundle. The end of the bundle is marked by null ScheduleData.
13858 if (BlocksSchedules.count(BB) && !E->isGather()) {
13859 Value *V = E->isOneOf(E->Scalars.back());
13860 if (doesNotNeedToBeScheduled(V))
13861 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13862 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13863 if (Bundle && Bundle->isPartOfBundle())
13864 for (; Bundle; Bundle = Bundle->NextInBundle)
13865 Res = Bundle->Inst;
13866 }
13867
13868 // LastInst can still be null at this point if there's either not an entry
13869 // for BB in BlocksSchedules or there's no ScheduleData available for
13870 // VL.back(). This can be the case if buildTree_rec aborts for various
13871 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13872 // size is reached, etc.). ScheduleData is initialized in the scheduling
13873 // "dry-run".
13874 //
13875 // If this happens, we can still find the last instruction by brute force. We
13876 // iterate forwards from Front (inclusive) until we either see all
13877 // instructions in the bundle or reach the end of the block. If Front is the
13878 // last instruction in program order, LastInst will be set to Front, and we
13879 // will visit all the remaining instructions in the block.
13880 //
13881 // One of the reasons we exit early from buildTree_rec is to place an upper
13882 // bound on compile-time. Thus, taking an additional compile-time hit here is
13883 // not ideal. However, this should be exceedingly rare since it requires that
13884 // we both exit early from buildTree_rec and that the bundle be out-of-order
13885 // (causing us to iterate all the way to the end of the block).
13886 if (!Res)
13887 Res = FindLastInst();
13888 assert(Res && "Failed to find last instruction in bundle");
13889 return *Res;
13890}
13891
13892void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13893 auto *Front = E->getMainOp();
13894 Instruction *LastInst = &getLastInstructionInBundle(E);
13895 assert(LastInst && "Failed to find last instruction in bundle");
13896 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13897 // If the instruction is PHI, set the insert point after all the PHIs.
13898 bool IsPHI = isa<PHINode>(LastInst);
13899 if (IsPHI)
13900 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13901 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13902 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13903 } else {
13904 // Set the insertion point after the last instruction in the bundle. Set the
13905 // debug location to Front.
13906 Builder.SetInsertPoint(
13907 LastInst->getParent(),
13908 LastInst->getNextNonDebugInstruction()->getIterator());
13909 }
13910 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13911}
13912
13913Value *BoUpSLP::gather(
13914 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13915 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13916 // List of instructions/lanes from the current block and/or the blocks which
13917 // are part of the current loop. These instructions will be inserted at the
13918 // end to make it possible to optimize loops and hoist invariant instructions
13919 // out of the loop's body with better chances for success.
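// E.g., if the insert point is inside a loop and a gathered scalar is defined
// in that same loop, emitting its insertelement last keeps the earlier,
// loop-invariant inserts together so they can be hoisted more easily.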
13920 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
13921 SmallSet<int, 4> PostponedIndices;
13922 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13923 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13924 SmallPtrSet<BasicBlock *, 4> Visited;
13925 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13926 InsertBB = InsertBB->getSinglePredecessor();
13927 return InsertBB && InsertBB == InstBB;
13928 };
13929 for (int I = 0, E = VL.size(); I < E; ++I) {
13930 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13931 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13932 getTreeEntry(Inst) ||
13933 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13934 PostponedIndices.insert(I).second)
13935 PostponedInsts.emplace_back(Inst, I);
13936 }
13937
13938 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13939 Type *Ty) {
13940 Value *Scalar = V;
13941 if (Scalar->getType() != Ty) {
13942 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13943 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13944 Value *V = Scalar;
13945 if (auto *CI = dyn_cast<CastInst>(Scalar);
13946 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13947 Value *Op = CI->getOperand(0);
13948 if (auto *IOp = dyn_cast<Instruction>(Op);
13949 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13950 V = Op;
13951 }
13952 Scalar = Builder.CreateIntCast(
13953 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13954 }
13955
13956 Instruction *InsElt;
13957 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13958 assert(SLPReVec && "FixedVectorType is not expected.");
13959 Vec = InsElt = cast<Instruction>(createInsertVector(
13960 Builder, Vec, Scalar, Pos * getNumElements(VecTy)));
13961 auto *II = dyn_cast<IntrinsicInst>(InsElt);
13962 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13963 return Vec;
13964 } else {
13965 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13966 InsElt = dyn_cast<InsertElementInst>(Vec);
13967 if (!InsElt)
13968 return Vec;
13969 }
13970 GatherShuffleExtractSeq.insert(InsElt);
13971 CSEBlocks.insert(InsElt->getParent());
13972 // Add to our 'need-to-extract' list.
13973 if (isa<Instruction>(V)) {
13974 if (TreeEntry *Entry = getTreeEntry(V)) {
13975 // Find which lane we need to extract.
13976 User *UserOp = nullptr;
13977 if (Scalar != V) {
13978 if (auto *SI = dyn_cast<Instruction>(Scalar))
13979 UserOp = SI;
13980 } else {
13981 UserOp = InsElt;
13982 }
13983 if (UserOp) {
13984 unsigned FoundLane = Entry->findLaneForValue(V);
13985 ExternalUses.emplace_back(V, UserOp, FoundLane);
13986 }
13987 }
13988 }
13989 return Vec;
13990 };
13991 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13992 Value *Vec = PoisonValue::get(VecTy);
13993 SmallVector<int> NonConsts;
13994 SmallVector<int> Mask(VL.size(), PoisonMaskElem);
13995 std::iota(Mask.begin(), Mask.end(), 0);
13996 Value *OriginalRoot = Root;
13997 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13998 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13999 SV->getOperand(0)->getType() == VecTy) {
14000 Root = SV->getOperand(0);
14001 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14002 }
14003 // Insert constant values at first.
14004 for (int I = 0, E = VL.size(); I < E; ++I) {
14005 if (PostponedIndices.contains(I))
14006 continue;
14007 if (!isConstant(VL[I])) {
14008 NonConsts.push_back(I);
14009 continue;
14010 }
14011 if (isa<PoisonValue>(VL[I]))
14012 continue;
14013 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14014 Mask[I] = I + E;
14015 }
14016 if (Root) {
14017 if (isa<PoisonValue>(Vec)) {
14018 Vec = OriginalRoot;
14019 } else {
14020 Vec = CreateShuffle(Root, Vec, Mask);
14021 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14022 OI && OI->hasNUses(0) &&
14023 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14024 return TE->VectorizedValue == OI;
14025 }))
14026 eraseInstruction(OI);
14027 }
14028 }
14029 // Insert non-constant values.
14030 for (int I : NonConsts)
14031 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14032 // Append instructions, which are/may be part of the loop, in the end to make
14033 // it possible to hoist non-loop-based instructions.
14034 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14035 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14036
14037 return Vec;
14038}
14039
14040/// Merges shuffle masks and emits final shuffle instruction, if required. It
14041 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
14042 /// the actual shuffle instruction is generated only if it is really required.
14043 /// Otherwise, the shuffle instruction emission is delayed until the end of the
14044 /// process, to reduce the number of emitted instructions and to simplify
14045 /// further analysis/transformations.
14046 /// The class will also look through the previously emitted shuffle
14047 /// instructions and properly mark indices in the mask as undef.
14048/// For example, given the code
14049/// \code
14050/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14051/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14052/// \endcode
14053 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14054/// look through %s1 and %s2 and emit
14055/// \code
14056/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14057/// \endcode
14058/// instead.
14059 /// If the 2 operands are of different sizes, the smaller one will be resized
14060 /// and the mask recalculated accordingly.
14061/// For example, given the code
14062/// \code
14063/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14064/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14065/// \endcode
14066 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14067/// look through %s1 and %s2 and emit
14068/// \code
14069/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14070/// \endcode
14071/// instead.
14072class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14073 bool IsFinalized = false;
14074 /// Combined mask for all applied operands and masks. It is built during
14075 /// analysis and actual emission of shuffle vector instructions.
14076 SmallVector<int> CommonMask;
14077 /// List of operands for the shuffle vector instruction. It holds at most 2
14078 /// operands. If a 3rd one is going to be added, the first 2 are combined into
14079 /// a shuffle with the \p CommonMask mask, the first operand is set to the
14080 /// resulting shuffle and the second operand is set to the newly added
14081 /// operand. The \p CommonMask is transformed accordingly after that.
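/// For example, adding V3 while InVectors holds {V1, V2} first materializes
/// shuffle(V1, V2, CommonMask) and then continues with {that shuffle, V3}.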
14082 SmallVector<Value *, 2> InVectors;
14083 IRBuilderBase &Builder;
14084 BoUpSLP &R;
14085
14086 class ShuffleIRBuilder {
14087 IRBuilderBase &Builder;
14088 /// Holds all of the instructions that we gathered.
14089 SetVector<Instruction *> &GatherShuffleExtractSeq;
14090 /// A list of blocks that we are going to CSE.
14091 DenseSet<BasicBlock *> &CSEBlocks;
14092 /// Data layout.
14093 const DataLayout &DL;
14094
14095 public:
14096 ShuffleIRBuilder(IRBuilderBase &Builder,
14097 SetVector<Instruction *> &GatherShuffleExtractSeq,
14098 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14099 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14100 CSEBlocks(CSEBlocks), DL(DL) {}
14101 ~ShuffleIRBuilder() = default;
14102 /// Creates shufflevector for the 2 operands with the given mask.
14103 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14104 if (V1->getType() != V2->getType()) {
14105 assert(V1->getType()->isIntOrIntVectorTy() &&
14106 V2->getType()->isIntOrIntVectorTy() &&
14107 "Expected integer vector types only.");
14108 if (V1->getType() != V2->getType()) {
14109 if (cast<VectorType>(V2->getType())
14110 ->getElementType()
14111 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14112 ->getElementType()
14113 ->getIntegerBitWidth())
14114 V2 = Builder.CreateIntCast(
14115 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14116 else
14117 V1 = Builder.CreateIntCast(
14118 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14119 }
14120 }
14121 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14122 if (auto *I = dyn_cast<Instruction>(Vec)) {
14123 GatherShuffleExtractSeq.insert(I);
14124 CSEBlocks.insert(I->getParent());
14125 }
14126 return Vec;
14127 }
14128 /// Creates a permutation of the single vector operand with the given mask,
14129 /// if it is not an identity mask.
14130 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14131 if (Mask.empty())
14132 return V1;
14133 unsigned VF = Mask.size();
14134 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14135 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14136 return V1;
14137 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14138 if (auto *I = dyn_cast<Instruction>(Vec)) {
14139 GatherShuffleExtractSeq.insert(I);
14140 CSEBlocks.insert(I->getParent());
14141 }
14142 return Vec;
14143 }
14144 Value *createIdentity(Value *V) { return V; }
14145 Value *createPoison(Type *Ty, unsigned VF) {
14146 return PoisonValue::get(getWidenedType(Ty, VF));
14147 }
14148 /// Resizes the 2 input vectors to match their sizes, if they are not equal
14149 /// yet. The smaller vector is resized to the size of the larger vector.
14150 void resizeToMatch(Value *&V1, Value *&V2) {
14151 if (V1->getType() == V2->getType())
14152 return;
14153 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14154 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14155 int VF = std::max(V1VF, V2VF);
14156 int MinVF = std::min(V1VF, V2VF);
14157 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14158 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14159 0);
14160 Value *&Op = MinVF == V1VF ? V1 : V2;
14161 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14162 if (auto *I = dyn_cast<Instruction>(Op)) {
14163 GatherShuffleExtractSeq.insert(I);
14164 CSEBlocks.insert(I->getParent());
14165 }
14166 if (MinVF == V1VF)
14167 V1 = Op;
14168 else
14169 V2 = Op;
14170 }
14171 };
14172
14173 /// Smart shuffle instruction emission: walks through the shuffle trees and
14174 /// tries to find the best matching vector for the actual shuffle
14175 /// instruction.
14176 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14177 assert(V1 && "Expected at least one vector value.");
14178 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14179 R.CSEBlocks, *R.DL);
14180 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
14181 ShuffleBuilder);
14182 }
14183
14184 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
14185 /// shuffle emission.
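/// E.g., after a shuffle is emitted for CommonMask = {3, -1, 1, -1}, the used
/// lanes become an identity of the freshly created vector, so the mask is
/// rewritten to {0, -1, 2, -1}.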
14186 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
14187 ArrayRef<int> Mask) {
14188 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14189 if (Mask[Idx] != PoisonMaskElem)
14190 CommonMask[Idx] = Idx;
14191 }
14192
14193 /// Cast value \p V to the vector type with the same number of elements, but
14194 /// the base type \p ScalarTy.
14195 Value *castToScalarTyElem(Value *V,
14196 std::optional<bool> IsSigned = std::nullopt) {
14197 auto *VecTy = cast<VectorType>(V->getType());
14198 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14199 if (VecTy->getElementType() == ScalarTy->getScalarType())
14200 return V;
14201 return Builder.CreateIntCast(
14202 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14203 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14204 }
14205
14206public:
14207 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14208 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14209
14210 /// Adjusts extractelements after reusing them.
14211 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14212 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14213 unsigned NumParts, bool &UseVecBaseAsInput) {
14214 UseVecBaseAsInput = false;
14215 SmallPtrSet<Value *, 4> UniqueBases;
14216 Value *VecBase = nullptr;
14217 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14218 if (!E->ReorderIndices.empty()) {
14219 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14220 E->ReorderIndices.end());
14221 reorderScalars(VL, ReorderMask);
14222 }
14223 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14224 int Idx = Mask[I];
14225 if (Idx == PoisonMaskElem)
14226 continue;
14227 auto *EI = cast<ExtractElementInst>(VL[I]);
14228 VecBase = EI->getVectorOperand();
14229 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14230 VecBase = TE->VectorizedValue;
14231 assert(VecBase && "Expected vectorized value.");
14232 UniqueBases.insert(VecBase);
14233 // If the only use is vectorized - we can delete the extractelement
14234 // itself.
14235 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14236 (NumParts != 1 && count(VL, EI) > 1) ||
14237 any_of(EI->users(), [&](User *U) {
14238 const TreeEntry *UTE = R.getTreeEntry(U);
14239 return !UTE || R.MultiNodeScalars.contains(U) ||
14240 (isa<GetElementPtrInst>(U) &&
14241 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14242 count_if(R.VectorizableTree,
14243 [&](const std::unique_ptr<TreeEntry> &TE) {
14244 return any_of(TE->UserTreeIndices,
14245 [&](const EdgeInfo &Edge) {
14246 return Edge.UserTE == UTE;
14247 }) &&
14248 is_contained(VL, EI);
14249 }) != 1;
14250 }))
14251 continue;
14252 R.eraseInstruction(EI);
14253 }
14254 if (NumParts == 1 || UniqueBases.size() == 1) {
14255 assert(VecBase && "Expected vectorized value.");
14256 return castToScalarTyElem(VecBase);
14257 }
14258 UseVecBaseAsInput = true;
14259 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14260 for (auto [I, Idx] : enumerate(Mask))
14261 if (Idx != PoisonMaskElem)
14262 Idx = I;
14263 };
14264 // Perform multi-register vector shuffle, joining them into a single virtual
14265 // long vector.
14266 // Need to shuffle each part independently and then insert all these parts
14267 // into a long virtual vector register, forming the original vector.
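// E.g., with 2 parts of 4 elements each, every part is first shuffled from
// its own extract bases and the results are then combined into a single
// 8-wide virtual vector by a final two-source shuffle.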
14268 Value *Vec = nullptr;
14269 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14270 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14271 for (unsigned Part : seq<unsigned>(NumParts)) {
14272 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14273 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14274 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14275 constexpr int MaxBases = 2;
14276 SmallVector<Value *, MaxBases> Bases(MaxBases);
14277 auto VLMask = zip(SubVL, SubMask);
14278 const unsigned VF = std::accumulate(
14279 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14280 if (std::get<1>(D) == PoisonMaskElem)
14281 return S;
14282 Value *VecOp =
14283 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14284 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14285 VecOp = TE->VectorizedValue;
14286 assert(VecOp && "Expected vectorized value.");
14287 const unsigned Size =
14288 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14289 return std::max(S, Size);
14290 });
14291 for (const auto [V, I] : VLMask) {
14292 if (I == PoisonMaskElem)
14293 continue;
14294 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14295 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14296 VecOp = TE->VectorizedValue;
14297 assert(VecOp && "Expected vectorized value.");
14298 VecOp = castToScalarTyElem(VecOp);
14299 Bases[I / VF] = VecOp;
14300 }
14301 if (!Bases.front())
14302 continue;
14303 Value *SubVec;
14304 if (Bases.back()) {
14305 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14306 TransformToIdentity(SubMask);
14307 } else {
14308 SubVec = Bases.front();
14309 }
14310 if (!Vec) {
14311 Vec = SubVec;
14312 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14313 [&](unsigned P) {
14314 ArrayRef<int> SubMask =
14315 Mask.slice(P * SliceSize,
14316 getNumElems(Mask.size(),
14317 SliceSize, P));
14318 return all_of(SubMask, [](int Idx) {
14319 return Idx == PoisonMaskElem;
14320 });
14321 })) &&
14322 "Expected first part or all previous parts masked.");
14323 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14324 } else {
14325 unsigned NewVF =
14326 cast<FixedVectorType>(Vec->getType())->getNumElements();
14327 if (Vec->getType() != SubVec->getType()) {
14328 unsigned SubVecVF =
14329 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14330 NewVF = std::max(NewVF, SubVecVF);
14331 }
14332 // Adjust SubMask.
14333 for (int &Idx : SubMask)
14334 if (Idx != PoisonMaskElem)
14335 Idx += NewVF;
14336 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14337 Vec = createShuffle(Vec, SubVec, VecMask);
14338 TransformToIdentity(VecMask);
14339 }
14340 }
14341 copy(VecMask, Mask.begin());
14342 return Vec;
14343 }
14344 /// Checks if the specified entry \p E needs to be delayed because of its
14345 /// dependency nodes.
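/// E.g., if this gather reuses a tree entry that has not been vectorized yet,
/// a placeholder value is returned here and the real gather is emitted later,
/// once the dependency has been vectorized.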
14346 std::optional<Value *>
14347 needToDelay(const TreeEntry *E,
14348 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14349 // No need to delay emission if all deps are ready.
14350 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14351 return all_of(
14352 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14353 }))
14354 return std::nullopt;
14355 // Postpone gather emission, will be emitted after the end of the
14356 // process to keep correct order.
14357 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14358 return Builder.CreateAlignedLoad(
14359 ResVecTy,
14360 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14361 MaybeAlign());
14362 }
14363 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14364 /// shuffling.
14365 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14366 Value *V1 = E1.VectorizedValue;
14367 if (V1->getType()->isIntOrIntVectorTy())
14368 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14369 if (isa<PoisonValue>(V))
14370 return false;
14371 return !isKnownNonNegative(
14372 V, SimplifyQuery(*R.DL));
14373 }));
14374 Value *V2 = E2.VectorizedValue;
14375 if (V2->getType()->isIntOrIntVectorTy())
14376 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14377 if (isa<PoisonValue>(V))
14378 return false;
14379 return !isKnownNonNegative(
14380 V, SimplifyQuery(*R.DL));
14381 }));
14382 add(V1, V2, Mask);
14383 }
14384 /// Adds single input vector (in form of tree entry) and the mask for its
14385 /// shuffling.
14386 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14387 Value *V1 = E1.VectorizedValue;
14388 if (V1->getType()->isIntOrIntVectorTy())
14389 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14390 if (isa<PoisonValue>(V))
14391 return false;
14392 return !isKnownNonNegative(
14393 V, SimplifyQuery(*R.DL));
14394 }));
14395 add(V1, Mask);
14396 }
14397 /// Adds 2 input vectors and the mask for their shuffling.
14398 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14399 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14400 assert(isa<FixedVectorType>(V1->getType()) &&
14401 isa<FixedVectorType>(V2->getType()) &&
14402 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14403 V1 = castToScalarTyElem(V1);
14404 V2 = castToScalarTyElem(V2);
14405 if (InVectors.empty()) {
14406 InVectors.push_back(V1);
14407 InVectors.push_back(V2);
14408 CommonMask.assign(Mask.begin(), Mask.end());
14409 return;
14410 }
14411 Value *Vec = InVectors.front();
14412 if (InVectors.size() == 2) {
14413 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14414 transformMaskAfterShuffle(CommonMask, CommonMask);
14415 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14416 Mask.size()) {
14417 Vec = createShuffle(Vec, nullptr, CommonMask);
14418 transformMaskAfterShuffle(CommonMask, CommonMask);
14419 }
14420 V1 = createShuffle(V1, V2, Mask);
14421 unsigned VF = std::max(getVF(V1), getVF(Vec));
14422 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14423 if (Mask[Idx] != PoisonMaskElem)
14424 CommonMask[Idx] = Idx + VF;
14425 InVectors.front() = Vec;
14426 if (InVectors.size() == 2)
14427 InVectors.back() = V1;
14428 else
14429 InVectors.push_back(V1);
14430 }
14431 /// Adds one more input vector and the mask for the shuffling.
14432 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14433 assert(isa<FixedVectorType>(V1->getType()) &&
14434 "castToScalarTyElem expects V1 to be FixedVectorType");
14435 V1 = castToScalarTyElem(V1);
14436 if (InVectors.empty()) {
14437 InVectors.push_back(V1);
14438 CommonMask.assign(Mask.begin(), Mask.end());
14439 return;
14440 }
14441 const auto *It = find(InVectors, V1);
14442 if (It == InVectors.end()) {
14443 if (InVectors.size() == 2 ||
14444 InVectors.front()->getType() != V1->getType()) {
14445 Value *V = InVectors.front();
14446 if (InVectors.size() == 2) {
14447 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14448 transformMaskAfterShuffle(CommonMask, CommonMask);
14449 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14450 CommonMask.size()) {
14451 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14452 transformMaskAfterShuffle(CommonMask, CommonMask);
14453 }
14454 unsigned VF = std::max(CommonMask.size(), Mask.size());
14455 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14456 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14457 CommonMask[Idx] =
14458 V->getType() != V1->getType()
14459 ? Idx + VF
14460 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14461 ->getNumElements();
14462 if (V->getType() != V1->getType())
14463 V1 = createShuffle(V1, nullptr, Mask);
14464 InVectors.front() = V;
14465 if (InVectors.size() == 2)
14466 InVectors.back() = V1;
14467 else
14468 InVectors.push_back(V1);
14469 return;
14470 }
14471 // Check if the second vector is required, i.e. whether it supplies elements
14472 // that are not already taken from the first one.
14473 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14474 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14475 InVectors.push_back(V1);
14476 break;
14477 }
14478 }
14479 int VF = getVF(V1);
14480 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14481 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14482 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14483 }
14484 /// Adds one more input vector, permuted according to the given \p Order.
14485 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14486 SmallVector<int> NewMask;
14487 inversePermutation(Order, NewMask);
14488 add(V1, NewMask);
14489 }
14490 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14491 Value *Root = nullptr) {
14492 return R.gather(VL, Root, ScalarTy,
14493 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14494 return createShuffle(V1, V2, Mask);
14495 });
14496 }
14497 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14498 /// Finalize emission of the shuffles.
14499 /// \param Action the action (if any) to be performed before the final
14500 /// application of the \p ExtMask mask.
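/// \param SubVectors (tree entry, index) pairs whose vectorized values are
/// inserted as subvectors into the result at the given scalar index.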
14501 Value *
14502 finalize(ArrayRef<int> ExtMask,
14503 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14504 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14505 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14506 IsFinalized = true;
14507 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14508 SmallVector<int> NewExtMask(ExtMask);
14509 if (ScalarTyNumElements != 1) {
14510 assert(SLPReVec && "FixedVectorType is not expected.");
14511 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14512 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14513 ExtMask = NewExtMask;
14514 }
14515 if (Action) {
14516 Value *Vec = InVectors.front();
14517 if (InVectors.size() == 2) {
14518 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14519 InVectors.pop_back();
14520 } else {
14521 Vec = createShuffle(Vec, nullptr, CommonMask);
14522 }
14523 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14524 if (CommonMask[Idx] != PoisonMaskElem)
14525 CommonMask[Idx] = Idx;
14526 assert(VF > 0 &&
14527 "Expected vector length for the final value before action.");
14528 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14529 if (VecVF < VF) {
14530 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14531 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14532 Vec = createShuffle(Vec, nullptr, ResizeMask);
14533 }
14534 Action(Vec, CommonMask);
14535 InVectors.front() = Vec;
14536 }
14537 if (!SubVectors.empty()) {
14538 Value *Vec = InVectors.front();
14539 if (InVectors.size() == 2) {
14540 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14541 InVectors.pop_back();
14542 } else {
14543 Vec = createShuffle(Vec, nullptr, CommonMask);
14544 }
14545 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14546 if (CommonMask[Idx] != PoisonMaskElem)
14547 CommonMask[Idx] = Idx;
14548 auto CreateSubVectors = [&](Value *Vec,
14549 SmallVectorImpl<int> &CommonMask) {
14550 for (auto [E, Idx] : SubVectors) {
14551 Value *V = E->VectorizedValue;
14552 if (V->getType()->isIntOrIntVectorTy())
14553 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14554 if (isa<PoisonValue>(V))
14555 return false;
14556 return !isKnownNonNegative(
14557 V, SimplifyQuery(*R.DL));
14558 }));
14559 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14560 Vec = createInsertVector(
14561 Builder, Vec, V, InsertionIndex,
14562 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
14563 _3));
14564 if (!CommonMask.empty()) {
14565 std::iota(
14566 std::next(CommonMask.begin(), InsertionIndex),
14567 std::next(CommonMask.begin(),
14568 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14569 InsertionIndex);
14570 }
14571 }
14572 return Vec;
14573 };
14574 if (SubVectorsMask.empty()) {
14575 Vec = CreateSubVectors(Vec, CommonMask);
14576 } else {
14577 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14578 copy(SubVectorsMask, SVMask.begin());
14579 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14580 if (I2 != PoisonMaskElem) {
14581 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14582 I1 = I2 + CommonMask.size();
14583 }
14584 }
14585 Value *InsertVec =
14586 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14587 Vec = createShuffle(InsertVec, Vec, SVMask);
14588 for (unsigned I : seq<unsigned>(CommonMask.size())) {
14589 if (SVMask[I] != PoisonMaskElem)
14590 CommonMask[I] = I;
14591 }
14592 }
14593 InVectors.front() = Vec;
14594 }
14595
14596 if (!ExtMask.empty()) {
14597 if (CommonMask.empty()) {
14598 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14599 } else {
14600 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14601 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14602 if (ExtMask[I] == PoisonMaskElem)
14603 continue;
14604 NewMask[I] = CommonMask[ExtMask[I]];
14605 }
14606 CommonMask.swap(NewMask);
14607 }
14608 }
14609 if (CommonMask.empty()) {
14610 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14611 return InVectors.front();
14612 }
14613 if (InVectors.size() == 2)
14614 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14615 return createShuffle(InVectors.front(), nullptr, CommonMask);
14616 }
14617
14618 ~ShuffleInstructionBuilder() {
14619 assert((IsFinalized || CommonMask.empty()) &&
14620 "Shuffle construction must be finalized.");
14621 }
14622};
14623
14624BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14625 unsigned NodeIdx) {
14626 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14627 InstructionsState S = getSameOpcode(VL, *TLI);
14628 // Special processing for GEPs bundle, which may include non-gep values.
14629 if (!S && VL.front()->getType()->isPointerTy()) {
14630 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14631 if (It != VL.end())
14632 S = getSameOpcode(*It, *TLI);
14633 }
14634 if (!S)
14635 return nullptr;
14636 auto CheckSameVE = [&](const TreeEntry *VE) {
14637 return VE->isSame(VL) &&
14638 (any_of(VE->UserTreeIndices,
14639 [E, NodeIdx](const EdgeInfo &EI) {
14640 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14641 }) ||
14642 any_of(VectorizableTree,
14643 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14644 return TE->isOperandGatherNode(
14645 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14646 VE->isSame(TE->Scalars);
14647 }));
14648 };
14649 TreeEntry *VE = getTreeEntry(S.getMainOp());
14650 if (VE && CheckSameVE(VE))
14651 return VE;
14652 auto It = MultiNodeScalars.find(S.getMainOp());
14653 if (It != MultiNodeScalars.end()) {
14654 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14655 return TE != VE && CheckSameVE(TE);
14656 });
14657 if (I != It->getSecond().end())
14658 return *I;
14659 }
14660 return nullptr;
14661}
14662
14663Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14664 bool PostponedPHIs) {
14665 ValueList &VL = E->getOperand(NodeIdx);
14666 const unsigned VF = VL.size();
14667 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14668 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14669 // V may be affected by MinBWs.
14670 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14671 // factor is the number of elements, not their type.
14672 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14673 unsigned NumElements = getNumElements(VL.front()->getType());
14674 ShuffleInstructionBuilder ShuffleBuilder(
14675 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14676 : ScalarTy,
14677 Builder, *this);
14678 ShuffleBuilder.add(V, Mask);
14679 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14680 E->CombinedEntriesWithIndices.size());
14681 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14682 [&](const auto &P) {
14683 return std::make_pair(VectorizableTree[P.first].get(),
14684 P.second);
14685 });
14686 assert((E->CombinedEntriesWithIndices.empty() ||
14687 E->ReorderIndices.empty()) &&
14688 "Expected either combined subnodes or reordering");
14689 return ShuffleBuilder.finalize({}, SubVectors, {});
14690 };
14691 Value *V = vectorizeTree(VE, PostponedPHIs);
14692 if (VF * getNumElements(VL[0]->getType()) !=
14693 cast<FixedVectorType>(V->getType())->getNumElements()) {
14694 if (!VE->ReuseShuffleIndices.empty()) {
14695 // Reshuffle to get only unique values.
14696 // If some of the scalars are duplicated in the vectorization
14697 // tree entry, we do not vectorize them but instead generate a
14698 // mask for the reuses. But if there are several users of the
14699 // same entry, they may have different vectorization factors.
14700 // This is especially important for PHI nodes. In this case, we
14701 // need to adapt the resulting instruction for the user
14702 // vectorization factor and have to reshuffle it again to take
14703 // only unique elements of the vector. Without this code the
14704 // function incorrectly returns reduced vector instruction with
14705 // the same elements, not with the unique ones.
14706
14707 // block:
14708 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14709 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14710 // ... (use %2)
14711 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14712 // br %block
14713 SmallVector<int> Mask(VF, PoisonMaskElem);
14714 for (auto [I, V] : enumerate(VL)) {
14715 if (isa<PoisonValue>(V))
14716 continue;
14717 Mask[I] = VE->findLaneForValue(V);
14718 }
14719 V = FinalShuffle(V, Mask);
14720 } else {
14721 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14722 "Expected vectorization factor less "
14723 "than original vector size.");
14724 SmallVector<int> UniformMask(VF, 0);
14725 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14726 V = FinalShuffle(V, UniformMask);
14727 }
14728 }
14729 // Need to update the operand gather node, if the operand is actually not a
14730 // vectorized node, but a buildvector/gather node which matches one of
14731 // the vectorized nodes.
14732 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14733 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14734 }) == VE->UserTreeIndices.end()) {
14735 auto *It =
14736 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14737 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14738 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14739 });
14740 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14741 (*It)->VectorizedValue = V;
14742 }
14743 return V;
14744 }
14745
14746 // Find the corresponding gather entry and vectorize it.
14747 // This allows us to be more accurate with tree/graph transformations and
14748 // checks the correctness of the transformations in many cases.
14749 auto *I = find_if(VectorizableTree,
14750 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14751 return TE->isOperandGatherNode({E, NodeIdx});
14752 });
14753 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14754 assert(I->get()->UserTreeIndices.size() == 1 &&
14755 "Expected only single user for the gather node.");
14756 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14757 return vectorizeTree(I->get(), PostponedPHIs);
14758}
14759
14760template <typename BVTy, typename ResTy, typename... Args>
14761ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14762 Args &...Params) {
14763 assert(E->isGather() && "Expected gather node.");
14764 unsigned VF = E->getVectorFactor();
14765
14766 bool NeedFreeze = false;
14767 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14768 E->ReuseShuffleIndices.end());
14769 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14770 // Clear values, to be replaced by insertvector instructions.
14771 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14772 for_each(MutableArrayRef(GatheredScalars)
14773 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14774 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14775 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14776 E->CombinedEntriesWithIndices.size());
14777 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14778 [&](const auto &P) {
14779 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14780 });
14781 // Build a mask out of the reorder indices and reorder scalars per this
14782 // mask.
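// Roughly, scalar I moves to position ReorderMask[I]; e.g. a reorder mask of
// {2, 0, 1} turns {a, b, c} into {b, c, a}.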
14783 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14784 E->ReorderIndices.end());
14785 if (!ReorderMask.empty())
14786 reorderScalars(GatheredScalars, ReorderMask);
14787 SmallVector<int> SubVectorsMask;
14788 inversePermutation(E->ReorderIndices, SubVectorsMask);
14789 // Transform non-clustered elements in the mask to poison (-1).
14790 // "Clustered" operations will be reordered using this mask later.
14791 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14792 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14793 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14794 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14795 } else {
14796 SubVectorsMask.clear();
14797 }
14798 SmallVector<Value *> StoredGS(GatheredScalars);
14799 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14800 unsigned I, unsigned SliceSize,
14801 bool IsNotPoisonous) {
14802 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14803 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14804 }))
14805 return false;
14806 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14807 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14808 if (UserTE->getNumOperands() != 2)
14809 return false;
14810 if (!IsNotPoisonous) {
14811 auto *It =
14812 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14813 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14814 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14815 }) != TE->UserTreeIndices.end();
14816 });
14817 if (It == VectorizableTree.end())
14818 return false;
14819 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14820 if (!(*It)->ReorderIndices.empty()) {
14821 inversePermutation((*It)->ReorderIndices, ReorderMask);
14822 reorderScalars(GS, ReorderMask);
14823 }
14824 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14825 Value *V0 = std::get<0>(P);
14826 Value *V1 = std::get<1>(P);
14827 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14828 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14829 is_contained(E->Scalars, V1));
14830 }))
14831 return false;
14832 }
14833 int Idx;
14834 if ((Mask.size() < InputVF &&
14835 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14836 Idx == 0) ||
14837 (Mask.size() == InputVF &&
14838 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14839 std::iota(
14840 std::next(Mask.begin(), I * SliceSize),
14841 std::next(Mask.begin(),
14842 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14843 0);
14844 } else {
14845 unsigned IVal =
14846 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14847 std::fill(
14848 std::next(Mask.begin(), I * SliceSize),
14849 std::next(Mask.begin(),
14850 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14851 IVal);
14852 }
14853 return true;
14854 };
14855 BVTy ShuffleBuilder(ScalarTy, Params...);
14856 ResTy Res = ResTy();
14857 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
14858 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14859 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14860 Value *ExtractVecBase = nullptr;
14861 bool UseVecBaseAsInput = false;
14862 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14863 SmallVector<SmallVector<const TreeEntry *>> Entries;
14864 Type *OrigScalarTy = GatheredScalars.front()->getType();
14865 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14866 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14867 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14868 VecTy->getNumElements() % NumParts != 0 ||
14869 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14870 VecTy->getNumElements() / NumParts))
14871 NumParts = 1;
14872 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14873 // Check for gathered extracts.
14874 bool Resized = false;
14875 ExtractShuffles =
14876 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14877 if (!ExtractShuffles.empty()) {
14878 SmallVector<const TreeEntry *> ExtractEntries;
14879 for (auto [Idx, I] : enumerate(ExtractMask)) {
14880 if (I == PoisonMaskElem)
14881 continue;
14882 if (const auto *TE = getTreeEntry(
14883 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14884 ExtractEntries.push_back(TE);
14885 }
14886 if (std::optional<ResTy> Delayed =
14887 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14888 // Delay emission of gathers which are not ready yet.
14889 PostponedGathers.insert(E);
14890 // Postpone gather emission, will be emitted after the end of the
14891 // process to keep correct order.
14892 return *Delayed;
14893 }
14894 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14895 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14896 ExtractVecBase = VecBase;
14897 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14898 if (VF == VecBaseTy->getNumElements() &&
14899 GatheredScalars.size() != VF) {
14900 Resized = true;
14901 GatheredScalars.append(VF - GatheredScalars.size(),
14902 PoisonValue::get(OrigScalarTy));
14903 }
14904 }
14905 }
14906 // Gather extracts after we check for full matched gathers only.
14907 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
14908 ((E->getOpcode() == Instruction::Load ||
14909 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14910 any_of(E->Scalars,
14911 [this](Value *V) {
14912 return isa<LoadInst>(V) && getTreeEntry(V);
14913 })) ||
14914 E->isAltShuffle() ||
14915 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14916 isSplat(E->Scalars) ||
14917 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14918 GatherShuffles =
14919 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14920 }
14921 if (!GatherShuffles.empty()) {
14922 if (std::optional<ResTy> Delayed =
14923 ShuffleBuilder.needToDelay(E, Entries)) {
14924 // Delay emission of gathers which are not ready yet.
14925 PostponedGathers.insert(E);
14926 // Postpone gather emission, will be emitted after the end of the
14927 // process to keep correct order.
14928 return *Delayed;
14929 }
14930 if (GatherShuffles.size() == 1 &&
14931 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14932 Entries.front().front()->isSame(E->Scalars)) {
14933 // Perfect match in the graph, will reuse the previously vectorized
14934 // node. Cost is 0.
14935 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14936 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14937 // Restore the mask for previous partially matched values.
14938 Mask.resize(E->Scalars.size());
14939 const TreeEntry *FrontTE = Entries.front().front();
14940 if (FrontTE->ReorderIndices.empty() &&
14941 ((FrontTE->ReuseShuffleIndices.empty() &&
14942 E->Scalars.size() == FrontTE->Scalars.size()) ||
14943 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14944 std::iota(Mask.begin(), Mask.end(), 0);
14945 } else {
14946 for (auto [I, V] : enumerate(E->Scalars)) {
14947 if (isa<PoisonValue>(V)) {
14948 Mask[I] = PoisonMaskElem;
14949 continue;
14950 }
14951 Mask[I] = FrontTE->findLaneForValue(V);
14952 }
14953 }
14954 ShuffleBuilder.add(*FrontTE, Mask);
14955 // Full matched entry found, no need to insert subvectors.
14956 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14957 return Res;
14958 }
14959 if (!Resized) {
14960 if (GatheredScalars.size() != VF &&
14961 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14962 return any_of(TEs, [&](const TreeEntry *TE) {
14963 return TE->getVectorFactor() == VF;
14964 });
14965 }))
14966 GatheredScalars.append(VF - GatheredScalars.size(),
14967 PoisonValue::get(OrigScalarTy));
14968 }
14969 // Remove shuffled elements from list of gathers.
14970 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14971 if (Mask[I] != PoisonMaskElem)
14972 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14973 }
14974 }
14975 }
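// Illustration (assumed example): TryPackScalars compacts the gathered
// scalars so that unique values occupy the lowest lanes and repeats are
// expressed through the reuse mask. E.g. for Scalars = {a, a, a, a} with
// IsRootPoison == true, the splat path rewrites Scalars to
// {a, poison, poison, poison} and ReuseMask to {0, 0, 0, 0}, so a single
// insertelement plus a broadcast shuffle is emitted instead of a full gather.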
14976 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14977 SmallVectorImpl<int> &ReuseMask,
14978 bool IsRootPoison) {
14979 // For splats we can emit broadcasts instead of gathers, so try to find
14980 // such sequences.
14981 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14982 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14983 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14984 SmallVector<int> UndefPos;
14985 DenseMap<Value *, unsigned> UniquePositions;
14986 // Gather unique non-const values and all constant values.
14987 // For repeated values, just shuffle them.
14988 int NumNonConsts = 0;
14989 int SinglePos = 0;
14990 for (auto [I, V] : enumerate(Scalars)) {
14991 if (isa<UndefValue>(V)) {
14992 if (!isa<PoisonValue>(V)) {
14993 ReuseMask[I] = I;
14994 UndefPos.push_back(I);
14995 }
14996 continue;
14997 }
14998 if (isConstant(V)) {
14999 ReuseMask[I] = I;
15000 continue;
15001 }
15002 ++NumNonConsts;
15003 SinglePos = I;
15004 Value *OrigV = V;
15005 Scalars[I] = PoisonValue::get(OrigScalarTy);
15006 if (IsSplat) {
15007 Scalars.front() = OrigV;
15008 ReuseMask[I] = 0;
15009 } else {
15010 const auto Res = UniquePositions.try_emplace(OrigV, I);
15011 Scalars[Res.first->second] = OrigV;
15012 ReuseMask[I] = Res.first->second;
15013 }
15014 }
15015 if (NumNonConsts == 1) {
15016 // Restore single insert element.
15017 if (IsSplat) {
15018 ReuseMask.assign(VF, PoisonMaskElem);
15019 std::swap(Scalars.front(), Scalars[SinglePos]);
15020 if (!UndefPos.empty() && UndefPos.front() == 0)
15021 Scalars.front() = UndefValue::get(OrigScalarTy);
15022 }
15023 ReuseMask[SinglePos] = SinglePos;
15024 } else if (!UndefPos.empty() && IsSplat) {
15025 // For undef values, try to replace them with the simple broadcast.
15026 // We can do it if the broadcasted value is guaranteed to be
15027 // non-poisonous, or by freezing the incoming scalar value first.
15028 auto *It = find_if(Scalars, [this, E](Value *V) {
15029 return !isa<UndefValue>(V) &&
15030 (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
15031 (E->UserTreeIndices.size() == 1 &&
15032 any_of(V->uses(), [E](const Use &U) {
15033 // Check if the value is already used in the same operation in
15034 // one of the other nodes.
15035 return E->UserTreeIndices.front().EdgeIdx !=
15036 U.getOperandNo() &&
15037 is_contained(
15038 E->UserTreeIndices.front().UserTE->Scalars,
15039 U.getUser());
15040 })));
15041 });
15042 if (It != Scalars.end()) {
15043 // Replace undefs by the non-poisoned scalars and emit broadcast.
15044 int Pos = std::distance(Scalars.begin(), It);
15045 for (int I : UndefPos) {
15046 // Set the undef position to the non-poisoned scalar.
15047 ReuseMask[I] = Pos;
15048 // Replace the undef with poison; in the mask it has already been
15049 // replaced by the non-poisoned scalar.
15050 if (I != Pos)
15051 Scalars[I] = PoisonValue::get(OrigScalarTy);
15052 }
15053 } else {
15054 // Replace undefs with poison, emit the broadcast and then emit a
15055 // freeze.
15056 for (int I : UndefPos) {
15057 ReuseMask[I] = PoisonMaskElem;
15058 if (isa<UndefValue>(Scalars[I]))
15059 Scalars[I] = PoisonValue::get(OrigScalarTy);
15060 }
15061 NeedFreeze = true;
15062 }
15063 }
15064 };
15065 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15066 bool IsNonPoisoned = true;
15067 bool IsUsedInExpr = true;
15068 Value *Vec1 = nullptr;
15069 if (!ExtractShuffles.empty()) {
15070 // A gather of extractelements can be represented as just a shuffle of
15071 // the one or two vectors the scalars are extracted from.
15072 // Find input vectors.
15073 Value *Vec2 = nullptr;
15074 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15075 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15076 ExtractMask[I] = PoisonMaskElem;
15077 }
15078 if (UseVecBaseAsInput) {
15079 Vec1 = ExtractVecBase;
15080 } else {
15081 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15082 if (ExtractMask[I] == PoisonMaskElem)
15083 continue;
15084 if (isa<UndefValue>(E->Scalars[I]))
15085 continue;
15086 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15087 Value *VecOp = EI->getVectorOperand();
15088 if (const auto *TE = getTreeEntry(VecOp))
15089 if (TE->VectorizedValue)
15090 VecOp = TE->VectorizedValue;
15091 if (!Vec1) {
15092 Vec1 = VecOp;
15093 } else if (Vec1 != VecOp) {
15094 assert((!Vec2 || Vec2 == VecOp) &&
15095 "Expected only 1 or 2 vectors shuffle.");
15096 Vec2 = VecOp;
15097 }
15098 }
15099 }
15100 if (Vec2) {
15101 IsUsedInExpr = false;
15102 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15103 isGuaranteedNotToBePoison(Vec2, AC);
15104 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15105 } else if (Vec1) {
15106 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15107 IsUsedInExpr &= FindReusedSplat(
15108 ExtractMask,
15109 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15110 ExtractMask.size(), IsNotPoisonedVec);
15111 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15112 IsNonPoisoned &= IsNotPoisonedVec;
15113 } else {
15114 IsUsedInExpr = false;
15115 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15116 /*ForExtracts=*/true);
15117 }
15118 }
15119 if (!GatherShuffles.empty()) {
15120 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15121 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15122 for (const auto [I, TEs] : enumerate(Entries)) {
15123 if (TEs.empty()) {
15124 assert(!GatherShuffles[I] &&
15125 "No shuffles with empty entries list expected.");
15126 continue;
15127 }
15128 assert((TEs.size() == 1 || TEs.size() == 2) &&
15129 "Expected shuffle of 1 or 2 entries.");
15130 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15131 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15132 VecMask.assign(VecMask.size(), PoisonMaskElem);
15133 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15134 if (TEs.size() == 1) {
15135 bool IsNotPoisonedVec =
15136 TEs.front()->VectorizedValue
15137 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15138 : true;
15139 IsUsedInExpr &=
15140 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15141 SliceSize, IsNotPoisonedVec);
15142 ShuffleBuilder.add(*TEs.front(), VecMask);
15143 IsNonPoisoned &= IsNotPoisonedVec;
15144 } else {
15145 IsUsedInExpr = false;
15146 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15147 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15148 IsNonPoisoned &=
15149 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15150 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15151 }
15152 }
15153 }
15154 // Try to figure out the best way to combine the values: build one shuffle
15155 // and insert elements, or just build several shuffles.
15156 // Insert non-constant scalars.
15157 SmallVector<Value *> NonConstants(GatheredScalars);
15158 int EMSz = ExtractMask.size();
15159 int MSz = Mask.size();
15160 // Try to build a constant vector and shuffle with it only if we currently
15161 // have a single permutation and more than one scalar constant.
15162 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15163 bool IsIdentityShuffle =
15164 ((UseVecBaseAsInput ||
15165 all_of(ExtractShuffles,
15166 [](const std::optional<TTI::ShuffleKind> &SK) {
15167 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15168 TTI::SK_PermuteSingleSrc;
15169 })) &&
15170 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15171 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15172 (!GatherShuffles.empty() &&
15173 all_of(GatherShuffles,
15174 [](const std::optional<TTI::ShuffleKind> &SK) {
15175 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15176 TTI::SK_PermuteSingleSrc;
15177 }) &&
15178 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15179 ShuffleVectorInst::isIdentityMask(Mask, MSz));
15180 bool EnoughConstsForShuffle =
15181 IsSingleShuffle &&
15182 (none_of(GatheredScalars,
15183 [](Value *V) {
15184 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15185 }) ||
15186 any_of(GatheredScalars,
15187 [](Value *V) {
15188 return isa<Constant>(V) && !isa<UndefValue>(V);
15189 })) &&
15190 (!IsIdentityShuffle ||
15191 (GatheredScalars.size() == 2 &&
15192 any_of(GatheredScalars,
15193 [](Value *V) { return !isa<UndefValue>(V); })) ||
15194 count_if(GatheredScalars, [](Value *V) {
15195 return isa<Constant>(V) && !isa<PoisonValue>(V);
15196 }) > 1);
15197 // The NonConstants array keeps just the non-constant values; GatheredScalars
15198 // keeps only the constants used to build the final vector, which is then shuffled.
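// Illustration (assumed example): for GatheredScalars = {1, x, 2, y} with a
// single permutation, the constants stay in GatheredScalars as
// {1, poison, 2, poison} and are shuffled in first, while NonConstants keeps
// {poison, x, poison, y} and is inserted on top by the finalize callback below.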
15199 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15200 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15201 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15202 else
15203 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15204 }
15205 // Generate constants for final shuffle and build a mask for them.
15206 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15207 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15208 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15209 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15210 ShuffleBuilder.add(BV, BVMask);
15211 }
15212 if (all_of(NonConstants, [=](Value *V) {
15213 return isa<PoisonValue>(V) ||
15214 (IsSingleShuffle && ((IsIdentityShuffle &&
15215 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15216 }))
15217 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15218 SubVectorsMask);
15219 else
15220 Res = ShuffleBuilder.finalize(
15221 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15222 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15223 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15224 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15225 });
15226 } else if (!allConstant(GatheredScalars)) {
15227 // Gather unique scalars and all constants.
15228 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15229 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15230 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15231 ShuffleBuilder.add(BV, ReuseMask);
15232 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15233 SubVectorsMask);
15234 } else {
15235 // Gather all constants.
15236 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15237 for (auto [I, V] : enumerate(GatheredScalars)) {
15238 if (!isa<PoisonValue>(V))
15239 Mask[I] = I;
15240 }
15241 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15242 ShuffleBuilder.add(BV, Mask);
15243 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15244 SubVectorsMask);
15245 }
15246
15247 if (NeedFreeze)
15248 Res = ShuffleBuilder.createFreeze(Res);
15249 return Res;
15250}
15251
15252Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15253 bool PostponedPHIs) {
15254 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15255 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15256 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15257 Builder, *this);
15258}
15259
15260/// \returns \p I after propagating metadata from \p VL only for instructions in
15261/// \p VL.
15262 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15263 SmallVector<Value *> Insts;
15264 for (Value *V : VL)
15265 if (isa<Instruction>(V))
15266 Insts.push_back(V);
15267 return llvm::propagateMetadata(Inst, Insts);
15268}
15269
15270Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15271 IRBuilderBase::InsertPointGuard Guard(Builder);
15272
15273 if (E->VectorizedValue &&
15274 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15275 E->isAltShuffle())) {
15276 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15277 return E->VectorizedValue;
15278 }
15279
15280 Value *V = E->Scalars.front();
15281 Type *ScalarTy = V->getType();
15282 if (!isa<CmpInst>(V))
15283 ScalarTy = getValueType(V);
15284 auto It = MinBWs.find(E);
15285 if (It != MinBWs.end()) {
15286 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15287 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15288 if (VecTy)
15289 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15290 }
15291 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15292 if (E->isGather()) {
15293 // Set insert point for non-reduction initial nodes.
15294 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
15295 setInsertPointAfterBundle(E);
15296 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15297 E->VectorizedValue = Vec;
15298 return Vec;
15299 }
15300
15301 bool IsReverseOrder =
15302 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
15303 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15304 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15305 if (E->getOpcode() == Instruction::Store &&
15306 E->State == TreeEntry::Vectorize) {
15307 ArrayRef<int> Mask =
15308 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15309 E->ReorderIndices.size());
15310 ShuffleBuilder.add(V, Mask);
15311 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15312 ShuffleBuilder.addOrdered(V, {});
15313 } else {
15314 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15315 }
15316 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15317 E->CombinedEntriesWithIndices.size());
15318 transform(
15319 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15320 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15321 });
15322 assert(
15323 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15324 "Expected either combined subnodes or reordering");
15325 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15326 };
15327
15328 assert(!E->isGather() && "Unhandled state");
15329 unsigned ShuffleOrOp =
15330 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15331 Instruction *VL0 = E->getMainOp();
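// Note: MinBWs records entries whose scalars were proven to fit in a narrower
// integer type (e.g. an i32 chain demoted to i16). When such an operand vector
// feeds a wider user, an explicit cast is needed; the helper below picks the
// signedness for that cast: the recorded flag if the operand entry was itself
// demoted, otherwise whether any of its scalars may be negative.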
15332 auto GetOperandSignedness = [&](unsigned Idx) {
15333 const TreeEntry *OpE = getOperandEntry(E, Idx);
15334 bool IsSigned = false;
15335 auto It = MinBWs.find(OpE);
15336 if (It != MinBWs.end())
15337 IsSigned = It->second.second;
15338 else
15339 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15340 if (isa<PoisonValue>(V))
15341 return false;
15342 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15343 });
15344 return IsSigned;
15345 };
15346 switch (ShuffleOrOp) {
15347 case Instruction::PHI: {
15348 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15349 E != VectorizableTree.front().get() ||
15350 !E->UserTreeIndices.empty()) &&
15351 "PHI reordering is free.");
15352 if (PostponedPHIs && E->VectorizedValue)
15353 return E->VectorizedValue;
15354 auto *PH = cast<PHINode>(VL0);
15355 Builder.SetInsertPoint(PH->getParent(),
15356 PH->getParent()->getFirstNonPHIIt());
15357 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15358 if (PostponedPHIs || !E->VectorizedValue) {
15359 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15360 E->PHI = NewPhi;
15361 Value *V = NewPhi;
15362
15363 // Adjust insertion point once all PHIs have been generated.
15364 Builder.SetInsertPoint(PH->getParent(),
15365 PH->getParent()->getFirstInsertionPt());
15366 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15367
15368 V = FinalShuffle(V, E);
15369
15370 E->VectorizedValue = V;
15371 if (PostponedPHIs)
15372 return V;
15373 }
15374 PHINode *NewPhi = cast<PHINode>(E->PHI);
15375 // If phi node is fully emitted - exit.
15376 if (NewPhi->getNumIncomingValues() != 0)
15377 return NewPhi;
15378
15379 // PHINodes may have multiple entries from the same block. We want to
15380 // visit every block once.
15381 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15382
15383 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15385 BasicBlock *IBB = PH->getIncomingBlock(I);
15386
15387 // Stop emission if all incoming values are generated.
15388 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15389 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15390 return NewPhi;
15391 }
15392
15393 if (!VisitedBBs.insert(IBB).second) {
15394 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15395 continue;
15396 }
15397
15398 Builder.SetInsertPoint(IBB->getTerminator());
15399 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15400 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15401 if (VecTy != Vec->getType()) {
15402 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15403 MinBWs.contains(getOperandEntry(E, I))) &&
15404 "Expected item in MinBWs.");
15405 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15406 }
15407 NewPhi->addIncoming(Vec, IBB);
15408 }
15409
15410 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15411 "Invalid number of incoming values");
15412 assert(E->VectorizedValue && "Expected vectorized value.");
15413 return E->VectorizedValue;
15414 }
15415
15416 case Instruction::ExtractElement: {
15417 Value *V = E->getSingleOperand(0);
15418 if (const TreeEntry *TE = getTreeEntry(V))
15419 V = TE->VectorizedValue;
15420 setInsertPointAfterBundle(E);
15421 V = FinalShuffle(V, E);
15422 E->VectorizedValue = V;
15423 return V;
15424 }
15425 case Instruction::ExtractValue: {
15426 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15427 Builder.SetInsertPoint(LI);
15428 Value *Ptr = LI->getPointerOperand();
15429 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15430 Value *NewV = ::propagateMetadata(V, E->Scalars);
15431 NewV = FinalShuffle(NewV, E);
15432 E->VectorizedValue = NewV;
15433 return NewV;
15434 }
15435 case Instruction::InsertElement: {
15436 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15437 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15438 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15439 ArrayRef<Value *> Op = E->getOperand(1);
15440 Type *ScalarTy = Op.front()->getType();
15441 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15442 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15443 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15444 assert(Res.first > 0 && "Expected item in MinBWs.");
15445 V = Builder.CreateIntCast(
15446 V,
15447 getWidenedType(
15448 ScalarTy,
15449 cast<FixedVectorType>(V->getType())->getNumElements()),
15450 Res.second);
15451 }
15452
15453 // Create InsertVector shuffle if necessary
15454 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15455 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15456 }));
15457 const unsigned NumElts =
15458 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15459 const unsigned NumScalars = E->Scalars.size();
15460
15461 unsigned Offset = *getElementIndex(VL0);
15462 assert(Offset < NumElts && "Failed to find vector index offset");
15463
15464 // Create shuffle to resize vector
15465 SmallVector<int> Mask;
15466 if (!E->ReorderIndices.empty()) {
15467 inversePermutation(E->ReorderIndices, Mask);
15468 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15469 } else {
15470 Mask.assign(NumElts, PoisonMaskElem);
15471 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15472 }
15473 // Create InsertVector shuffle if necessary
15474 bool IsIdentity = true;
15475 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15476 Mask.swap(PrevMask);
15477 for (unsigned I = 0; I < NumScalars; ++I) {
15478 Value *Scalar = E->Scalars[PrevMask[I]];
15479 unsigned InsertIdx = *getElementIndex(Scalar);
15480 IsIdentity &= InsertIdx - Offset == I;
15481 Mask[InsertIdx - Offset] = I;
15482 }
15483 if (!IsIdentity || NumElts != NumScalars) {
15484 Value *V2 = nullptr;
15485 bool IsVNonPoisonous =
15486 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
15487 SmallVector<int> InsertMask(Mask);
15488 if (NumElts != NumScalars && Offset == 0) {
15489 // Follow all insert element instructions from the current buildvector
15490 // sequence.
15491 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15492 do {
15493 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15494 if (!InsertIdx)
15495 break;
15496 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15497 InsertMask[*InsertIdx] = *InsertIdx;
15498 if (!Ins->hasOneUse())
15499 break;
15500 Ins = dyn_cast_or_null<InsertElementInst>(
15501 Ins->getUniqueUndroppableUser());
15502 } while (Ins);
15503 SmallBitVector UseMask =
15504 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15505 SmallBitVector IsFirstPoison =
15506 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15507 SmallBitVector IsFirstUndef =
15508 isUndefVector(FirstInsert->getOperand(0), UseMask);
15509 if (!IsFirstPoison.all()) {
15510 unsigned Idx = 0;
15511 for (unsigned I = 0; I < NumElts; I++) {
15512 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15513 IsFirstUndef.test(I)) {
15514 if (IsVNonPoisonous) {
15515 InsertMask[I] = I < NumScalars ? I : 0;
15516 continue;
15517 }
15518 if (!V2)
15519 V2 = UndefValue::get(V->getType());
15520 if (Idx >= NumScalars)
15521 Idx = NumScalars - 1;
15522 InsertMask[I] = NumScalars + Idx;
15523 ++Idx;
15524 } else if (InsertMask[I] != PoisonMaskElem &&
15525 Mask[I] == PoisonMaskElem) {
15526 InsertMask[I] = PoisonMaskElem;
15527 }
15528 }
15529 } else {
15530 InsertMask = Mask;
15531 }
15532 }
15533 if (!V2)
15534 V2 = PoisonValue::get(V->getType());
15535 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15536 if (auto *I = dyn_cast<Instruction>(V)) {
15537 GatherShuffleExtractSeq.insert(I);
15538 CSEBlocks.insert(I->getParent());
15539 }
15540 }
15541
15542 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15543 for (unsigned I = 0; I < NumElts; I++) {
15544 if (Mask[I] != PoisonMaskElem)
15545 InsertMask[Offset + I] = I;
15546 }
15547 SmallBitVector UseMask =
15548 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15549 SmallBitVector IsFirstUndef =
15550 isUndefVector(FirstInsert->getOperand(0), UseMask);
15551 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15552 NumElts != NumScalars) {
15553 if (IsFirstUndef.all()) {
15554 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15555 SmallBitVector IsFirstPoison =
15556 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15557 if (!IsFirstPoison.all()) {
15558 for (unsigned I = 0; I < NumElts; I++) {
15559 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15560 InsertMask[I] = I + NumElts;
15561 }
15562 }
15563 V = Builder.CreateShuffleVector(
15564 V,
15565 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15566 : FirstInsert->getOperand(0),
15567 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15568 if (auto *I = dyn_cast<Instruction>(V)) {
15569 GatherShuffleExtractSeq.insert(I);
15570 CSEBlocks.insert(I->getParent());
15571 }
15572 }
15573 } else {
15574 SmallBitVector IsFirstPoison =
15575 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15576 for (unsigned I = 0; I < NumElts; I++) {
15577 if (InsertMask[I] == PoisonMaskElem)
15578 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15579 else
15580 InsertMask[I] += NumElts;
15581 }
15582 V = Builder.CreateShuffleVector(
15583 FirstInsert->getOperand(0), V, InsertMask,
15584 cast<Instruction>(E->Scalars.back())->getName());
15585 if (auto *I = dyn_cast<Instruction>(V)) {
15586 GatherShuffleExtractSeq.insert(I);
15587 CSEBlocks.insert(I->getParent());
15588 }
15589 }
15590 }
15591
15592 ++NumVectorInstructions;
15593 E->VectorizedValue = V;
15594 return V;
15595 }
15596 case Instruction::ZExt:
15597 case Instruction::SExt:
15598 case Instruction::FPToUI:
15599 case Instruction::FPToSI:
15600 case Instruction::FPExt:
15601 case Instruction::PtrToInt:
15602 case Instruction::IntToPtr:
15603 case Instruction::SIToFP:
15604 case Instruction::UIToFP:
15605 case Instruction::Trunc:
15606 case Instruction::FPTrunc:
15607 case Instruction::BitCast: {
15608 setInsertPointAfterBundle(E);
15609
15610 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15611 if (E->VectorizedValue) {
15612 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15613 return E->VectorizedValue;
15614 }
15615
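// Illustrative example (assumed types): if the scalar code is
// "zext i8 %x to i32" but MinBWs demoted this entry to i16, the vector opcode
// stays ZExt with an <N x i16> result; if it is demoted all the way to i8, the
// widths match, the cast degenerates to a bitcast and the operand vector is
// reused as is. A SIToFP whose source operand was demoted and proven
// non-negative is likewise turned into UIToFP below.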
15616 auto *CI = cast<CastInst>(VL0);
15617 Instruction::CastOps VecOpcode = CI->getOpcode();
15618 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15619 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15620 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15621 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15622 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15623 // Check if the values are candidates to demote.
15624 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15625 if (SrcIt != MinBWs.end())
15626 SrcBWSz = SrcIt->second.first;
15627 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15628 if (BWSz == SrcBWSz) {
15629 VecOpcode = Instruction::BitCast;
15630 } else if (BWSz < SrcBWSz) {
15631 VecOpcode = Instruction::Trunc;
15632 } else if (It != MinBWs.end()) {
15633 assert(BWSz > SrcBWSz && "Invalid cast!");
15634 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15635 } else if (SrcIt != MinBWs.end()) {
15636 assert(BWSz > SrcBWSz && "Invalid cast!");
15637 VecOpcode =
15638 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15639 }
15640 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15641 !SrcIt->second.second) {
15642 VecOpcode = Instruction::UIToFP;
15643 }
15644 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15645 ? InVec
15646 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15647 V = FinalShuffle(V, E);
15648
15649 E->VectorizedValue = V;
15650 ++NumVectorInstructions;
15651 return V;
15652 }
15653 case Instruction::FCmp:
15654 case Instruction::ICmp: {
15655 setInsertPointAfterBundle(E);
15656
15657 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15658 if (E->VectorizedValue) {
15659 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15660 return E->VectorizedValue;
15661 }
15662 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15663 if (E->VectorizedValue) {
15664 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15665 return E->VectorizedValue;
15666 }
15667 if (L->getType() != R->getType()) {
15668 assert((getOperandEntry(E, 0)->isGather() ||
15669 getOperandEntry(E, 1)->isGather() ||
15670 MinBWs.contains(getOperandEntry(E, 0)) ||
15671 MinBWs.contains(getOperandEntry(E, 1))) &&
15672 "Expected item in MinBWs.");
15673 if (cast<VectorType>(L->getType())
15674 ->getElementType()
15675 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15676 ->getElementType()
15677 ->getIntegerBitWidth()) {
15678 Type *CastTy = R->getType();
15679 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15680 } else {
15681 Type *CastTy = L->getType();
15682 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15683 }
15684 }
15685
15686 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15687 Value *V = Builder.CreateCmp(P0, L, R);
15688 propagateIRFlags(V, E->Scalars, VL0);
15689 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15690 ICmp->setSameSign(/*B=*/false);
15691 // Do not cast for cmps.
15692 VecTy = cast<FixedVectorType>(V->getType());
15693 V = FinalShuffle(V, E);
15694
15695 E->VectorizedValue = V;
15696 ++NumVectorInstructions;
15697 return V;
15698 }
15699 case Instruction::Select: {
15700 setInsertPointAfterBundle(E);
15701
15702 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15703 if (E->VectorizedValue) {
15704 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15705 return E->VectorizedValue;
15706 }
15707 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15708 if (E->VectorizedValue) {
15709 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15710 return E->VectorizedValue;
15711 }
15712 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15713 if (E->VectorizedValue) {
15714 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15715 return E->VectorizedValue;
15716 }
15717 if (True->getType() != VecTy || False->getType() != VecTy) {
15718 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15719 getOperandEntry(E, 2)->isGather() ||
15720 MinBWs.contains(getOperandEntry(E, 1)) ||
15721 MinBWs.contains(getOperandEntry(E, 2))) &&
15722 "Expected item in MinBWs.");
15723 if (True->getType() != VecTy)
15724 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15725 if (False->getType() != VecTy)
15726 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15727 }
15728
15729 unsigned CondNumElements = getNumElements(Cond->getType());
15730 unsigned TrueNumElements = getNumElements(True->getType());
15731 assert(TrueNumElements >= CondNumElements &&
15732 TrueNumElements % CondNumElements == 0 &&
15733 "Cannot vectorize Instruction::Select");
15734 assert(TrueNumElements == getNumElements(False->getType()) &&
15735 "Cannot vectorize Instruction::Select");
15736 if (CondNumElements != TrueNumElements) {
15737 // When the return type is i1 but the source is a fixed vector type, we
15738 // need to duplicate the condition value.
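// For example (assumed REVEC shapes): with a <4 x i1> condition selecting
// between <8 x i32> values, the replication factor is 2 and
// createReplicatedMask(2, 4) yields <0,0,1,1,2,2,3,3>, widening the condition
// to one i1 per result element.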
15739 Cond = Builder.CreateShuffleVector(
15740 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15741 CondNumElements));
15742 }
15743 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15744 "Cannot vectorize Instruction::Select");
15745 Value *V = Builder.CreateSelect(Cond, True, False);
15746 V = FinalShuffle(V, E);
15747
15748 E->VectorizedValue = V;
15749 ++NumVectorInstructions;
15750 return V;
15751 }
15752 case Instruction::FNeg: {
15753 setInsertPointAfterBundle(E);
15754
15755 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15756
15757 if (E->VectorizedValue) {
15758 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15759 return E->VectorizedValue;
15760 }
15761
15762 Value *V = Builder.CreateUnOp(
15763 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15764 propagateIRFlags(V, E->Scalars, VL0);
15765 if (auto *I = dyn_cast<Instruction>(V))
15766 V = ::propagateMetadata(I, E->Scalars);
15767
15768 V = FinalShuffle(V, E);
15769
15770 E->VectorizedValue = V;
15771 ++NumVectorInstructions;
15772
15773 return V;
15774 }
15775 case Instruction::Freeze: {
15776 setInsertPointAfterBundle(E);
15777
15778 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15779
15780 if (E->VectorizedValue) {
15781 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15782 return E->VectorizedValue;
15783 }
15784
15785 if (Op->getType() != VecTy) {
15786 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15787 MinBWs.contains(getOperandEntry(E, 0))) &&
15788 "Expected item in MinBWs.");
15789 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15790 }
15791 Value *V = Builder.CreateFreeze(Op);
15792 V = FinalShuffle(V, E);
15793
15794 E->VectorizedValue = V;
15795 ++NumVectorInstructions;
15796
15797 return V;
15798 }
15799 case Instruction::Add:
15800 case Instruction::FAdd:
15801 case Instruction::Sub:
15802 case Instruction::FSub:
15803 case Instruction::Mul:
15804 case Instruction::FMul:
15805 case Instruction::UDiv:
15806 case Instruction::SDiv:
15807 case Instruction::FDiv:
15808 case Instruction::URem:
15809 case Instruction::SRem:
15810 case Instruction::FRem:
15811 case Instruction::Shl:
15812 case Instruction::LShr:
15813 case Instruction::AShr:
15814 case Instruction::And:
15815 case Instruction::Or:
15816 case Instruction::Xor: {
15817 setInsertPointAfterBundle(E);
15818
15819 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15820 if (E->VectorizedValue) {
15821 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15822 return E->VectorizedValue;
15823 }
15824 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15825 if (E->VectorizedValue) {
15826 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15827 return E->VectorizedValue;
15828 }
15829 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15830 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15831 ArrayRef<Value *> Ops = E->getOperand(I);
15832 if (all_of(Ops, [&](Value *Op) {
15833 auto *CI = dyn_cast<ConstantInt>(Op);
15834 return CI && CI->getValue().countr_one() >= It->second.first;
15835 })) {
15836 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15837 E->VectorizedValue = V;
15838 ++NumVectorInstructions;
15839 return V;
15840 }
15841 }
15842 }
15843 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15844 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15845 getOperandEntry(E, 1)->isGather() ||
15846 MinBWs.contains(getOperandEntry(E, 0)) ||
15847 MinBWs.contains(getOperandEntry(E, 1))) &&
15848 "Expected item in MinBWs.");
15849 if (LHS->getType() != VecTy)
15850 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15851 if (RHS->getType() != VecTy)
15852 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15853 }
15854
15855 Value *V = Builder.CreateBinOp(
15856 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15857 RHS);
15858 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15859 if (auto *I = dyn_cast<Instruction>(V)) {
15860 V = ::propagateMetadata(I, E->Scalars);
15861 // Drop nuw flags for abs(sub(commutative), true).
15862 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15863 any_of(E->Scalars, [](Value *V) {
15864 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15865 }))
15866 I->setHasNoUnsignedWrap(/*b=*/false);
15867 }
15868
15869 V = FinalShuffle(V, E);
15870
15871 E->VectorizedValue = V;
15872 ++NumVectorInstructions;
15873
15874 return V;
15875 }
15876 case Instruction::Load: {
15877 // Loads are inserted at the head of the tree because we don't want to
15878 // sink them all the way down past store instructions.
15879 setInsertPointAfterBundle(E);
15880
15881 LoadInst *LI = cast<LoadInst>(VL0);
15882 Instruction *NewLI;
15883 Value *PO = LI->getPointerOperand();
15884 if (E->State == TreeEntry::Vectorize) {
15885 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15886 } else if (E->State == TreeEntry::StridedVectorize) {
15887 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15888 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15889 PO = IsReverseOrder ? PtrN : Ptr0;
15890 std::optional<int> Diff = getPointersDiff(
15891 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15892 Type *StrideTy = DL->getIndexType(PO->getType());
15893 Value *StrideVal;
15894 if (Diff) {
15895 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15896 StrideVal =
15897 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15898 DL->getTypeAllocSize(ScalarTy));
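// Worked example (assumed layout): four i32 loads spaced 8 bytes apart give
// Diff == 6 elements, so Stride == 2 and StrideVal == 2 * 4 == 8 bytes (the
// sign is flipped for reverse order), which is the byte stride expected by the
// llvm.experimental.vp.strided.load intrinsic emitted below.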
15899 } else {
15900 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15901 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15902 return cast<LoadInst>(V)->getPointerOperand();
15903 });
15904 OrdersType Order;
15905 std::optional<Value *> Stride =
15906 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15907 &*Builder.GetInsertPoint());
15908 Value *NewStride =
15909 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15910 StrideVal = Builder.CreateMul(
15911 NewStride,
15912 ConstantInt::get(
15913 StrideTy,
15914 (IsReverseOrder ? -1 : 1) *
15915 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15916 }
15917 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15918 auto *Inst = Builder.CreateIntrinsic(
15919 Intrinsic::experimental_vp_strided_load,
15920 {VecTy, PO->getType(), StrideTy},
15921 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15922 Builder.getInt32(E->Scalars.size())});
15923 Inst->addParamAttr(
15924 /*ArgNo=*/0,
15925 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15926 NewLI = Inst;
15927 } else {
15928 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15929 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15930 if (E->VectorizedValue) {
15931 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15932 return E->VectorizedValue;
15933 }
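// REVEC-only path (sketch of the intent): when each "scalar" is itself a fixed
// vector, the gathered pointer vector has one lane per scalar, so it is
// widened with a replicated shuffle and per-lane GEPs (element index
// I % ScalarTyNumElements) before the masked gather below.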
15934 if (isa<FixedVectorType>(ScalarTy)) {
15935 assert(SLPReVec && "FixedVectorType is not expected.");
15936 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
15937 // need to expand VecPtr if ScalarTy is a vector type.
15938 unsigned ScalarTyNumElements =
15939 cast<FixedVectorType>(ScalarTy)->getNumElements();
15940 unsigned VecTyNumElements =
15941 cast<FixedVectorType>(VecTy)->getNumElements();
15942 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15943 "Cannot expand getelementptr.");
15944 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15945 SmallVector<Constant *> Indices(VecTyNumElements);
15946 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15947 return Builder.getInt64(I % ScalarTyNumElements);
15948 });
15949 VecPtr = Builder.CreateGEP(
15950 VecTy->getElementType(),
15951 Builder.CreateShuffleVector(
15952 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15953 ConstantVector::get(Indices));
15954 }
15955 // Use the minimum alignment of the gathered loads.
15956 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15957 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15958 }
15959 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15960
15961 V = FinalShuffle(V, E);
15962 E->VectorizedValue = V;
15963 ++NumVectorInstructions;
15964 return V;
15965 }
15966 case Instruction::Store: {
15967 auto *SI = cast<StoreInst>(VL0);
15968
15969 setInsertPointAfterBundle(E);
15970
15971 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15972 if (VecValue->getType() != VecTy)
15973 VecValue =
15974 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15975 VecValue = FinalShuffle(VecValue, E);
15976
15977 Value *Ptr = SI->getPointerOperand();
15978 Instruction *ST;
15979 if (E->State == TreeEntry::Vectorize) {
15980 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15981 } else {
15982 assert(E->State == TreeEntry::StridedVectorize &&
15983 "Expected either strided or consecutive stores.");
15984 if (!E->ReorderIndices.empty()) {
15985 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15986 Ptr = SI->getPointerOperand();
15987 }
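// Strided store path (as generated here for reversed consecutive stores): a
// constant negative stride of one element size in bytes is used, starting from
// the pointer of the first scalar after reordering, so lanes are written back
// to front.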
15988 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15989 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15990 auto *Inst = Builder.CreateIntrinsic(
15991 Intrinsic::experimental_vp_strided_store,
15992 {VecTy, Ptr->getType(), StrideTy},
15993 {VecValue, Ptr,
15994 ConstantInt::get(
15995 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15996 Builder.getAllOnesMask(VecTy->getElementCount()),
15997 Builder.getInt32(E->Scalars.size())});
15998 Inst->addParamAttr(
15999 /*ArgNo=*/1,
16000 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
16001 ST = Inst;
16002 }
16003
16004 Value *V = ::propagateMetadata(ST, E->Scalars);
16005
16006 E->VectorizedValue = V;
16007 ++NumVectorInstructions;
16008 return V;
16009 }
16010 case Instruction::GetElementPtr: {
16011 auto *GEP0 = cast<GetElementPtrInst>(VL0);
16012 setInsertPointAfterBundle(E);
16013
16014 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
16015 if (E->VectorizedValue) {
16016 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16017 return E->VectorizedValue;
16018 }
16019
16020 SmallVector<Value *> OpVecs;
16021 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
16022 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16023 if (E->VectorizedValue) {
16024 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16025 return E->VectorizedValue;
16026 }
16027 OpVecs.push_back(OpVec);
16028 }
16029
16030 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16031 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16032 SmallVector<Value *> GEPs;
16033 for (Value *V : E->Scalars) {
16034 if (isa<GetElementPtrInst>(V))
16035 GEPs.push_back(V);
16036 }
16037 V = ::propagateMetadata(I, GEPs);
16038 }
16039
16040 V = FinalShuffle(V, E);
16041
16042 E->VectorizedValue = V;
16043 ++NumVectorInstructions;
16044
16045 return V;
16046 }
16047 case Instruction::Call: {
16048 CallInst *CI = cast<CallInst>(VL0);
16049 setInsertPointAfterBundle(E);
16050
16051 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16052
16053 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
16054 CI, ID, VecTy->getNumElements(),
16055 It != MinBWs.end() ? It->second.first : 0, TTI);
16056 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16057 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16058 VecCallCosts.first <= VecCallCosts.second;
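// Cost-based choice (summary): VecCallCosts.first is assumed to be the cost of
// the vector intrinsic and VecCallCosts.second the cost of a vector library
// call; the intrinsic is used when it is recognized and not more expensive,
// otherwise a vectorized library function is looked up via VFDatabase below.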
16059
16060 Value *ScalarArg = nullptr;
16061 SmallVector<Value *> OpVecs;
16062 SmallVector<Type *, 2> TysForDecl;
16063 // Add return type if intrinsic is overloaded on it.
16064 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16065 TysForDecl.push_back(VecTy);
16066 auto *CEI = cast<CallInst>(VL0);
16067 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16068 ValueList OpVL;
16069 // Some intrinsics have scalar arguments. Such arguments should not be
16070 // vectorized.
16071 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16072 ScalarArg = CEI->getArgOperand(I);
16073 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16074 // argument must be set to false (do not return poison if the value is the signed min).
16075 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16076 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16077 ScalarArg = Builder.getFalse();
16078 OpVecs.push_back(ScalarArg);
16079 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16080 TysForDecl.push_back(ScalarArg->getType());
16081 continue;
16082 }
16083
16084 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16085 if (E->VectorizedValue) {
16086 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16087 return E->VectorizedValue;
16088 }
16089 ScalarArg = CEI->getArgOperand(I);
16090 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16091 ScalarArg->getType()->getScalarType() &&
16092 It == MinBWs.end()) {
16093 auto *CastTy =
16094 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16095 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16096 } else if (It != MinBWs.end()) {
16097 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16098 }
16099 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16100 OpVecs.push_back(OpVec);
16101 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16102 TysForDecl.push_back(OpVec->getType());
16103 }
16104
16105 Function *CF;
16106 if (!UseIntrinsic) {
16107 VFShape Shape =
16108 VFShape::get(CI->getFunctionType(),
16109 ElementCount::getFixed(
16110 static_cast<unsigned>(VecTy->getNumElements())),
16111 false /*HasGlobalPred*/);
16112 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16113 } else {
16114 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16115 }
16116
16117 SmallVector<OperandBundleDef, 1> OpBundles;
16118 CI->getOperandBundlesAsDefs(OpBundles);
16119 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16120
16121 propagateIRFlags(V, E->Scalars, VL0);
16122 V = FinalShuffle(V, E);
16123
16124 E->VectorizedValue = V;
16125 ++NumVectorInstructions;
16126 return V;
16127 }
16128 case Instruction::ShuffleVector: {
16129 Value *V;
16130 if (SLPReVec && !E->isAltShuffle()) {
16131 setInsertPointAfterBundle(E);
16132 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16133 if (E->VectorizedValue) {
16134 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16135 return E->VectorizedValue;
16136 }
16137 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16138 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16139 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16140 "Not supported shufflevector usage.");
16141 SmallVector<int> NewMask(ThisMask.size());
16142 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16143 return SVSrc->getShuffleMask()[Mask];
16144 });
16145 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16146 } else {
16147 V = Builder.CreateShuffleVector(Src, ThisMask);
16148 }
16149 propagateIRFlags(V, E->Scalars, VL0);
16150 if (auto *I = dyn_cast<Instruction>(V))
16151 V = ::propagateMetadata(I, E->Scalars);
16152 V = FinalShuffle(V, E);
16153 } else {
16154 assert(E->isAltShuffle() &&
16155 ((Instruction::isBinaryOp(E->getOpcode()) &&
16156 Instruction::isBinaryOp(E->getAltOpcode())) ||
16157 (Instruction::isCast(E->getOpcode()) &&
16158 Instruction::isCast(E->getAltOpcode())) ||
16159 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16160 "Invalid Shuffle Vector Operand");
16161
16162 Value *LHS = nullptr, *RHS = nullptr;
16163 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16164 setInsertPointAfterBundle(E);
16165 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16166 if (E->VectorizedValue) {
16167 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16168 return E->VectorizedValue;
16169 }
16170 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16171 } else {
16172 setInsertPointAfterBundle(E);
16173 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16174 }
16175 if (E->VectorizedValue) {
16176 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16177 return E->VectorizedValue;
16178 }
16179 if (LHS && RHS &&
16180 ((Instruction::isBinaryOp(E->getOpcode()) &&
16181 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16182 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16183 assert((It != MinBWs.end() ||
16184 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16185 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16186 MinBWs.contains(getOperandEntry(E, 0)) ||
16187 MinBWs.contains(getOperandEntry(E, 1))) &&
16188 "Expected item in MinBWs.");
16189 Type *CastTy = VecTy;
16190 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16191 if (cast<VectorType>(LHS->getType())
16192 ->getElementType()
16193 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16194 ->getElementType()
16195 ->getIntegerBitWidth())
16196 CastTy = RHS->getType();
16197 else
16198 CastTy = LHS->getType();
16199 }
16200 if (LHS->getType() != CastTy)
16201 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16202 if (RHS->getType() != CastTy)
16203 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16204 }
16205
16206 Value *V0, *V1;
16207 if (Instruction::isBinaryOp(E->getOpcode())) {
16208 V0 = Builder.CreateBinOp(
16209 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16210 V1 = Builder.CreateBinOp(
16211 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16212 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16213 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16214 auto *AltCI = cast<CmpInst>(E->getAltOp());
16215 CmpInst::Predicate AltPred = AltCI->getPredicate();
16216 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16217 } else {
16218 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16219 unsigned SrcBWSz = DL->getTypeSizeInBits(
16220 cast<VectorType>(LHS->getType())->getElementType());
16221 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16222 if (BWSz <= SrcBWSz) {
16223 if (BWSz < SrcBWSz)
16224 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16225 assert(LHS->getType() == VecTy &&
16226 "Expected same type as operand.");
16227 if (auto *I = dyn_cast<Instruction>(LHS))
16228 LHS = ::propagateMetadata(I, E->Scalars);
16229 LHS = FinalShuffle(LHS, E);
16230 E->VectorizedValue = LHS;
16231 ++NumVectorInstructions;
16232 return LHS;
16233 }
16234 }
16235 V0 = Builder.CreateCast(
16236 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16237 V1 = Builder.CreateCast(
16238 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16239 }
16240 // Add V0 and V1 to later analysis to try to find and remove matching
16241 // instruction, if any.
16242 for (Value *V : {V0, V1}) {
16243 if (auto *I = dyn_cast<Instruction>(V)) {
16244 GatherShuffleExtractSeq.insert(I);
16245 CSEBlocks.insert(I->getParent());
16246 }
16247 }
16248
16249 // Create shuffle to take alternate operations from the vector.
16250 // Also, gather up main and alt scalar ops to propagate IR flags to
16251 // each vector operation.
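// Illustration (assumed VF = 4, alternating add/sub): V0 holds the adds, V1
// the subs, and buildAltOpShuffleMask produces Mask = <0, 5, 2, 7>, picking
// even lanes from V0 and odd lanes from V1.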
16252 ValueList OpScalars, AltScalars;
16253 SmallVector<int> Mask;
16254 E->buildAltOpShuffleMask(
16255 [E, this](Instruction *I) {
16256 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16257 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16258 *TLI);
16259 },
16260 Mask, &OpScalars, &AltScalars);
16261
16262 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16263 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16264 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16265 // Drop nuw flags for abs(sub(commutative), true).
16266 if (auto *I = dyn_cast<Instruction>(Vec);
16267 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16268 any_of(E->Scalars, [](Value *V) {
16269 if (isa<PoisonValue>(V))
16270 return false;
16271 auto *IV = cast<Instruction>(V);
16272 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16273 }))
16274 I->setHasNoUnsignedWrap(/*b=*/false);
16275 };
16276 DropNuwFlag(V0, E->getOpcode());
16277 DropNuwFlag(V1, E->getAltOpcode());
16278
16279 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16280 assert(SLPReVec && "FixedVectorType is not expected.");
16281 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16282 }
16283 V = Builder.CreateShuffleVector(V0, V1, Mask);
16284 if (auto *I = dyn_cast<Instruction>(V)) {
16285 V = ::propagateMetadata(I, E->Scalars);
16286 GatherShuffleExtractSeq.insert(I);
16287 CSEBlocks.insert(I->getParent());
16288 }
16289 }
16290
16291 E->VectorizedValue = V;
16292 ++NumVectorInstructions;
16293
16294 return V;
16295 }
16296 default:
16297 llvm_unreachable("unknown inst");
16298 }
16299 return nullptr;
16300}
16301
16302 Value *BoUpSLP::vectorizeTree() {
16303 ExtraValueToDebugLocsMap ExternallyUsedValues;
16304 return vectorizeTree(ExternallyUsedValues);
16305}
16306
16307Value *
16308 BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16309 Instruction *ReductionRoot) {
16310 // All blocks must be scheduled before any instructions are inserted.
16311 for (auto &BSIter : BlocksSchedules) {
16312 scheduleBlock(BSIter.second.get());
16313 }
16314 // Clear the Entry-to-LastInstruction table; it can be invalidated by
16315 // scheduling and needs to be rebuilt.
16316 EntryToLastInstruction.clear();
16317
16318 if (ReductionRoot)
16319 Builder.SetInsertPoint(ReductionRoot->getParent(),
16320 ReductionRoot->getIterator());
16321 else
16322 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16323
16324 // Emit gathered loads first to emit better code for the users of those
16325 // gathered loads.
16326 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16327 if (GatheredLoadsEntriesFirst.has_value() &&
16328 TE->Idx >= *GatheredLoadsEntriesFirst &&
16329 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16330 assert((!TE->UserTreeIndices.empty() ||
16331 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16332 "Expected gathered load node.");
16333 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16334 }
16335 }
16336 // Postpone emission of PHI operands to avoid cyclic dependency issues.
16337 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16338 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16339 if (TE->State == TreeEntry::Vectorize &&
16340 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16341 TE->VectorizedValue)
16342 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16343 // Run through the list of postponed gathers and emit them, replacing the temp
16344 // emitted allocas with actual vector instructions.
16345 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16346 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16347 for (const TreeEntry *E : PostponedNodes) {
16348 auto *TE = const_cast<TreeEntry *>(E);
16349 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16350 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16351 TE->UserTreeIndices.front().EdgeIdx)) &&
16352 VecTE->isSame(TE->Scalars))
16353 // Found gather node which is absolutely the same as one of the
16354 // vectorized nodes. It may happen after reordering.
16355 continue;
16356 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16357 TE->VectorizedValue = nullptr;
16358 auto *UserI =
16359 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16360 // If the user is a PHI node, its vector code has to be inserted right before
16361 // the block terminator. Since the node was delayed, there were some unresolved
16362 // dependencies at the moment the stub instruction was emitted. If any of
16363 // these dependencies turns out to be an operand of another PHI coming from
16364 // this same block, the position of the stub instruction becomes invalid.
16365 // This is because the source vector that is supposed to feed this gather node
16366 // was inserted at the end of the block [after the stub instruction]. So we
16367 // need to adjust the insertion point again, to the end of the block.
16368 if (isa<PHINode>(UserI)) {
16369 // Insert before all users.
16370 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16371 for (User *U : PrevVec->users()) {
16372 if (U == UserI)
16373 continue;
16374 auto *UI = dyn_cast<Instruction>(U);
16375 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16376 continue;
16377 if (UI->comesBefore(InsertPt))
16378 InsertPt = UI;
16379 }
16380 Builder.SetInsertPoint(InsertPt);
16381 } else {
16382 Builder.SetInsertPoint(PrevVec);
16383 }
16384 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16385 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16386 if (auto *VecI = dyn_cast<Instruction>(Vec);
16387 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16388 Builder.GetInsertPoint()->comesBefore(VecI))
16389 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16390 Builder.GetInsertPoint());
16391 if (Vec->getType() != PrevVec->getType()) {
16392 assert(Vec->getType()->isIntOrIntVectorTy() &&
16393 PrevVec->getType()->isIntOrIntVectorTy() &&
16394 "Expected integer vector types only.");
16395 std::optional<bool> IsSigned;
16396 for (Value *V : TE->Scalars) {
16397 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16398 auto It = MinBWs.find(BaseTE);
16399 if (It != MinBWs.end()) {
16400 IsSigned = IsSigned.value_or(false) || It->second.second;
16401 if (*IsSigned)
16402 break;
16403 }
16404 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16405 auto It = MinBWs.find(MNTE);
16406 if (It != MinBWs.end()) {
16407 IsSigned = IsSigned.value_or(false) || It->second.second;
16408 if (*IsSigned)
16409 break;
16410 }
16411 }
16412 if (IsSigned.value_or(false))
16413 break;
16414 // Scan through gather nodes.
16415 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16416 auto It = MinBWs.find(BVE);
16417 if (It != MinBWs.end()) {
16418 IsSigned = IsSigned.value_or(false) || It->second.second;
16419 if (*IsSigned)
16420 break;
16421 }
16422 }
16423 if (IsSigned.value_or(false))
16424 break;
16425 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16426 IsSigned =
16427 IsSigned.value_or(false) ||
16428 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16429 continue;
16430 }
16431 if (IsSigned.value_or(false))
16432 break;
16433 }
16434 }
16435 if (IsSigned.value_or(false)) {
16436 // Final attempt - check user node.
16437 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16438 if (It != MinBWs.end())
16439 IsSigned = It->second.second;
16440 }
16441 assert(IsSigned &&
16442 "Expected user node or perfect diamond match in MinBWs.");
16443 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16444 }
16445 PrevVec->replaceAllUsesWith(Vec);
16446 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16447 // Replace the stub vector node, if it was used before for one of the
16448 // buildvector nodes already.
16449 auto It = PostponedValues.find(PrevVec);
16450 if (It != PostponedValues.end()) {
16451 for (TreeEntry *VTE : It->getSecond())
16452 VTE->VectorizedValue = Vec;
16453 }
16454 eraseInstruction(PrevVec);
16455 }
16456
16457 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16458 << " values .\n");
16459 
16460 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16461 // Maps vector instruction to original insertelement instruction
16462 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16463 // Maps extract Scalar to the corresponding extractelement instruction in the
16464 // basic block. Only one extractelement per block should be emitted.
16465 DenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16466 ScalarToEEs;
16467 SmallDenseSet<Value *, 4> UsedInserts;
16468 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
16469 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16470 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16471 // Extract all of the elements with the external uses.
16472 for (const auto &ExternalUse : ExternalUses) {
16473 Value *Scalar = ExternalUse.Scalar;
16474 llvm::User *User = ExternalUse.User;
16475
16476 // Skip users that we already RAUW. This happens when one instruction
16477 // has multiple uses of the same value.
16478 if (User && !is_contained(Scalar->users(), User))
16479 continue;
16480 TreeEntry *E = getTreeEntry(Scalar);
16481 assert(E && "Invalid scalar");
16482 assert(!E->isGather() && "Extracting from a gather list");
16483 // Non-instruction pointers are not deleted, just skip them.
16484 if (E->getOpcode() == Instruction::GetElementPtr &&
16485 !isa<GetElementPtrInst>(Scalar))
16486 continue;
16487
16488 Value *Vec = E->VectorizedValue;
16489 assert(Vec && "Can't find vectorizable value");
16490
16491 Value *Lane = Builder.getInt32(ExternalUse.Lane);
16492 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16493 if (Scalar->getType() != Vec->getType()) {
16494 Value *Ex = nullptr;
16495 Value *ExV = nullptr;
16496 auto *Inst = dyn_cast<Instruction>(Scalar);
16497 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16498 auto It = ScalarToEEs.find(Scalar);
16499 if (It != ScalarToEEs.end()) {
16500 // No need to emit many extracts, just move the only one in the
16501 // current block.
16502 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16503 : Builder.GetInsertBlock());
16504 if (EEIt != It->second.end()) {
16505 Value *PrevV = EEIt->second.first;
16506 if (auto *I = dyn_cast<Instruction>(PrevV);
16507 I && !ReplaceInst &&
16508 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16509 Builder.GetInsertPoint()->comesBefore(I)) {
16510 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16511 Builder.GetInsertPoint());
16512 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16513 CI->moveAfter(I);
16514 }
16515 Ex = PrevV;
16516 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16517 }
16518 }
16519 if (!Ex) {
16520 // "Reuse" the existing extract to improve final codegen.
16521 if (ReplaceInst) {
16522 // Leave the instruction as is if it is a cheap extract and all of its
16523 // operands are scalar.
16524 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16525 IgnoredExtracts.insert(EE);
16526 Ex = EE;
16527 } else {
16528 auto *CloneInst = Inst->clone();
16529 CloneInst->insertBefore(Inst);
16530 if (Inst->hasName())
16531 CloneInst->takeName(Inst);
16532 Ex = CloneInst;
16533 }
16534 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16535 ES && isa<Instruction>(Vec)) {
16536 Value *V = ES->getVectorOperand();
16537 auto *IVec = cast<Instruction>(Vec);
16538 if (const TreeEntry *ETE = getTreeEntry(V))
16539 V = ETE->VectorizedValue;
16540 if (auto *IV = dyn_cast<Instruction>(V);
16541 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16542 IV->comesBefore(IVec))
16543 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16544 else
16545 Ex = Builder.CreateExtractElement(Vec, Lane);
16546 } else if (auto *VecTy =
16547 dyn_cast<FixedVectorType>(Scalar->getType())) {
16548 assert(SLPReVec && "FixedVectorType is not expected.");
16549 unsigned VecTyNumElements = VecTy->getNumElements();
16550 // When REVEC is enabled, we need to extract a vector.
16551 // Note: The element size of Scalar may be different from the
16552 // element size of Vec.
16553 Ex = Builder.CreateExtractVector(
16555 VecTyNumElements),
16556 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
16557 } else {
16558 Ex = Builder.CreateExtractElement(Vec, Lane);
16559 }
16560 // If necessary, sign-extend or zero-extend the extracted value to
16561 // Scalar's original, larger type.
16562 ExV = Ex;
16563 if (Scalar->getType() != Ex->getType())
16564 ExV = Builder.CreateIntCast(
16565 Ex, Scalar->getType(),
16566 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16567 auto *I = dyn_cast<Instruction>(Ex);
16568 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16569 : &F->getEntryBlock(),
16570 std::make_pair(Ex, ExV));
16571 }
16572 // The then-branch of the previous if may produce constants, since operand
16573 // 0 might be a constant.
16574 if (auto *ExI = dyn_cast<Instruction>(Ex);
16575 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16576 GatherShuffleExtractSeq.insert(ExI);
16577 CSEBlocks.insert(ExI->getParent());
16578 }
16579 return ExV;
16580 }
16581 assert(isa<FixedVectorType>(Scalar->getType()) &&
16582 isa<InsertElementInst>(Scalar) &&
16583 "In-tree scalar of vector type is not insertelement?");
16584 auto *IE = cast<InsertElementInst>(Scalar);
16585 VectorToInsertElement.try_emplace(Vec, IE);
16586 return Vec;
16587 };
16588 // If User == nullptr, the Scalar remains a scalar in the vectorized
16589 // instructions or is used as an extra argument. Generate an ExtractElement
16590 // instruction and update the record for this scalar in ExternallyUsedValues.
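// E.g. (illustrative) a scalar %s vectorized into lane 1 of %vec that is also
// used as a reduction extra argument gets a single per-block extract such as
//   %s.ext = extractelement <4 x i32> %vec, i32 1
// and the remaining scalar uses of %s are rewritten to %s.ext.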
16591 if (!User) {
16592 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16593 continue;
16594 assert((ExternallyUsedValues.count(Scalar) ||
16595 Scalar->hasNUsesOrMore(UsesLimit) ||
16596 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16597 any_of(Scalar->users(),
16598 [&](llvm::User *U) {
16599 if (ExternalUsesAsOriginalScalar.contains(U))
16600 return true;
16601 TreeEntry *UseEntry = getTreeEntry(U);
16602 return UseEntry &&
16603 (UseEntry->State == TreeEntry::Vectorize ||
16604 UseEntry->State ==
16605 TreeEntry::StridedVectorize) &&
16606 (E->State == TreeEntry::Vectorize ||
16607 E->State == TreeEntry::StridedVectorize) &&
16608 doesInTreeUserNeedToExtract(
16609 Scalar, getRootEntryInstruction(*UseEntry),
16610 TLI, TTI);
16611 })) &&
16612 "Scalar with nullptr User must be registered in "
16613 "ExternallyUsedValues map or remain as scalar in vectorized "
16614 "instructions");
16615 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16616 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16617 if (PHI->getParent()->isLandingPad())
16618 Builder.SetInsertPoint(
16619 PHI->getParent(),
16620 std::next(
16621 PHI->getParent()->getLandingPadInst()->getIterator()));
16622 else
16623 Builder.SetInsertPoint(PHI->getParent(),
16624 PHI->getParent()->getFirstNonPHIIt());
16625 } else {
16626 Builder.SetInsertPoint(VecI->getParent(),
16627 std::next(VecI->getIterator()));
16628 }
16629 } else {
16630 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16631 }
16632 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16633 // Required to update internally referenced instructions.
16634 if (Scalar != NewInst) {
16635 assert((!isa<ExtractElementInst>(Scalar) ||
16636 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16637 "Extractelements should not be replaced.");
16638 Scalar->replaceAllUsesWith(NewInst);
16639 }
16640 continue;
16641 }
16642
16643 if (auto *VU = dyn_cast<InsertElementInst>(User);
16644 VU && VU->getOperand(1) == Scalar) {
16645 // Skip if the scalar is another vector op or Vec is not an instruction.
16646 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16647 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16648 if (!UsedInserts.insert(VU).second)
16649 continue;
16650 // Need to use original vector, if the root is truncated.
16651 auto BWIt = MinBWs.find(E);
16652 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16653 auto *ScalarTy = FTy->getElementType();
16654 auto Key = std::make_pair(Vec, ScalarTy);
16655 auto VecIt = VectorCasts.find(Key);
16656 if (VecIt == VectorCasts.end()) {
16657 IRBuilderBase::InsertPointGuard Guard(Builder);
16658 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16659 if (IVec->getParent()->isLandingPad())
16660 Builder.SetInsertPoint(IVec->getParent(),
16661 std::next(IVec->getParent()
16662 ->getLandingPadInst()
16663 ->getIterator()));
16664 else
16665 Builder.SetInsertPoint(
16666 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16667 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16668 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16669 }
16670 Vec = Builder.CreateIntCast(
16671 Vec,
16672 getWidenedType(
16673 ScalarTy,
16674 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16675 BWIt->second.second);
16676 VectorCasts.try_emplace(Key, Vec);
16677 } else {
16678 Vec = VecIt->second;
16679 }
16680 }
16681
16682 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16683 if (InsertIdx) {
16684 auto *It = find_if(
16685 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16686 // Checks if 2 insertelements are from the same buildvector.
16687 InsertElementInst *VecInsert = Data.InsertElements.front();
16688 return areTwoInsertFromSameBuildVector(
16689 VU, VecInsert,
16690 [](InsertElementInst *II) { return II->getOperand(0); });
16691 });
16692 unsigned Idx = *InsertIdx;
16693 if (It == ShuffledInserts.end()) {
16694 (void)ShuffledInserts.emplace_back();
16695 It = std::next(ShuffledInserts.begin(),
16696 ShuffledInserts.size() - 1);
16697 }
16698 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16699 if (Mask.empty())
16700 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16701 Mask[Idx] = ExternalUse.Lane;
16702 It->InsertElements.push_back(cast<InsertElementInst>(User));
16703 continue;
16704 }
16705 }
16706 }
16707 }
16708
16709 // Generate extracts for out-of-tree users.
16710 // Find the insertion point for the extractelement lane.
16711 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16712 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16713 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16714 if (PH->getIncomingValue(I) == Scalar) {
16715 Instruction *IncomingTerminator =
16716 PH->getIncomingBlock(I)->getTerminator();
16717 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16718 Builder.SetInsertPoint(VecI->getParent(),
16719 std::next(VecI->getIterator()));
16720 } else {
16721 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16722 }
16723 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16724 PH->setOperand(I, NewInst);
16725 }
16726 }
16727 } else {
16728 Builder.SetInsertPoint(cast<Instruction>(User));
16729 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16730 User->replaceUsesOfWith(Scalar, NewInst);
16731 }
16732 } else {
16733 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16734 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16735 User->replaceUsesOfWith(Scalar, NewInst);
16736 }
16737
16738 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16739 }
16740
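// The CreateShuffle helper below splits a mask addressing two sources into
// per-operand masks. E.g. with two 4-wide inputs, mask <0, 5, 2, 7> becomes
// <0, poison, 2, poison> on V1 and <poison, 1, poison, 3> on V2 before the
// ShuffleInstructionBuilder combines them.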
16741 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16742 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16743 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16744 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16745 for (int I = 0, E = Mask.size(); I < E; ++I) {
16746 if (Mask[I] < VF)
16747 CombinedMask1[I] = Mask[I];
16748 else
16749 CombinedMask2[I] = Mask[I] - VF;
16750 }
16751 ShuffleInstructionBuilder ShuffleBuilder(
16752 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16753 ShuffleBuilder.add(V1, CombinedMask1);
16754 if (V2)
16755 ShuffleBuilder.add(V2, CombinedMask2);
16756 return ShuffleBuilder.finalize({}, {}, {});
16757 };
16758
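// ResizeToVF below adjusts Vec to the width of the mask. E.g. an 8-wide
// vector addressed through the 4-element mask <0, 1, poison, poison> is
// shuffled down to 4 lanes (keeping elements 0 and 1 in place) so the later
// shuffles operate on inputs of the expected width.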
16759 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16760 bool ForSingleMask) {
16761 unsigned VF = Mask.size();
16762 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16763 if (VF != VecVF) {
16764 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16765 Vec = CreateShuffle(Vec, nullptr, Mask);
16766 return std::make_pair(Vec, true);
16767 }
16768 if (!ForSingleMask) {
16769 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16770 for (unsigned I = 0; I < VF; ++I) {
16771 if (Mask[I] != PoisonMaskElem)
16772 ResizeMask[Mask[I]] = Mask[I];
16773 }
16774 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16775 }
16776 }
16777
16778 return std::make_pair(Vec, false);
16779 };
16780 // Perform shuffling of the vectorized tree entries for better handling of
16781 // external extracts.
16782 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16783 // Find the first and the last instruction in the list of insertelements.
16784 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16785 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16786 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16787 Builder.SetInsertPoint(LastInsert);
16788 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16789 Value *NewInst = performExtractsShuffleAction<Value>(
16790 MutableArrayRef(Vector.data(), Vector.size()),
16791 FirstInsert->getOperand(0),
16792 [](Value *Vec) {
16793 return cast<VectorType>(Vec->getType())
16794 ->getElementCount()
16795 .getKnownMinValue();
16796 },
16797 ResizeToVF,
16798 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16799 ArrayRef<Value *> Vals) {
16800 assert((Vals.size() == 1 || Vals.size() == 2) &&
16801 "Expected exactly 1 or 2 input values.");
16802 if (Vals.size() == 1) {
16803 // Do not create shuffle if the mask is a simple identity
16804 // non-resizing mask.
16805 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16806 ->getNumElements() ||
16807 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16808 return CreateShuffle(Vals.front(), nullptr, Mask);
16809 return Vals.front();
16810 }
16811 return CreateShuffle(Vals.front() ? Vals.front()
16812 : FirstInsert->getOperand(0),
16813 Vals.back(), Mask);
16814 });
16815 auto It = ShuffledInserts[I].InsertElements.rbegin();
16816 // Rebuild buildvector chain.
16817 InsertElementInst *II = nullptr;
16818 if (It != ShuffledInserts[I].InsertElements.rend())
16819 II = *It;
16820 SmallVector<Instruction *> Inserts;
16821 while (It != ShuffledInserts[I].InsertElements.rend()) {
16822 assert(II && "Must be an insertelement instruction.");
16823 if (*It == II)
16824 ++It;
16825 else
16826 Inserts.push_back(cast<Instruction>(II));
16827 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16828 }
16829 for (Instruction *II : reverse(Inserts)) {
16830 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16831 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16832 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16833 II->moveAfter(NewI);
16834 NewInst = II;
16835 }
16836 LastInsert->replaceAllUsesWith(NewInst);
16837 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16838 IE->replaceUsesOfWith(IE->getOperand(0),
16839 PoisonValue::get(IE->getOperand(0)->getType()));
16840 IE->replaceUsesOfWith(IE->getOperand(1),
16841 PoisonValue::get(IE->getOperand(1)->getType()));
16842 eraseInstruction(IE);
16843 }
16844 CSEBlocks.insert(LastInsert->getParent());
16845 }
16846
16847 SmallVector<Instruction *> RemovedInsts;
16848 // For each vectorized value:
16849 for (auto &TEPtr : VectorizableTree) {
16850 TreeEntry *Entry = TEPtr.get();
16851
16852 // No need to handle users of gathered values.
16853 if (Entry->isGather())
16854 continue;
16855
16856 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16857
16858 // For each lane:
16859 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16860 Value *Scalar = Entry->Scalars[Lane];
16861
16862 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16863 !isa<GetElementPtrInst>(Scalar))
16864 continue;
16865 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16866 EE && IgnoredExtracts.contains(EE))
16867 continue;
16868 if (isa<PoisonValue>(Scalar))
16869 continue;
16870#ifndef NDEBUG
16871 Type *Ty = Scalar->getType();
16872 if (!Ty->isVoidTy()) {
16873 for (User *U : Scalar->users()) {
16874 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16875
16876 // It is legal to delete users in the ignorelist.
16877 assert((getTreeEntry(U) ||
16878 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16879 (isa_and_nonnull<Instruction>(U) &&
16880 isDeleted(cast<Instruction>(U)))) &&
16881 "Deleting out-of-tree value");
16882 }
16883 }
16884#endif
16885 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16886 auto *I = cast<Instruction>(Scalar);
16887 RemovedInsts.push_back(I);
16888 }
16889 }
16890
16891 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16892 // new vector instruction.
16893 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16894 V->mergeDIAssignID(RemovedInsts);
16895
16896 // Clear up reduction references, if any.
16897 if (UserIgnoreList) {
16898 for (Instruction *I : RemovedInsts) {
16899 const TreeEntry *IE = getTreeEntry(I);
16900 if (IE->Idx != 0 &&
16901 !(VectorizableTree.front()->isGather() &&
16902 !IE->UserTreeIndices.empty() &&
16903 (ValueToGatherNodes.lookup(I).contains(
16904 VectorizableTree.front().get()) ||
16905 any_of(IE->UserTreeIndices,
16906 [&](const EdgeInfo &EI) {
16907 return EI.UserTE == VectorizableTree.front().get() &&
16908 EI.EdgeIdx == UINT_MAX;
16909 }))) &&
16910 !(GatheredLoadsEntriesFirst.has_value() &&
16911 IE->Idx >= *GatheredLoadsEntriesFirst &&
16912 VectorizableTree.front()->isGather() &&
16913 is_contained(VectorizableTree.front()->Scalars, I)))
16914 continue;
16915 SmallVector<SelectInst *> LogicalOpSelects;
16916 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16917 // Do not replace the condition of a logical op written in select <cond> form.
16918 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16919 (match(U.getUser(), m_LogicalAnd()) ||
16920 match(U.getUser(), m_LogicalOr())) &&
16921 U.getOperandNo() == 0;
16922 if (IsPoisoningLogicalOp) {
16923 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16924 return false;
16925 }
16926 return UserIgnoreList->contains(U.getUser());
16927 });
16928 // Replace conditions of the poisoning logical ops with the non-poison
16929 // constant value.
16930 for (SelectInst *SI : LogicalOpSelects)
16931 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16932 }
16933 }
16934 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16935 // cache correctness.
16936 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
16937 // - instructions are not deleted until later.
16938 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16939
16940 Builder.ClearInsertionPoint();
16941 InstrElementSize.clear();
16942
16943 const TreeEntry &RootTE = *VectorizableTree.front();
16944 Value *Vec = RootTE.VectorizedValue;
16945 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16946 It != MinBWs.end() &&
16947 ReductionBitWidth != It->second.first) {
16948 IRBuilder<>::InsertPointGuard Guard(Builder);
16949 Builder.SetInsertPoint(ReductionRoot->getParent(),
16950 ReductionRoot->getIterator());
16951 Vec = Builder.CreateIntCast(
16952 Vec,
16953 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16954 cast<VectorType>(Vec->getType())->getElementCount()),
16955 It->second.second);
16956 }
16957 return Vec;
16958}
16959
16960 void BoUpSLP::optimizeGatherSequence() {
16961 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16962 << " gather sequences instructions.\n");
16963 // LICM InsertElementInst sequences.
16964 for (Instruction *I : GatherShuffleExtractSeq) {
16965 if (isDeleted(I))
16966 continue;
16967
16968 // Check if this block is inside a loop.
16969 Loop *L = LI->getLoopFor(I->getParent());
16970 if (!L)
16971 continue;
16972
16973 // Check if it has a preheader.
16974 BasicBlock *PreHeader = L->getLoopPreheader();
16975 if (!PreHeader)
16976 continue;
16977
16978 // If the vector or the element that we insert into it are
16979 // instructions that are defined in this basic block then we can't
16980 // hoist this instruction.
16981 if (any_of(I->operands(), [L](Value *V) {
16982 auto *OpI = dyn_cast<Instruction>(V);
16983 return OpI && L->contains(OpI);
16984 }))
16985 continue;
16986
16987 // We can hoist this instruction. Move it to the pre-header.
16988 I->moveBefore(PreHeader->getTerminator());
16989 CSEBlocks.insert(PreHeader);
16990 }
16991
16992 // Make a list of all reachable blocks in our CSE queue.
16993 SmallVector<const DomTreeNode *, 8> CSEWorkList;
16994 CSEWorkList.reserve(CSEBlocks.size());
16995 for (BasicBlock *BB : CSEBlocks)
16996 if (DomTreeNode *N = DT->getNode(BB)) {
16997 assert(DT->isReachableFromEntry(N->getBlock()));
16998 CSEWorkList.push_back(N);
16999 }
17000
17001 // Sort blocks by domination. This ensures we visit a block after all blocks
17002 // dominating it are visited.
17003 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
17004 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
17005 "Different nodes should have different DFS numbers");
17006 return A->getDFSNumIn() < B->getDFSNumIn();
17007 });
17008
17009 // Less defined shuffles can be replaced by the more defined copies.
17010 // Between two shuffles, one is less defined if it has the same vector operands
17011 // and each of its mask indices is either the same as in the other one or undef.
17012 // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than
17013 // shuffle %0, poison, <0, 0, 0, 0>.
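// When such a pair is found below, the two masks are merged (undef lanes take
// the value from the other mask), the surviving shuffle receives the merged
// mask via setShuffleMask(), and the redundant copy is removed.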
17014 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
17015 Instruction *I2,
17016 SmallVectorImpl<int> &NewMask) {
17017 if (I1->getType() != I2->getType())
17018 return false;
17019 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17020 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17021 if (!SI1 || !SI2)
17022 return I1->isIdenticalTo(I2);
17023 if (SI1->isIdenticalTo(SI2))
17024 return true;
17025 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
17026 if (SI1->getOperand(I) != SI2->getOperand(I))
17027 return false;
17028 // Check if the second instruction is more defined than the first one.
17029 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17030 ArrayRef<int> SM1 = SI1->getShuffleMask();
17031 // Count trailing undefs in the mask to check the final number of used
17032 // registers.
17033 unsigned LastUndefsCnt = 0;
17034 for (int I = 0, E = NewMask.size(); I < E; ++I) {
17035 if (SM1[I] == PoisonMaskElem)
17036 ++LastUndefsCnt;
17037 else
17038 LastUndefsCnt = 0;
17039 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
17040 NewMask[I] != SM1[I])
17041 return false;
17042 if (NewMask[I] == PoisonMaskElem)
17043 NewMask[I] = SM1[I];
17044 }
17045 // Check if the last undefs actually change the final number of used vector
17046 // registers.
17047 return SM1.size() - LastUndefsCnt > 1 &&
17048 TTI->getNumberOfParts(SI1->getType()) ==
17049 TTI->getNumberOfParts(
17050 getWidenedType(SI1->getType()->getElementType(),
17051 SM1.size() - LastUndefsCnt));
17052 };
17053 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
17054 // instructions. TODO: We can further optimize this scan if we split the
17055 // instructions into different buckets based on the insert lane.
17056 SmallVector<Instruction *, 16> Visited;
17057 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
17058 assert(*I &&
17059 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17060 "Worklist not sorted properly!");
17061 BasicBlock *BB = (*I)->getBlock();
17062 // For all instructions in blocks containing gather sequences:
17063 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17064 if (isDeleted(&In))
17065 continue;
17066 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17067 !GatherShuffleExtractSeq.contains(&In))
17068 continue;
17069
17070 // Check if we can replace this instruction with any of the
17071 // visited instructions.
17072 bool Replaced = false;
17073 for (Instruction *&V : Visited) {
17074 SmallVector<int> NewMask;
17075 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17076 DT->dominates(V->getParent(), In.getParent())) {
17077 In.replaceAllUsesWith(V);
17078 eraseInstruction(&In);
17079 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17080 if (!NewMask.empty())
17081 SI->setShuffleMask(NewMask);
17082 Replaced = true;
17083 break;
17084 }
17085 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17086 GatherShuffleExtractSeq.contains(V) &&
17087 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17088 DT->dominates(In.getParent(), V->getParent())) {
17089 In.moveAfter(V);
17090 V->replaceAllUsesWith(&In);
17091 eraseInstruction(V);
17092 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17093 if (!NewMask.empty())
17094 SI->setShuffleMask(NewMask);
17095 V = &In;
17096 Replaced = true;
17097 break;
17098 }
17099 }
17100 if (!Replaced) {
17101 assert(!is_contained(Visited, &In));
17102 Visited.push_back(&In);
17103 }
17104 }
17105 }
17106 CSEBlocks.clear();
17107 GatherShuffleExtractSeq.clear();
17108}
17109
17110BoUpSLP::ScheduleData *
17111BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17112 ScheduleData *Bundle = nullptr;
17113 ScheduleData *PrevInBundle = nullptr;
17114 for (Value *V : VL) {
17115 if (doesNotNeedToBeScheduled(V))
17116 continue;
17117 ScheduleData *BundleMember = getScheduleData(V);
17118 assert(BundleMember &&
17119 "no ScheduleData for bundle member "
17120 "(maybe not in same basic block)");
17121 assert(BundleMember->isSchedulingEntity() &&
17122 "bundle member already part of other bundle");
17123 if (PrevInBundle) {
17124 PrevInBundle->NextInBundle = BundleMember;
17125 } else {
17126 Bundle = BundleMember;
17127 }
17128
17129 // Group the instructions to a bundle.
17130 BundleMember->FirstInBundle = Bundle;
17131 PrevInBundle = BundleMember;
17132 }
17133 assert(Bundle && "Failed to find schedule bundle");
17134 return Bundle;
17135}
17136
17137 // Groups the instructions into a bundle (which is then a single scheduling
17138 // entity) and schedules instructions until the bundle is ready.
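// E.g. when four consecutive stores are vectorized together, their four
// ScheduleData nodes are chained via FirstInBundle/NextInBundle by
// buildBundle() above and then scheduled (or cancelled) as a single unit here.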
17139std::optional<BoUpSLP::ScheduleData *>
17140BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17141 const InstructionsState &S) {
17142 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17143 // instructions.
17144 if (isa<PHINode>(S.getMainOp()) ||
17145 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
17146 return nullptr;
17147
17148 // Initialize the instruction bundle.
17149 Instruction *OldScheduleEnd = ScheduleEnd;
17150 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17151
17152 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17153 ScheduleData *Bundle) {
17154 // The scheduling region got new instructions at the lower end (or it is a
17155 // new region for the first bundle). This makes it necessary to
17156 // recalculate all dependencies.
17157 // It is seldom that this needs to be done a second time after adding the
17158 // initial bundle to the region.
17159 if (ScheduleEnd != OldScheduleEnd) {
17160 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17161 if (ScheduleData *SD = getScheduleData(I))
17162 SD->clearDependencies();
17163 ReSchedule = true;
17164 }
17165 if (Bundle) {
17166 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17167 << " in block " << BB->getName() << "\n");
17168 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17169 }
17170
17171 if (ReSchedule) {
17172 resetSchedule();
17173 initialFillReadyList(ReadyInsts);
17174 }
17175
17176 // Now try to schedule the new bundle or (if no bundle) just calculate
17177 // dependencies. As soon as the bundle is "ready" it means that there are no
17178 // cyclic dependencies and we can schedule it. Note that it's important that we
17179 // don't "schedule" the bundle yet (see cancelScheduling).
17180 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17181 !ReadyInsts.empty()) {
17182 ScheduleData *Picked = ReadyInsts.pop_back_val();
17183 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17184 "must be ready to schedule");
17185 schedule(Picked, ReadyInsts);
17186 }
17187 };
17188
17189 // Make sure that the scheduling region contains all
17190 // instructions of the bundle.
17191 for (Value *V : VL) {
17192 if (doesNotNeedToBeScheduled(V))
17193 continue;
17194 if (!extendSchedulingRegion(V, S)) {
17195 // If the scheduling region got new instructions at the lower end (or it
17196 // is a new region for the first bundle), all dependencies must be
17197 // recalculated before giving up on this bundle.
17198 // Otherwise the compiler may crash trying to incorrectly calculate
17199 // dependencies and emit instructions in the wrong order at the actual
17200 // scheduling.
17201 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17202 return std::nullopt;
17203 }
17204 }
17205
17206 bool ReSchedule = false;
17207 for (Value *V : VL) {
17208 if (doesNotNeedToBeScheduled(V))
17209 continue;
17210 ScheduleData *BundleMember = getScheduleData(V);
17211 assert(BundleMember &&
17212 "no ScheduleData for bundle member (maybe not in same basic block)");
17213
17214 // Make sure we don't leave the pieces of the bundle in the ready list when
17215 // the whole bundle might not be ready.
17216 ReadyInsts.remove(BundleMember);
17217
17218 if (!BundleMember->IsScheduled)
17219 continue;
17220 // A bundle member was scheduled as single instruction before and now
17221 // needs to be scheduled as part of the bundle. We just get rid of the
17222 // existing schedule.
17223 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17224 << " was already scheduled\n");
17225 ReSchedule = true;
17226 }
17227
17228 auto *Bundle = buildBundle(VL);
17229 TryScheduleBundleImpl(ReSchedule, Bundle);
17230 if (!Bundle->isReady()) {
17231 cancelScheduling(VL, S.getMainOp());
17232 return std::nullopt;
17233 }
17234 return Bundle;
17235}
17236
17237void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17238 Value *OpValue) {
17239 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17240 doesNotNeedToBeScheduled(OpValue))
17241 return;
17242
17243 if (doesNotNeedToBeScheduled(OpValue))
17244 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17245 ScheduleData *Bundle = getScheduleData(OpValue);
17246 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17247 assert(!Bundle->IsScheduled &&
17248 "Can't cancel bundle which is already scheduled");
17249 assert(Bundle->isSchedulingEntity() &&
17250 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17251 "tried to unbundle something which is not a bundle");
17252
17253 // Remove the bundle from the ready list.
17254 if (Bundle->isReady())
17255 ReadyInsts.remove(Bundle);
17256
17257 // Un-bundle: make single instructions out of the bundle.
17258 ScheduleData *BundleMember = Bundle;
17259 while (BundleMember) {
17260 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17261 BundleMember->FirstInBundle = BundleMember;
17262 ScheduleData *Next = BundleMember->NextInBundle;
17263 BundleMember->NextInBundle = nullptr;
17264 BundleMember->TE = nullptr;
17265 if (BundleMember->unscheduledDepsInBundle() == 0) {
17266 ReadyInsts.insert(BundleMember);
17267 }
17268 BundleMember = Next;
17269 }
17270}
17271
17272BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17273 // Allocate a new ScheduleData for the instruction.
17274 if (ChunkPos >= ChunkSize) {
17275 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17276 ChunkPos = 0;
17277 }
17278 return &(ScheduleDataChunks.back()[ChunkPos++]);
17279}
17280
17281bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17282 Value *V, const InstructionsState &S) {
17283 Instruction *I = dyn_cast<Instruction>(V);
17284 assert(I && "bundle member must be an instruction");
17285 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17287 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17288 "be scheduled");
17289 if (getScheduleData(I))
17290 return true;
17291 if (!ScheduleStart) {
17292 // It's the first instruction in the new region.
17293 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17294 ScheduleStart = I;
17295 ScheduleEnd = I->getNextNode();
17296 assert(ScheduleEnd && "tried to vectorize a terminator?");
17297 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17298 return true;
17299 }
17300 // Search up and down at the same time, because we don't know if the new
17301 // instruction is above or below the existing scheduling region.
17302 // Ignore debug info (and other "AssumeLike" intrinsics) so they are not counted
17303 // against the budget. Otherwise debug info could affect codegen.
17304 BasicBlock::reverse_iterator UpIter =
17305 ++ScheduleStart->getIterator().getReverse();
17306 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17307 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17308 BasicBlock::iterator LowerEnd = BB->end();
17309 auto IsAssumeLikeIntr = [](const Instruction &I) {
17310 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17311 return II->isAssumeLikeIntrinsic();
17312 return false;
17313 };
17314 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17315 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17316 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17317 &*DownIter != I) {
17318 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17319 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17320 return false;
17321 }
17322
17323 ++UpIter;
17324 ++DownIter;
17325
17326 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17327 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17328 }
17329 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17330 assert(I->getParent() == ScheduleStart->getParent() &&
17331 "Instruction is in wrong basic block.");
17332 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17333 ScheduleStart = I;
17334 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17335 << "\n");
17336 return true;
17337 }
17338 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17339 "Expected to reach top of the basic block or instruction down the "
17340 "lower end.");
17341 assert(I->getParent() == ScheduleEnd->getParent() &&
17342 "Instruction is in wrong basic block.");
17343 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17344 nullptr);
17345 ScheduleEnd = I->getNextNode();
17346 assert(ScheduleEnd && "tried to vectorize a terminator?");
17347 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17348 return true;
17349}
17350
17351void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17352 Instruction *ToI,
17353 ScheduleData *PrevLoadStore,
17354 ScheduleData *NextLoadStore) {
17355 ScheduleData *CurrentLoadStore = PrevLoadStore;
17356 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17357 // No need to allocate data for non-schedulable instructions.
17358 if (doesNotNeedToBeScheduled(I))
17359 continue;
17360 ScheduleData *SD = ScheduleDataMap.lookup(I);
17361 if (!SD) {
17362 SD = allocateScheduleDataChunks();
17363 ScheduleDataMap[I] = SD;
17364 }
17365 assert(!isInSchedulingRegion(SD) &&
17366 "new ScheduleData already in scheduling region");
17367 SD->init(SchedulingRegionID, I);
17368
17369 if (I->mayReadOrWriteMemory() &&
17370 (!isa<IntrinsicInst>(I) ||
17371 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17372 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17373 Intrinsic::pseudoprobe))) {
17374 // Update the linked list of memory accessing instructions.
17375 if (CurrentLoadStore) {
17376 CurrentLoadStore->NextLoadStore = SD;
17377 } else {
17378 FirstLoadStoreInRegion = SD;
17379 }
17380 CurrentLoadStore = SD;
17381 }
17382
17383 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17384 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17385 RegionHasStackSave = true;
17386 }
17387 if (NextLoadStore) {
17388 if (CurrentLoadStore)
17389 CurrentLoadStore->NextLoadStore = NextLoadStore;
17390 } else {
17391 LastLoadStoreInRegion = CurrentLoadStore;
17392 }
17393}
17394
17395void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17396 bool InsertInReadyList,
17397 BoUpSLP *SLP) {
17398 assert(SD->isSchedulingEntity());
17399
17400 SmallVector<ScheduleData *, 10> WorkList;
17401 WorkList.push_back(SD);
17402
17403 while (!WorkList.empty()) {
17404 ScheduleData *SD = WorkList.pop_back_val();
17405 for (ScheduleData *BundleMember = SD; BundleMember;
17406 BundleMember = BundleMember->NextInBundle) {
17407 assert(isInSchedulingRegion(BundleMember));
17408 if (BundleMember->hasValidDependencies())
17409 continue;
17410
17411 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17412 << "\n");
17413 BundleMember->Dependencies = 0;
17414 BundleMember->resetUnscheduledDeps();
17415
17416 // Handle def-use chain dependencies.
17417 for (User *U : BundleMember->Inst->users()) {
17418 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17419 BundleMember->Dependencies++;
17420 ScheduleData *DestBundle = UseSD->FirstInBundle;
17421 if (!DestBundle->IsScheduled)
17422 BundleMember->incrementUnscheduledDeps(1);
17423 if (!DestBundle->hasValidDependencies())
17424 WorkList.push_back(DestBundle);
17425 }
17426 }
17427
17428 auto MakeControlDependent = [&](Instruction *I) {
17429 auto *DepDest = getScheduleData(I);
17430 assert(DepDest && "must be in schedule window");
17431 DepDest->ControlDependencies.push_back(BundleMember);
17432 BundleMember->Dependencies++;
17433 ScheduleData *DestBundle = DepDest->FirstInBundle;
17434 if (!DestBundle->IsScheduled)
17435 BundleMember->incrementUnscheduledDeps(1);
17436 if (!DestBundle->hasValidDependencies())
17437 WorkList.push_back(DestBundle);
17438 };
17439
17440 // Any instruction which isn't safe to speculate at the beginning of the
17441 // block is control dependent on any early exit or non-willreturn call
17442 // which precedes it.
17443 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17444 for (Instruction *I = BundleMember->Inst->getNextNode();
17445 I != ScheduleEnd; I = I->getNextNode()) {
17446 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17447 continue;
17448
17449 // Add the dependency
17450 MakeControlDependent(I);
17451
17452 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17453 // Everything past here must be control dependent on I.
17454 break;
17455 }
17456 }
17457
17458 if (RegionHasStackSave) {
17459 // If we have an inalloca alloca instruction, it needs to be scheduled
17460 // after any preceding stacksave. We also need to prevent any alloca
17461 // from reordering above a preceding stackrestore.
17462 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17463 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17464 for (Instruction *I = BundleMember->Inst->getNextNode();
17465 I != ScheduleEnd; I = I->getNextNode()) {
17466 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17467 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17468 // Any allocas past here must be control dependent on I, and I
17469 // must be memory dependent on BundleMember->Inst.
17470 break;
17471
17472 if (!isa<AllocaInst>(I))
17473 continue;
17474
17475 // Add the dependency
17476 MakeControlDependent(I);
17477 }
17478 }
17479
17480 // In addition to the cases handled just above, we need to prevent
17481 // allocas and loads/stores from moving below a stacksave or a
17482 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17483 // believed to be merely conservative. Moving loads/stores below a
17484 // stackrestore can lead to incorrect code.
17485 if (isa<AllocaInst>(BundleMember->Inst) ||
17486 BundleMember->Inst->mayReadOrWriteMemory()) {
17487 for (Instruction *I = BundleMember->Inst->getNextNode();
17488 I != ScheduleEnd; I = I->getNextNode()) {
17489 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17490 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17491 continue;
17492
17493 // Add the dependency
17494 MakeControlDependent(I);
17495 break;
17496 }
17497 }
17498 }
17499
17500 // Handle the memory dependencies (if any).
17501 ScheduleData *DepDest = BundleMember->NextLoadStore;
17502 if (!DepDest)
17503 continue;
17504 Instruction *SrcInst = BundleMember->Inst;
17505 assert(SrcInst->mayReadOrWriteMemory() &&
17506 "NextLoadStore list for non memory effecting bundle?");
17507 MemoryLocation SrcLoc = getLocation(SrcInst);
17508 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17509 unsigned NumAliased = 0;
17510 unsigned DistToSrc = 1;
17511
17512 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17513 assert(isInSchedulingRegion(DepDest));
17514
17515 // We have two limits to reduce the complexity:
17516 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17517 // SLP->isAliased (which is the expensive part in this loop).
17518 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17519 // the whole loop (even if the loop is fast, it's quadratic).
17520 // It's important for the loop break condition (see below) to
17521 // check this limit even between two read-only instructions.
17522 if (DistToSrc >= MaxMemDepDistance ||
17523 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17524 (NumAliased >= AliasedCheckLimit ||
17525 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17526
17527 // We increment the counter only if the locations are aliased
17528 // (instead of counting all alias checks). This gives a better
17529 // balance between reduced runtime and accurate dependencies.
17530 NumAliased++;
17531
17532 DepDest->MemoryDependencies.push_back(BundleMember);
17533 BundleMember->Dependencies++;
17534 ScheduleData *DestBundle = DepDest->FirstInBundle;
17535 if (!DestBundle->IsScheduled) {
17536 BundleMember->incrementUnscheduledDeps(1);
17537 }
17538 if (!DestBundle->hasValidDependencies()) {
17539 WorkList.push_back(DestBundle);
17540 }
17541 }
17542
17543 // Example, explaining the loop break condition: Let's assume our
17544 // starting instruction is i0 and MaxMemDepDistance = 3.
17545 //
17546 // +--------v--v--v
17547 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17548 // +--------^--^--^
17549 //
17550 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17551 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17552 // Previously we already added dependencies from i3 to i6,i7,i8
17553 // (because of MaxMemDepDistance). As we added a dependency from
17554 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17555 // and we can abort this loop at i6.
17556 if (DistToSrc >= 2 * MaxMemDepDistance)
17557 break;
17558 DistToSrc++;
17559 }
17560 }
17561 if (InsertInReadyList && SD->isReady()) {
17562 ReadyInsts.insert(SD);
17563 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17564 << "\n");
17565 }
17566 }
17567}
17568
17569void BoUpSLP::BlockScheduling::resetSchedule() {
17570 assert(ScheduleStart &&
17571 "tried to reset schedule on block which has not been scheduled");
17572 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17573 if (ScheduleData *SD = getScheduleData(I)) {
17574 assert(isInSchedulingRegion(SD) &&
17575 "ScheduleData not in scheduling region");
17576 SD->IsScheduled = false;
17577 SD->resetUnscheduledDeps();
17578 }
17579 }
17580 ReadyInsts.clear();
17581}
17582
17583void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17584 if (!BS->ScheduleStart)
17585 return;
17586
17587 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17588
17589 // A key point - if we got here, pre-scheduling was able to find a valid
17590 // scheduling of the sub-graph of the scheduling window which consists
17591 // of all vector bundles and their transitive users. As such, we do not
17592 // need to reschedule anything *outside of* that subgraph.
17593
17594 BS->resetSchedule();
17595
17596 // For the real scheduling we use a more sophisticated ready-list: it is
17597 // sorted by the original instruction location. This lets the final schedule
17598 // be as close as possible to the original instruction order.
17599 // WARNING: If changing this order causes a correctness issue, that means
17600 // there is some missing dependence edge in the schedule data graph.
17601 struct ScheduleDataCompare {
17602 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17603 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17604 }
17605 };
17606 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17607
17608 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17609 // and fill the ready-list with initial instructions.
17610 int Idx = 0;
17611 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17612 I = I->getNextNode()) {
17613 if (ScheduleData *SD = BS->getScheduleData(I)) {
17614 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17615 (void)SDTE;
17616 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17617 SD->isPartOfBundle() ==
17618 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17619 "scheduler and vectorizer bundle mismatch");
17620 SD->FirstInBundle->SchedulingPriority = Idx++;
17621
17622 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17623 BS->calculateDependencies(SD, false, this);
17624 }
17625 }
17626 BS->initialFillReadyList(ReadyInsts);
17627
17628 Instruction *LastScheduledInst = BS->ScheduleEnd;
17629
17630 // Do the "real" scheduling.
17631 while (!ReadyInsts.empty()) {
17632 ScheduleData *Picked = *ReadyInsts.begin();
17633 ReadyInsts.erase(ReadyInsts.begin());
17634
17635 // Move the scheduled instruction(s) to their dedicated places, if not
17636 // there yet.
17637 for (ScheduleData *BundleMember = Picked; BundleMember;
17638 BundleMember = BundleMember->NextInBundle) {
17639 Instruction *PickedInst = BundleMember->Inst;
17640 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17641 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17642 LastScheduledInst = PickedInst;
17643 }
17644
17645 BS->schedule(Picked, ReadyInsts);
17646 }
17647
17648 // Check that we didn't break any of our invariants.
17649#ifdef EXPENSIVE_CHECKS
17650 BS->verify();
17651#endif
17652
17653#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17654 // Check that all schedulable entities got scheduled
17655 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17656 ScheduleData *SD = BS->getScheduleData(I);
17657 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17658 assert(SD->IsScheduled && "must be scheduled at this point");
17659 }
17660#endif
17661
17662 // Avoid duplicate scheduling of the block.
17663 BS->ScheduleStart = nullptr;
17664}
17665 
17666 unsigned BoUpSLP::getVectorElementSize(Value *V) {
17667 // If V is a store, just return the width of the stored value (or value
17668 // truncated just before storing) without traversing the expression tree.
17669 // This is the common case.
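// E.g. for "store i16 %t, ptr %p" this returns 16, even if %t was produced
// by wider (e.g. i32) arithmetic and truncated just before the store.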
17670 if (auto *Store = dyn_cast<StoreInst>(V))
17671 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17672
17673 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17674 return getVectorElementSize(IEI->getOperand(1));
17675
17676 auto E = InstrElementSize.find(V);
17677 if (E != InstrElementSize.end())
17678 return E->second;
17679
17680 // If V is not a store, we can traverse the expression tree to find loads
17681 // that feed it. The type of the loaded value may indicate a more suitable
17682 // width than V's type. We want to base the vector element size on the width
17683 // of memory operations where possible.
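// E.g. if V is an i32 add fed (through casts) only by i8 loads, the computed
// width is 8, the width of the loads, rather than 32.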
17684 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17685 SmallPtrSet<Instruction *, 16> Visited;
17686 if (auto *I = dyn_cast<Instruction>(V)) {
17687 Worklist.emplace_back(I, I->getParent(), 0);
17688 Visited.insert(I);
17689 }
17690
17691 // Traverse the expression tree in bottom-up order looking for loads. If we
17692 // encounter an instruction we don't yet handle, we give up.
17693 auto Width = 0u;
17694 Value *FirstNonBool = nullptr;
17695 while (!Worklist.empty()) {
17696 auto [I, Parent, Level] = Worklist.pop_back_val();
17697
17698 // We should only be looking at scalar instructions here. If the current
17699 // instruction has a vector type, skip.
17700 auto *Ty = I->getType();
17701 if (isa<VectorType>(Ty))
17702 continue;
17703 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17704 FirstNonBool = I;
17705 if (Level > RecursionMaxDepth)
17706 continue;
17707
17708 // If the current instruction is a load, update MaxWidth to reflect the
17709 // width of the loaded value.
17710 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17711 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17712
17713 // Otherwise, we need to visit the operands of the instruction. We only
17714 // handle the interesting cases from buildTree here. If an operand is an
17715 // instruction we haven't yet visited and from the same basic block as the
17716 // user or the use is a PHI node, we add it to the worklist.
17717 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
17718 BinaryOperator, UnaryOperator>(I)) {
17719 for (Use &U : I->operands()) {
17720 if (auto *J = dyn_cast<Instruction>(U.get()))
17721 if (Visited.insert(J).second &&
17722 (isa<PHINode>(I) || J->getParent() == Parent)) {
17723 Worklist.emplace_back(J, J->getParent(), Level + 1);
17724 continue;
17725 }
17726 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17727 FirstNonBool = U.get();
17728 }
17729 } else {
17730 break;
17731 }
17732 }
17733
17734 // If we didn't encounter a memory access in the expression tree, or if we
17735 // gave up for some reason, just return the width of V. Otherwise, return the
17736 // maximum width we found.
17737 if (!Width) {
17738 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17739 V = FirstNonBool;
17740 Width = DL->getTypeSizeInBits(V->getType());
17741 }
17742
17743 for (Instruction *I : Visited)
17744 InstrElementSize[I] = Width;
17745
17746 return Width;
17747}
17748
17749bool BoUpSLP::collectValuesToDemote(
17750 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17751 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
17752 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17753 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17754 // We can always demote constants.
17755 if (all_of(E.Scalars, IsaPred<Constant>))
17756 return true;
17757
17758 unsigned OrigBitWidth =
17759 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17760 if (OrigBitWidth == BitWidth) {
17761 MaxDepthLevel = 1;
17762 return true;
17763 }
17764
17765 // Check if the node was analyzed already and must keep its original bitwidth.
17766 if (NodesToKeepBWs.contains(E.Idx))
17767 return false;
17768
17769 // If the value is not a vectorized instruction in the expression and not used
17770 // by the insertelement instruction and not used in multiple vector nodes, it
17771 // cannot be demoted.
17772 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17773 if (isa<PoisonValue>(R))
17774 return false;
17775 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17776 });
17777 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17778 if (isa<PoisonValue>(V))
17779 return true;
17780 if (MultiNodeScalars.contains(V))
17781 return false;
17782 // For a shuffle of sext/zext values with many uses, we need to check the
17783 // extra bit for unsigned values; otherwise we may end up with incorrect
17784 // casts for reused scalars.
17785 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17786 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17787 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17788 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17789 return true;
17790 }
17791 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17792 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17793 if (IsSignedNode)
17794 ++BitWidth1;
17795 if (auto *I = dyn_cast<Instruction>(V)) {
17796 APInt Mask = DB->getDemandedBits(I);
17797 unsigned BitWidth2 =
17798 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17799 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17800 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17801 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17802 break;
17803 BitWidth2 *= 2;
17804 }
17805 BitWidth1 = std::min(BitWidth1, BitWidth2);
17806 }
17807 BitWidth = std::max(BitWidth, BitWidth1);
17808 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17809 };
17810 auto FinalAnalysis = [&, TTI = TTI]() {
17811 if (!IsProfitableToDemote)
17812 return false;
17813 bool Res = all_of(
17814 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17815 // Demote gathers.
17816 if (Res && E.isGather()) {
17817 // Check possible extractelement instructions bases and final vector
17818 // length.
17819 SmallPtrSet<Value *, 4> UniqueBases;
17820 for (Value *V : E.Scalars) {
17821 auto *EE = dyn_cast<ExtractElementInst>(V);
17822 if (!EE)
17823 continue;
17824 UniqueBases.insert(EE->getVectorOperand());
17825 }
17826 const unsigned VF = E.Scalars.size();
17827 Type *OrigScalarTy = E.Scalars.front()->getType();
17828 if (UniqueBases.size() <= 2 ||
17829 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17830 TTI->getNumberOfParts(getWidenedType(
17831 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17832 ToDemote.push_back(E.Idx);
17833 }
17834 return Res;
17835 };
17836 if (E.isGather() || !Visited.insert(&E).second ||
17837 any_of(E.Scalars, [&](Value *V) {
17838 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17839 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17840 });
17841 }))
17842 return FinalAnalysis();
17843
17844 if (any_of(E.Scalars, [&](Value *V) {
17845 return !all_of(V->users(), [=](User *U) {
17846 return getTreeEntry(U) ||
17847 (E.Idx == 0 && UserIgnoreList &&
17848 UserIgnoreList->contains(U)) ||
17849 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17850 !U->getType()->isScalableTy() &&
17851 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17852 }) && !IsPotentiallyTruncated(V, BitWidth);
17853 }))
17854 return false;
17855
17856 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17857 bool &NeedToExit) {
17858 NeedToExit = false;
17859 unsigned InitLevel = MaxDepthLevel;
17860 for (const TreeEntry *Op : Operands) {
17861 unsigned Level = InitLevel;
17862 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17863 ToDemote, Visited, NodesToKeepBWs, Level,
17864 IsProfitableToDemote, IsTruncRoot)) {
17865 if (!IsProfitableToDemote)
17866 return false;
17867 NeedToExit = true;
17868 if (!FinalAnalysis())
17869 return false;
17870 continue;
17871 }
17872 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17873 }
17874 return true;
17875 };
17876 auto AttemptCheckBitwidth =
17877 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17878 // Try all bitwidth < OrigBitWidth.
17879 NeedToExit = false;
17880 unsigned BestFailBitwidth = 0;
17881 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17882 if (Checker(BitWidth, OrigBitWidth))
17883 return true;
17884 if (BestFailBitwidth == 0 && FinalAnalysis())
17885 BestFailBitwidth = BitWidth;
17886 }
17887 if (BitWidth >= OrigBitWidth) {
17888 if (BestFailBitwidth == 0) {
17889 BitWidth = OrigBitWidth;
17890 return false;
17891 }
17892 MaxDepthLevel = 1;
17893 BitWidth = BestFailBitwidth;
17894 NeedToExit = true;
17895 return true;
17896 }
17897 return false;
17898 };
17899 auto TryProcessInstruction =
17900 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17901 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17902 if (Operands.empty()) {
17903 if (!IsTruncRoot)
17904 MaxDepthLevel = 1;
17905 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17906 std::ref(BitWidth)));
17907 } else {
17908 // Several vectorized uses? Check if we can truncate it, otherwise -
17909 // exit.
17910 if (E.UserTreeIndices.size() > 1 &&
17911 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17912 std::ref(BitWidth))))
17913 return false;
17914 bool NeedToExit = false;
17915 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17916 return false;
17917 if (NeedToExit)
17918 return true;
17919 if (!ProcessOperands(Operands, NeedToExit))
17920 return false;
17921 if (NeedToExit)
17922 return true;
17923 }
17924
17925 ++MaxDepthLevel;
17926 // Record the entry that we can demote.
17927 ToDemote.push_back(E.Idx);
17928 return IsProfitableToDemote;
17929 };
17930 switch (E.getOpcode()) {
17931
17932 // We can always demote truncations and extensions. Since truncations can
17933 // seed additional demotion, we save the truncated value.
17934 case Instruction::Trunc:
17935 if (IsProfitableToDemoteRoot)
17936 IsProfitableToDemote = true;
17937 return TryProcessInstruction(BitWidth);
17938 case Instruction::ZExt:
17939 case Instruction::SExt:
17940 IsProfitableToDemote = true;
17941 return TryProcessInstruction(BitWidth);
17942
17943 // We can demote certain binary operations if we can demote both of their
17944 // operands.
17945 case Instruction::Add:
17946 case Instruction::Sub:
17947 case Instruction::Mul:
17948 case Instruction::And:
17949 case Instruction::Or:
17950 case Instruction::Xor: {
17951 return TryProcessInstruction(
17952 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17953 }
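 // For example (illustrative, not from a test): if %a and %b are i32 values
 // whose upper 24 bits are never demanded, an (add i32 %a, %b) node can be
 // demoted to an i8 add, provided both operand nodes can be demoted as well.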
17954 case Instruction::Freeze:
17955 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17956 case Instruction::Shl: {
17957 // If we are truncating the result of this SHL, and if it's a shift of an
17958 // in-range amount, we can always perform a SHL in a smaller type.
17959 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17960 return all_of(E.Scalars, [&](Value *V) {
17961 if (isa<PoisonValue>(V))
17962 return true;
17963 auto *I = cast<Instruction>(V);
17964 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17965 return AmtKnownBits.getMaxValue().ult(BitWidth);
17966 });
17967 };
17968 return TryProcessInstruction(
17969 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17970 }
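 // Illustrative IR (assumed example):
 //   %s = shl i32 %x, 3
 //   %t = trunc i32 %s to i8
 // Because the shift amount 3 is known to be < 8, the same low 8 bits are
 // produced by shifting in the narrow type: shl i8 (trunc %x to i8), 3.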
17971 case Instruction::LShr: {
17972 // If this is a truncate of a logical shr, we can truncate it to a smaller
17973 // lshr iff we know that the bits we would otherwise be shifting in are
17974 // already zeros.
17975 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17976 return all_of(E.Scalars, [&](Value *V) {
17977 if (isa<PoisonValue>(V))
17978 return true;
17979 auto *I = cast<Instruction>(V);
17980 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17981 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17982 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17983 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17984 SimplifyQuery(*DL));
17985 });
17986 };
17987 return TryProcessInstruction(
17988 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17989 LShrChecker);
17990 }
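 // Illustrative example (assumed): truncating (lshr i32 %x, 2) to i8 is only
 // rewritten as an i8 lshr if the checker can prove that bits 8..31 of %x are
 // zero, so that nothing non-zero is shifted into the low 8 bits.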
17991 case Instruction::AShr: {
17992 // If this is a truncate of an arithmetic shr, we can truncate it to a
17993 // smaller ashr iff we know that all the bits from the sign bit of the
17994 // original type and the sign bit of the truncate type are similar.
17995 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17996 return all_of(E.Scalars, [&](Value *V) {
17997 if (isa<PoisonValue>(V))
17998 return true;
17999 auto *I = cast<Instruction>(V);
18000 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
18001 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18002 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18003 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18004 nullptr, DT);
18005 });
18006 };
18007 return TryProcessInstruction(
18008 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18009 AShrChecker);
18010 }
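 // Illustrative example (assumed): truncating (ashr i32 %x, 2) to i8 can be
 // rewritten as an i8 ashr if %x has at least 32 - 8 + 1 = 25 sign bits, i.e.
 // the value already behaves like a sign-extended i8, so the bits shifted in
 // from above match the narrow sign bit.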
18011 case Instruction::UDiv:
18012 case Instruction::URem: {
18013 // UDiv and URem can be truncated if all the truncated bits are zero.
18014 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18015 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18016 return all_of(E.Scalars, [&](Value *V) {
18017 auto *I = cast<Instruction>(V);
18018 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18019 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18020 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18021 });
18022 };
18023 return TryProcessInstruction(
18024 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18025 }
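 // For example (assumed values): a (udiv i32 %a, %b) where both operands are
 // known to have their upper 24 bits zero (both fit in 8 bits) produces the
 // same result as an i8 udiv on the truncated operands, which is exactly what
 // the mask check above verifies.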
18026
18027 // We can demote selects if we can demote their true and false values.
18028 case Instruction::Select: {
18029 return TryProcessInstruction(
18030 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18031 }
18032
18033 // We can demote phis if we can demote all their incoming operands. Note that
18034 // we don't need to worry about cycles since we ensure single use above.
18035 case Instruction::PHI: {
18036 const unsigned NumOps = E.getNumOperands();
18037 SmallVector<const TreeEntry *> Ops(NumOps);
18038 transform(seq<unsigned>(0, NumOps), Ops.begin(),
18039 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
18040
18041 return TryProcessInstruction(BitWidth, Ops);
18042 }
18043
18044 case Instruction::Call: {
18045 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18046 if (!IC)
18047 break;
18048 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
18049 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
18050 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
18051 break;
18052 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
18053 function_ref<bool(unsigned, unsigned)> CallChecker;
18054 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18055 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18056 return all_of(E.Scalars, [&](Value *V) {
18057 auto *I = cast<Instruction>(V);
18058 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18059 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18060 return MaskedValueIsZero(I->getOperand(0), Mask,
18061 SimplifyQuery(*DL)) &&
18062 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18063 }
18064 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18065 "Expected min/max intrinsics only.");
18066 unsigned SignBits = OrigBitWidth - BitWidth;
18067 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18068 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18069 nullptr, DT);
18070 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18071 nullptr, DT);
18072 return SignBits <= Op0SignBits &&
18073 ((SignBits != Op0SignBits &&
18074 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18075 MaskedValueIsZero(I->getOperand(0), Mask,
18076 SimplifyQuery(*DL))) &&
18077 SignBits <= Op1SignBits &&
18078 ((SignBits != Op1SignBits &&
18079 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18080 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18081 });
18082 };
18083 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18084 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18085 return all_of(E.Scalars, [&](Value *V) {
18086 auto *I = cast<Instruction>(V);
18087 unsigned SignBits = OrigBitWidth - BitWidth;
18088 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18089 unsigned Op0SignBits =
18090 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18091 return SignBits <= Op0SignBits &&
18092 ((SignBits != Op0SignBits &&
18093 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18094 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18095 });
18096 };
18097 if (ID != Intrinsic::abs) {
18098 Operands.push_back(getOperandEntry(&E, 1));
18099 CallChecker = CompChecker;
18100 } else {
18101 CallChecker = AbsChecker;
18102 }
18103 InstructionCost BestCost =
18104 std::numeric_limits<InstructionCost::CostType>::max();
18105 unsigned BestBitWidth = BitWidth;
18106 unsigned VF = E.Scalars.size();
18107 // Choose the best bitwidth based on cost estimations.
18108 auto Checker = [&](unsigned BitWidth, unsigned) {
18109 unsigned MinBW = PowerOf2Ceil(BitWidth);
18110 SmallVector<Type *> ArgTys =
18111 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18112 auto VecCallCosts = getVectorCallCosts(
18113 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18114 TTI, TLI, ArgTys);
18115 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18116 if (Cost < BestCost) {
18117 BestCost = Cost;
18118 BestBitWidth = BitWidth;
18119 }
18120 return false;
18121 };
18122 [[maybe_unused]] bool NeedToExit;
18123 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18124 BitWidth = BestBitWidth;
18125 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18126 }
18127
18128 // Otherwise, conservatively give up.
18129 default:
18130 break;
18131 }
18132 MaxDepthLevel = 1;
18133 return FinalAnalysis();
18134}
18135
18136static RecurKind getRdxKind(Value *V);
18137
18138 void BoUpSLP::computeMinimumValueSizes() {
18139 // We only attempt to truncate integer expressions.
18140 bool IsStoreOrInsertElt =
18141 VectorizableTree.front()->getOpcode() == Instruction::Store ||
18142 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
18143 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18144 ExtraBitWidthNodes.size() <= 1 &&
18145 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18146 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18147 return;
18148
18149 unsigned NodeIdx = 0;
18150 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18151 NodeIdx = 1;
18152
18153 // Ensure the roots of the vectorizable tree don't form a cycle.
18154 if (VectorizableTree[NodeIdx]->isGather() ||
18155 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18156 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18157 [NodeIdx](const EdgeInfo &EI) {
18158 return EI.UserTE->Idx > NodeIdx;
18159 })))
18160 return;
18161
18162 // If the first value node for a store/insertelement is a sext/zext/trunc,
18163 // skip it and resize to the final type.
18164 bool IsTruncRoot = false;
18165 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18166 SmallVector<unsigned> RootDemotes;
18167 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18168 if (NodeIdx != 0 &&
18169 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18170 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18171 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18172 IsTruncRoot = true;
18173 RootDemotes.push_back(NodeIdx);
18174 IsProfitableToDemoteRoot = true;
18175 ++NodeIdx;
18176 }
18177
18178 // The reduction was already analyzed and found not profitable - exit.
18179 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18180 return;
18181
18182 SmallVector<unsigned> ToDemote;
18183 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
18184 bool IsProfitableToDemoteRoot, unsigned Opcode,
18185 unsigned Limit, bool IsTruncRoot,
18186 bool IsSignedCmp) -> unsigned {
18187 ToDemote.clear();
18188 // If the root is a trunc and the next node is a gather/buildvector, keep the
18189 // trunc in scalars, which is free in most cases.
18190 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18191 !NodesToKeepBWs.contains(E.Idx) &&
18192 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18193 all_of(E.Scalars, [&](Value *V) {
18194 return V->hasOneUse() || isa<Constant>(V) ||
18195 (!V->hasNUsesOrMore(UsesLimit) &&
18196 none_of(V->users(), [&](User *U) {
18197 const TreeEntry *TE = getTreeEntry(U);
18198 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18199 if (TE == UserTE || !TE)
18200 return false;
18201 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18202 SelectInst>(U) ||
18203 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18204 SelectInst>(UserTE->getMainOp()))
18205 return true;
18206 unsigned UserTESz = DL->getTypeSizeInBits(
18207 UserTE->Scalars.front()->getType());
18208 auto It = MinBWs.find(TE);
18209 if (It != MinBWs.end() && It->second.first > UserTESz)
18210 return true;
18211 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18212 }));
18213 })) {
18214 ToDemote.push_back(E.Idx);
18215 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18216 auto It = MinBWs.find(UserTE);
18217 if (It != MinBWs.end())
18218 return It->second.first;
18219 unsigned MaxBitWidth =
18220 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18221 MaxBitWidth = bit_ceil(MaxBitWidth);
18222 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18223 MaxBitWidth = 8;
18224 return MaxBitWidth;
18225 }
18226
18227 unsigned VF = E.getVectorFactor();
18228 Type *ScalarTy = E.Scalars.front()->getType();
18229 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18230 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18231 if (!TreeRootIT || !Opcode)
18232 return 0u;
18233
18234 if (any_of(E.Scalars,
18235 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18236 return 0u;
18237
18238 unsigned NumParts = TTI->getNumberOfParts(
18239 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18240
18241 // The maximum bit width required to represent all the values that can be
18242 // demoted without loss of precision. It would be safe to truncate the roots
18243 // of the expression to this width.
18244 unsigned MaxBitWidth = 1u;
18245
18246 // True if the roots can be zero-extended back to their original type,
18247 // rather than sign-extended. We know that if the leading bits are not
18248 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18249 // True.
18250 // Determine if the sign bit of all the roots is known to be zero. If not,
18251 // IsKnownPositive is set to False.
18252 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18253 if (isa<PoisonValue>(R))
18254 return true;
18255 KnownBits Known = computeKnownBits(R, *DL);
18256 return Known.isNonNegative();
18257 });
18258
18259 // We first check if all the bits of the roots are demanded. If they're not,
18260 // we can truncate the roots to this narrower type.
18261 for (Value *Root : E.Scalars) {
18262 if (isa<PoisonValue>(Root))
18263 continue;
18264 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18265 TypeSize NumTypeBits =
18266 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18267 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18268 // If we can't prove that the sign bit is zero, we must add one to the
18269 // maximum bit width to account for the unknown sign bit. This preserves
18270 // the existing sign bit so we can safely sign-extend the root back to the
18271 // original type. Otherwise, if we know the sign bit is zero, we will
18272 // zero-extend the root instead.
18273 //
18274 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18275 // one to the maximum bit width will yield a larger-than-necessary
18276 // type. In general, we need to add an extra bit only if we can't
18277 // prove that the upper bit of the original type is equal to the
18278 // upper bit of the proposed smaller type. If these two bits are
18279 // the same (either zero or one) we know that sign-extending from
18280 // the smaller type will result in the same value. Here, since we
18281 // can't yet prove this, we are just making the proposed smaller
18282 // type larger to ensure correctness.
18283 if (!IsKnownPositive)
18284 ++BitWidth1;
18285
18286 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18287 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18288 MaxBitWidth =
18289 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18290 }
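 // A rough numeric illustration (assumed values): for an i32 root with 25
 // known sign bits, BitWidth1 = 32 - 25 = 7, bumped to 8 if the value is not
 // known to be non-negative; if DemandedBits reports only the low 8 bits are
 // live, BitWidth2 = 8, and MaxBitWidth becomes max(min(8, 8), 1) = 8.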
18291
18292 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18293 MaxBitWidth = 8;
18294
18295 // If the original type is large, but the reduced type does not improve
18296 // register usage - ignore it.
18297 if (NumParts > 1 &&
18298 NumParts ==
18299 TTI->getNumberOfParts(getWidenedType(
18300 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18301 return 0u;
18302
18303 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18304 Opcode == Instruction::SExt ||
18305 Opcode == Instruction::ZExt || NumParts > 1;
18306 // Conservatively determine if we can actually truncate the roots of the
18307 // expression. Collect the values that can be demoted in ToDemote and
18308 // additional roots that require investigating in Roots.
18309 DenseSet<const TreeEntry *> Visited;
18310 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18311 bool NeedToDemote = IsProfitableToDemote;
18312
18313 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18314 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18315 NeedToDemote, IsTruncRoot) ||
18316 (MaxDepthLevel <= Limit &&
18317 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18318 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18319 DL->getTypeSizeInBits(TreeRootIT) /
18320 DL->getTypeSizeInBits(
18321 E.getMainOp()->getOperand(0)->getType()) >
18322 2)))))
18323 return 0u;
18324 // Round MaxBitWidth up to the next power-of-two.
18325 MaxBitWidth = bit_ceil(MaxBitWidth);
18326
18327 return MaxBitWidth;
18328 };
18329
18330 // If we can truncate the root, we must collect additional values that might
18331 // be demoted as a result. That is, those seeded by truncations we will
18332 // modify.
18333 // Add reduction ops sizes, if any.
18334 if (UserIgnoreList &&
18335 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18336 // Convert vector_reduce_add(ZExt(<n x i1>)) to
18337 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
18338 if (all_of(*UserIgnoreList,
18339 [](Value *V) {
18340 return isa<PoisonValue>(V) ||
18341 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18342 }) &&
18343 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18344 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18345 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18346 Builder.getInt1Ty()) {
18347 ReductionBitWidth = 1;
18348 } else {
18349 for (Value *V : *UserIgnoreList) {
18350 if (isa<PoisonValue>(V))
18351 continue;
18352 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18353 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18354 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18355 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18356 ++BitWidth1;
18357 unsigned BitWidth2 = BitWidth1;
18359 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18360 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18361 }
18362 ReductionBitWidth =
18363 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18364 }
18365 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18366 ReductionBitWidth = 8;
18367
18368 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18369 }
18370 }
18371 bool IsTopRoot = NodeIdx == 0;
18372 while (NodeIdx < VectorizableTree.size() &&
18373 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18374 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18375 RootDemotes.push_back(NodeIdx);
18376 ++NodeIdx;
18377 IsTruncRoot = true;
18378 }
18379 bool IsSignedCmp = false;
18380 while (NodeIdx < VectorizableTree.size()) {
18381 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18382 unsigned Limit = 2;
18383 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
18384 if (IsTopRoot &&
18385 ReductionBitWidth ==
18386 DL->getTypeSizeInBits(
18387 VectorizableTree.front()->Scalars.front()->getType()))
18388 Limit = 3;
18389 unsigned MaxBitWidth = ComputeMaxBitWidth(
18390 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
18391 Limit, IsTruncRoot, IsSignedCmp);
18392 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18393 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18394 ReductionBitWidth = bit_ceil(MaxBitWidth);
18395 else if (MaxBitWidth == 0)
18396 ReductionBitWidth = 0;
18397 }
18398
18399 for (unsigned Idx : RootDemotes) {
18400 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18401 uint32_t OrigBitWidth =
18402 DL->getTypeSizeInBits(V->getType()->getScalarType());
18403 if (OrigBitWidth > MaxBitWidth) {
18404 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18405 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18406 }
18407 return false;
18408 }))
18409 ToDemote.push_back(Idx);
18410 }
18411 RootDemotes.clear();
18412 IsTopRoot = false;
18413 IsProfitableToDemoteRoot = true;
18414
18415 if (ExtraBitWidthNodes.empty()) {
18416 NodeIdx = VectorizableTree.size();
18417 } else {
18418 unsigned NewIdx = 0;
18419 do {
18420 NewIdx = *ExtraBitWidthNodes.begin();
18421 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18422 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18423 NodeIdx = NewIdx;
18424 IsTruncRoot =
18425 NodeIdx < VectorizableTree.size() &&
18426 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18427 [](const EdgeInfo &EI) {
18428 return EI.EdgeIdx == 0 &&
18429 EI.UserTE->getOpcode() == Instruction::Trunc &&
18430 !EI.UserTE->isAltShuffle();
18431 });
18432 IsSignedCmp =
18433 NodeIdx < VectorizableTree.size() &&
18434 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18435 [&](const EdgeInfo &EI) {
18436 return EI.UserTE->getOpcode() == Instruction::ICmp &&
18437 any_of(EI.UserTE->Scalars, [&](Value *V) {
18438 auto *IC = dyn_cast<ICmpInst>(V);
18439 return IC &&
18440 (IC->isSigned() ||
18441 !isKnownNonNegative(IC->getOperand(0),
18442 SimplifyQuery(*DL)) ||
18443 !isKnownNonNegative(IC->getOperand(1),
18444 SimplifyQuery(*DL)));
18445 });
18446 });
18447 }
18448
18449 // If the maximum bit width we compute is less than the width of the roots'
18450 // type, we can proceed with the narrowing. Otherwise, do nothing.
18451 if (MaxBitWidth == 0 ||
18452 MaxBitWidth >=
18453 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18454 ->getBitWidth()) {
18455 if (UserIgnoreList)
18456 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18457 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18458 continue;
18459 }
18460
18461 // Finally, map the values we can demote to the maximum bit width we
18462 // computed.
18463 for (unsigned Idx : ToDemote) {
18464 TreeEntry *TE = VectorizableTree[Idx].get();
18465 if (MinBWs.contains(TE))
18466 continue;
18467 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18468 if (isa<PoisonValue>(R))
18469 return false;
18470 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18471 });
18472 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18473 }
18474 }
18475}
18476
18477 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18478 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18479 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18480 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18481 auto *AA = &AM.getResult<AAManager>(F);
18482 auto *LI = &AM.getResult<LoopAnalysis>(F);
18483 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18484 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18485 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18486 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18487
18488 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18489 if (!Changed)
18490 return PreservedAnalyses::all();
18491
18492 PreservedAnalyses PA;
18493 PA.preserveSet<CFGAnalyses>();
18494 return PA;
18495}
18496
18497 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18498 TargetTransformInfo *TTI_,
18499 TargetLibraryInfo *TLI_, AAResults *AA_,
18500 LoopInfo *LI_, DominatorTree *DT_,
18501 AssumptionCache *AC_, DemandedBits *DB_,
18502 OptimizationRemarkEmitter *ORE_) {
18503 if (!RunSLPVectorization)
18504 return false;
18505 SE = SE_;
18506 TTI = TTI_;
18507 TLI = TLI_;
18508 AA = AA_;
18509 LI = LI_;
18510 DT = DT_;
18511 AC = AC_;
18512 DB = DB_;
18513 DL = &F.getDataLayout();
18514
18515 Stores.clear();
18516 GEPs.clear();
18517 bool Changed = false;
18518
18519 // If the target claims to have no vector registers don't attempt
18520 // vectorization.
18521 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18522 LLVM_DEBUG(
18523 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18524 return false;
18525 }
18526
18527 // Don't vectorize when the attribute NoImplicitFloat is used.
18528 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18529 return false;
18530
18531 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18532
18533 // Use the bottom up slp vectorizer to construct chains that start with
18534 // store instructions.
18535 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18536
18537 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18538 // delete instructions.
18539
18540 // Update DFS numbers now so that we can use them for ordering.
18541 DT->updateDFSNumbers();
18542
18543 // Scan the blocks in the function in post order.
18544 for (auto *BB : post_order(&F.getEntryBlock())) {
18545 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18546 continue;
18547
18548 // Start new block - clear the list of reduction roots.
18549 R.clearReductionData();
18550 collectSeedInstructions(BB);
18551
18552 // Vectorize trees that end at stores.
18553 if (!Stores.empty()) {
18554 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18555 << " underlying objects.\n");
18556 Changed |= vectorizeStoreChains(R);
18557 }
18558
18559 // Vectorize trees that end at reductions.
18560 Changed |= vectorizeChainsInBlock(BB, R);
18561
18562 // Vectorize the index computations of getelementptr instructions. This
18563 // is primarily intended to catch gather-like idioms ending at
18564 // non-consecutive loads.
18565 if (!GEPs.empty()) {
18566 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18567 << " underlying objects.\n");
18568 Changed |= vectorizeGEPIndices(BB, R);
18569 }
18570 }
18571
18572 if (Changed) {
18573 R.optimizeGatherSequence();
18574 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18575 }
18576 return Changed;
18577}
18578
18579std::optional<bool>
18580SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18581 unsigned Idx, unsigned MinVF,
18582 unsigned &Size) {
18583 Size = 0;
18584 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18585 << "\n");
18586 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18587 unsigned VF = Chain.size();
18588
18589 if (!has_single_bit(Sz) ||
18590 !hasFullVectorsOrPowerOf2(
18591 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18592 VF) ||
18593 VF < 2 || VF < MinVF) {
18594 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18595 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18596 // all vector lanes are used.
18597 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18598 return false;
18599 }
18600
18601 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18602 << "\n");
18603
18604 SetVector<Value *> ValOps;
18605 for (Value *V : Chain)
18606 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18607 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18608 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18609 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18610 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18611 bool IsAllowedSize =
18612 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18613 ValOps.size()) ||
18614 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18615 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18616 (!S.getMainOp()->isSafeToRemove() ||
18617 any_of(ValOps.getArrayRef(),
18618 [&](Value *V) {
18619 return !isa<ExtractElementInst>(V) &&
18620 (V->getNumUses() > Chain.size() ||
18621 any_of(V->users(), [&](User *U) {
18622 return !Stores.contains(U);
18623 }));
18624 }))) ||
18625 (ValOps.size() > Chain.size() / 2 && !S)) {
18626 Size = (!IsAllowedSize && S) ? 1 : 2;
18627 return false;
18628 }
18629 }
18630 if (R.isLoadCombineCandidate(Chain))
18631 return true;
18632 R.buildTree(Chain);
18633 // Check if the tree is tiny and the store itself or its value is not vectorized.
18634 if (R.isTreeTinyAndNotFullyVectorizable()) {
18635 if (R.isGathered(Chain.front()) ||
18636 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18637 return std::nullopt;
18638 Size = R.getCanonicalGraphSize();
18639 return false;
18640 }
18641 R.reorderTopToBottom();
18642 R.reorderBottomToTop();
18643 R.transformNodes();
18644 R.buildExternalUses();
18645
18646 R.computeMinimumValueSizes();
18647
18648 Size = R.getCanonicalGraphSize();
18649 if (S && S.getOpcode() == Instruction::Load)
18650 Size = 2; // cut off masked gather small trees
18651 InstructionCost Cost = R.getTreeCost();
18652
18653 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18654 if (Cost < -SLPCostThreshold) {
18655 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18656
18657 using namespace ore;
18658
18659 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18660 cast<StoreInst>(Chain[0]))
18661 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18662 << " and with tree size "
18663 << NV("TreeSize", R.getTreeSize()));
18664
18665 R.vectorizeTree();
18666 return true;
18667 }
18668
18669 return false;
18670}
18671
18672/// Checks if the quadratic mean deviation is less than 90% of the mean size.
18673static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18674 bool First) {
18675 unsigned Num = 0;
18676 uint64_t Sum = std::accumulate(
18677 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18678 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18679 unsigned Size = First ? Val.first : Val.second;
18680 if (Size == 1)
18681 return V;
18682 ++Num;
18683 return V + Size;
18684 });
18685 if (Num == 0)
18686 return true;
18687 uint64_t Mean = Sum / Num;
18688 if (Mean == 0)
18689 return true;
18690 uint64_t Dev = std::accumulate(
18691 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18692 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18693 unsigned P = First ? Val.first : Val.second;
18694 if (P == 1)
18695 return V;
18696 return V + (P - Mean) * (P - Mean);
18697 }) /
18698 Num;
18699 return Dev * 81 / (Mean * Mean) == 0;
18700}
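// Worked example (assumed sizes): for tree sizes {4, 4, 3} the entries equal
// to 1 are skipped, Mean = 11 / 3 = 3 (integer division) and
// Dev = ((4-3)^2 + (4-3)^2 + (3-3)^2) / 3 = 0, so Dev * 81 / (Mean * Mean)
// is 0 and the sizes are considered uniform enough. For {4, 4, 2}, Dev = 1
// and 1 * 81 / 9 = 9 != 0, so the check fails.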
18701
18702bool SLPVectorizerPass::vectorizeStores(
18703 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18704 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18705 &Visited) {
18706 // We may run into multiple chains that merge into a single chain. We mark the
18707 // stores that we vectorized so that we don't visit the same store twice.
18708 BoUpSLP::ValueSet VectorizedStores;
18709 bool Changed = false;
18710
18711 struct StoreDistCompare {
18712 bool operator()(const std::pair<unsigned, int> &Op1,
18713 const std::pair<unsigned, int> &Op2) const {
18714 return Op1.second < Op2.second;
18715 }
18716 };
18717 // A set of pairs (index of store in Stores array ref, Distance of the store
18718 // address relative to base store address in units).
18719 using StoreIndexToDistSet =
18720 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18721 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18722 int PrevDist = -1;
18723 SmallVector<Value *> Operands;
18724 // Collect the chain into a list.
18725 for (auto [Idx, Data] : enumerate(Set)) {
18726 if (Operands.empty() || Data.second - PrevDist == 1) {
18727 Operands.push_back(Stores[Data.first]);
18728 PrevDist = Data.second;
18729 if (Idx != Set.size() - 1)
18730 continue;
18731 }
18732 auto E = make_scope_exit([&, &DataVar = Data]() {
18733 Operands.clear();
18734 Operands.push_back(Stores[DataVar.first]);
18735 PrevDist = DataVar.second;
18736 });
18737
18738 if (Operands.size() <= 1 ||
18739 !Visited
18740 .insert({Operands.front(),
18741 cast<StoreInst>(Operands.front())->getValueOperand(),
18742 Operands.back(),
18743 cast<StoreInst>(Operands.back())->getValueOperand(),
18744 Operands.size()})
18745 .second)
18746 continue;
18747
18748 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18749 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18750 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18751
18752 unsigned MaxVF =
18753 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18754 auto *Store = cast<StoreInst>(Operands[0]);
18755 Type *StoreTy = Store->getValueOperand()->getType();
18756 Type *ValueTy = StoreTy;
18757 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18758 ValueTy = Trunc->getSrcTy();
18759 unsigned MinVF = std::max<unsigned>(
18760 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18761 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18762 ValueTy)));
18763
18764 if (MaxVF < MinVF) {
18765 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18766 << ") < "
18767 << "MinVF (" << MinVF << ")\n");
18768 continue;
18769 }
18770
18771 unsigned NonPowerOf2VF = 0;
18772 if (VectorizeNonPowerOf2) {
18773 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18774 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18775 // lanes are used.
18776 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18777 if (has_single_bit(CandVF + 1)) {
18778 NonPowerOf2VF = CandVF;
18779 assert(NonPowerOf2VF != MaxVF &&
18780 "Non-power-of-2 VF should not be equal to MaxVF");
18781 }
18782 }
18783
18784 unsigned MaxRegVF = MaxVF;
18785 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18786 if (MaxVF < MinVF) {
18787 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18788 << ") < "
18789 << "MinVF (" << MinVF << ")\n");
18790 continue;
18791 }
18792
18793 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18794 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18795 unsigned Size = MinVF;
18796 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18797 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18798 Size *= 2;
18799 });
18800 unsigned End = Operands.size();
18801 unsigned Repeat = 0;
18802 constexpr unsigned MaxAttempts = 4;
18803 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18804 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18805 P.first = P.second = 1;
18806 });
18807 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18808 auto IsNotVectorized = [](bool First,
18809 const std::pair<unsigned, unsigned> &P) {
18810 return First ? P.first > 0 : P.second > 0;
18811 };
18812 auto IsVectorized = [](bool First,
18813 const std::pair<unsigned, unsigned> &P) {
18814 return First ? P.first == 0 : P.second == 0;
18815 };
18816 auto VFIsProfitable = [](bool First, unsigned Size,
18817 const std::pair<unsigned, unsigned> &P) {
18818 return First ? Size >= P.first : Size >= P.second;
18819 };
18820 auto FirstSizeSame = [](unsigned Size,
18821 const std::pair<unsigned, unsigned> &P) {
18822 return Size == P.first;
18823 };
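 // In RangeSizes each store carries a pair of counters: the first element is
 // used for attempts with VFs below MaxRegVF and the second for larger VFs.
 // A value of 1 means the store has not been tried yet, 0 means it was
 // already vectorized, and larger values record the tree sizes of previous
 // unsuccessful attempts; the helpers above use them to decide whether a new
 // VF is still worth trying for a given slice.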
18824 while (true) {
18825 ++Repeat;
18826 bool RepeatChanged = false;
18827 bool AnyProfitableGraph = false;
18828 for (unsigned Size : CandidateVFs) {
18829 AnyProfitableGraph = false;
18830 unsigned StartIdx = std::distance(
18831 RangeSizes.begin(),
18832 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18833 std::placeholders::_1)));
18834 while (StartIdx < End) {
18835 unsigned EndIdx =
18836 std::distance(RangeSizes.begin(),
18837 find_if(RangeSizes.drop_front(StartIdx),
18838 std::bind(IsVectorized, Size >= MaxRegVF,
18839 std::placeholders::_1)));
18840 unsigned Sz = EndIdx >= End ? End : EndIdx;
18841 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18842 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18843 Size >= MaxRegVF)) {
18844 ++Cnt;
18845 continue;
18846 }
18847 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18848 assert(all_of(Slice,
18849 [&](Value *V) {
18850 return cast<StoreInst>(V)
18851 ->getValueOperand()
18852 ->getType() ==
18853 cast<StoreInst>(Slice.front())
18854 ->getValueOperand()
18855 ->getType();
18856 }) &&
18857 "Expected all operands of same type.");
18858 if (!NonSchedulable.empty()) {
18859 auto [NonSchedSizeMax, NonSchedSizeMin] =
18860 NonSchedulable.lookup(Slice.front());
18861 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18862 Cnt += NonSchedSizeMax;
18863 continue;
18864 }
18865 }
18866 unsigned TreeSize;
18867 std::optional<bool> Res =
18868 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18869 if (!Res) {
18870 NonSchedulable
18871 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18872 .first->getSecond()
18873 .second = Size;
18874 } else if (*Res) {
18875 // Mark the vectorized stores so that we don't vectorize them
18876 // again.
18877 VectorizedStores.insert(Slice.begin(), Slice.end());
18880 AnyProfitableGraph = RepeatChanged = Changed = true;
18881 // If we vectorized initial block, no need to try to vectorize
18882 // it again.
18883 for_each(RangeSizes.slice(Cnt, Size),
18884 [](std::pair<unsigned, unsigned> &P) {
18885 P.first = P.second = 0;
18886 });
18887 if (Cnt < StartIdx + MinVF) {
18888 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18889 [](std::pair<unsigned, unsigned> &P) {
18890 P.first = P.second = 0;
18891 });
18892 StartIdx = Cnt + Size;
18893 }
18894 if (Cnt > Sz - Size - MinVF) {
18895 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18896 [](std::pair<unsigned, unsigned> &P) {
18897 P.first = P.second = 0;
18898 });
18899 if (Sz == End)
18900 End = Cnt;
18901 Sz = Cnt;
18902 }
18903 Cnt += Size;
18904 continue;
18905 }
18906 if (Size > 2 && Res &&
18907 !all_of(RangeSizes.slice(Cnt, Size),
18908 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18909 std::placeholders::_1))) {
18910 Cnt += Size;
18911 continue;
18912 }
18913 // For very large VFs, check that we are not rebuilding the same
18914 // trees, just with a larger number of elements.
18915 if (Size > MaxRegVF && TreeSize > 1 &&
18916 all_of(RangeSizes.slice(Cnt, Size),
18917 std::bind(FirstSizeSame, TreeSize,
18918 std::placeholders::_1))) {
18919 Cnt += Size;
18920 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18921 ++Cnt;
18922 continue;
18923 }
18924 if (TreeSize > 1)
18925 for_each(RangeSizes.slice(Cnt, Size),
18926 [&](std::pair<unsigned, unsigned> &P) {
18927 if (Size >= MaxRegVF)
18928 P.second = std::max(P.second, TreeSize);
18929 else
18930 P.first = std::max(P.first, TreeSize);
18931 });
18932 ++Cnt;
18933 AnyProfitableGraph = true;
18934 }
18935 if (StartIdx >= End)
18936 break;
18937 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18938 AnyProfitableGraph = true;
18939 StartIdx = std::distance(
18940 RangeSizes.begin(),
18941 find_if(RangeSizes.drop_front(Sz),
18942 std::bind(IsNotVectorized, Size >= MaxRegVF,
18943 std::placeholders::_1)));
18944 }
18945 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18946 break;
18947 }
18948 // All values vectorized - exit.
18949 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18950 return P.first == 0 && P.second == 0;
18951 }))
18952 break;
18953 // Check if tried all attempts or no need for the last attempts at all.
18954 if (Repeat >= MaxAttempts ||
18955 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18956 break;
18957 constexpr unsigned StoresLimit = 64;
18958 const unsigned MaxTotalNum = std::min<unsigned>(
18959 Operands.size(),
18960 static_cast<unsigned>(
18961 End -
18962 std::distance(
18963 RangeSizes.begin(),
18964 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18965 std::placeholders::_1))) +
18966 1));
18967 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18968 unsigned Limit =
18969 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18970 CandidateVFs.clear();
18971 if (bit_floor(Limit) == VF)
18972 CandidateVFs.push_back(Limit);
18973 if (VF > MaxTotalNum || VF >= StoresLimit)
18974 break;
18975 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18976 if (P.first != 0)
18977 P.first = std::max(P.second, P.first);
18978 });
18979 // Last attempt to vectorize max number of elements, if all previous
18980 // attempts were unsuccessful because of the cost issues.
18981 CandidateVFs.push_back(VF);
18982 }
18983 }
18984 };
18985
18986 // Stores a pair (first: index of the store in the Stores array ref whose
18987 // address is taken as the base; second: sorted set of pairs {index, dist},
18988 // which are the indices of stores in the set and their store location
18989 // distances relative to the base address).
18990
18991 // Need to store the index of the very first store separately, since the set
18992 // may be reordered after the insertion and the first store may be moved. This
18993 // container allows us to reduce the number of calls to getPointersDiff().
18994 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
18995 // Inserts the specified store SI with the given index Idx to the set of the
18996 // stores. If the store with the same distance is found already - stop
18997 // insertion, try to vectorize already found stores. If some stores from this
18998 // sequence were not vectorized - try to vectorize them with the new store
18999 // later. But this logic is applied only to the stores that come before the
19000 // previous store with the same distance.
19001 // Example:
19002 // 1. store x, %p
19003 // 2. store y, %p+1
19004 // 3. store z, %p+2
19005 // 4. store a, %p
19006 // 5. store b, %p+3
19007 // - Scan this from the last to first store. The very first bunch of stores is
19008 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
19009 // vector).
19010 // - The next store in the list - #1 - has the same distance from store #5 as
19011 // the store #4.
19012 // - Try to vectorize sequence of stores 4,2,3,5.
19013 // - If all these stores are vectorized - just drop them.
19014 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
19015 // - Start new stores sequence.
19016 // The new bunch of stores is {1, {1, 0}}.
19017 // - Add the stores from the previous sequence that were not vectorized.
19018 // Here we consider the stores in reversed order relative to how they appear
19019 // in the IR (Stores are reversed already, see vectorizeStoreChains()).
19020 // Store #3 can be added -> comes after store #4 with the same distance as
19021 // store #1.
19022 // Store #5 cannot be added - comes before store #4.
19023 // This logic improves compile time: we assume that the stores that follow a
19024 // previous store with the same distance most likely have memory dependencies,
19025 // so there is no need to waste compile time trying to vectorize them.
19026 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19027 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19028 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19029 std::optional<int> Diff = getPointersDiff(
19030 Stores[Set.first]->getValueOperand()->getType(),
19031 Stores[Set.first]->getPointerOperand(),
19032 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
19033 /*StrictCheck=*/true);
19034 if (!Diff)
19035 continue;
19036 auto It = Set.second.find(std::make_pair(Idx, *Diff));
19037 if (It == Set.second.end()) {
19038 Set.second.emplace(Idx, *Diff);
19039 return;
19040 }
19041 // Try to vectorize the first found set to avoid duplicate analysis.
19042 TryToVectorize(Set.second);
19043 unsigned ItIdx = It->first;
19044 int ItDist = It->second;
19045 StoreIndexToDistSet PrevSet;
19046 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19047 [&](const std::pair<unsigned, int> &Pair) {
19048 return Pair.first > ItIdx;
19049 });
19050 Set.second.clear();
19051 Set.first = Idx;
19052 Set.second.emplace(Idx, 0);
19053 // Insert stores that followed previous match to try to vectorize them
19054 // with this store.
19055 unsigned StartIdx = ItIdx + 1;
19056 SmallBitVector UsedStores(Idx - StartIdx);
19057 // Distances to previously found dup store (or this store, since they
19058 // store to the same addresses).
19059 SmallVector<int> Dists(Idx - StartIdx, 0);
19060 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19061 // Do not try to vectorize sequences, we already tried.
19062 if (VectorizedStores.contains(Stores[Pair.first]))
19063 break;
19064 unsigned BI = Pair.first - StartIdx;
19065 UsedStores.set(BI);
19066 Dists[BI] = Pair.second - ItDist;
19067 }
19068 for (unsigned I = StartIdx; I < Idx; ++I) {
19069 unsigned BI = I - StartIdx;
19070 if (UsedStores.test(BI))
19071 Set.second.emplace(I, Dists[BI]);
19072 }
19073 return;
19074 }
19075 auto &Res = SortedStores.emplace_back();
19076 Res.first = Idx;
19077 Res.second.emplace(Idx, 0);
19078 };
19079 Type *PrevValTy = nullptr;
19080 for (auto [I, SI] : enumerate(Stores)) {
19081 if (R.isDeleted(SI))
19082 continue;
19083 if (!PrevValTy)
19084 PrevValTy = SI->getValueOperand()->getType();
19085 // Check that we do not try to vectorize stores of different types.
19086 if (PrevValTy != SI->getValueOperand()->getType()) {
19087 for (auto &Set : SortedStores)
19088 TryToVectorize(Set.second);
19089 SortedStores.clear();
19090 PrevValTy = SI->getValueOperand()->getType();
19091 }
19092 FillStoresSet(I, SI);
19093 }
19094
19095 // Final vectorization attempt.
19096 for (auto &Set : SortedStores)
19097 TryToVectorize(Set.second);
19098
19099 return Changed;
19100}
19101
19102void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19103 // Initialize the collections. We will make a single pass over the block.
19104 Stores.clear();
19105 GEPs.clear();
19106
19107 // Visit the store and getelementptr instructions in BB and organize them in
19108 // Stores and GEPs according to the underlying objects of their pointer
19109 // operands.
19110 for (Instruction &I : *BB) {
19111 // Ignore store instructions that are volatile or have a pointer operand
19112 // that doesn't point to a scalar type.
19113 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19114 if (!SI->isSimple())
19115 continue;
19116 if (!isValidElementType(SI->getValueOperand()->getType()))
19117 continue;
19118 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19119 }
19120
19121 // Ignore getelementptr instructions that have more than one index, a
19122 // constant index, or a pointer operand that doesn't point to a scalar
19123 // type.
19124 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19125 if (GEP->getNumIndices() != 1)
19126 continue;
19127 Value *Idx = GEP->idx_begin()->get();
19128 if (isa<Constant>(Idx))
19129 continue;
19130 if (!isValidElementType(Idx->getType()))
19131 continue;
19132 if (GEP->getType()->isVectorTy())
19133 continue;
19134 GEPs[GEP->getPointerOperand()].push_back(GEP);
19135 }
19136 }
19137}
19138
19139bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19140 bool MaxVFOnly) {
19141 if (VL.size() < 2)
19142 return false;
19143
19144 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19145 << VL.size() << ".\n");
19146
19147 // Check that all of the parts are instructions of the same type,
19148 // we permit an alternate opcode via InstructionsState.
19149 InstructionsState S = getSameOpcode(VL, *TLI);
19150 if (!S)
19151 return false;
19152
19153 Instruction *I0 = S.getMainOp();
19154 // Make sure invalid types (including vector type) are rejected before
19155 // determining vectorization factor for scalar instructions.
19156 for (Value *V : VL) {
19157 Type *Ty = V->getType();
19158 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19159 // NOTE: the following will give the user an internal LLVM type name, which
19160 // may not be useful.
19161 R.getORE()->emit([&]() {
19162 std::string TypeStr;
19163 llvm::raw_string_ostream rso(TypeStr);
19164 Ty->print(rso);
19165 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19166 << "Cannot SLP vectorize list: type "
19167 << TypeStr + " is unsupported by vectorizer";
19168 });
19169 return false;
19170 }
19171 }
19172
19173 unsigned Sz = R.getVectorElementSize(I0);
19174 unsigned MinVF = R.getMinVF(Sz);
19175 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19176 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19177 if (MaxVF < 2) {
19178 R.getORE()->emit([&]() {
19179 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19180 << "Cannot SLP vectorize list: vectorization factor "
19181 << "less than 2 is not supported";
19182 });
19183 return false;
19184 }
19185
19186 bool Changed = false;
19187 bool CandidateFound = false;
19188 InstructionCost MinCost = SLPCostThreshold.getValue();
19189 Type *ScalarTy = getValueType(VL[0]);
19190
19191 unsigned NextInst = 0, MaxInst = VL.size();
19192 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19193 // No actual vectorization should happen, if number of parts is the same as
19194 // provided vectorization factor (i.e. the scalar type is used for vector
19195 // code during codegen).
19196 auto *VecTy = getWidenedType(ScalarTy, VF);
19197 if (TTI->getNumberOfParts(VecTy) == VF)
19198 continue;
19199 for (unsigned I = NextInst; I < MaxInst; ++I) {
19200 unsigned ActualVF = std::min(MaxInst - I, VF);
19201
19202 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19203 continue;
19204
19205 if (MaxVFOnly && ActualVF < MaxVF)
19206 break;
19207 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
19208 break;
19209
19210 SmallVector<Value *> Ops(ActualVF, nullptr);
19211 unsigned Idx = 0;
19212 for (Value *V : VL.drop_front(I)) {
19213 // Check that a previous iteration of this loop did not delete the
19214 // Value.
19215 if (auto *Inst = dyn_cast<Instruction>(V);
19216 !Inst || !R.isDeleted(Inst)) {
19217 Ops[Idx] = V;
19218 ++Idx;
19219 if (Idx == ActualVF)
19220 break;
19221 }
19222 }
19223 // Not enough vectorizable instructions - exit.
19224 if (Idx != ActualVF)
19225 break;
19226
19227 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19228 << "\n");
19229
19230 R.buildTree(Ops);
19231 if (R.isTreeTinyAndNotFullyVectorizable())
19232 continue;
19233 R.reorderTopToBottom();
19234 R.reorderBottomToTop(
19235 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19236 !R.doesRootHaveInTreeUses());
19237 R.transformNodes();
19238 R.buildExternalUses();
19239
19240 R.computeMinimumValueSizes();
19241 InstructionCost Cost = R.getTreeCost();
19242 CandidateFound = true;
19243 MinCost = std::min(MinCost, Cost);
19244
19245 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19246 << " for VF=" << ActualVF << "\n");
19247 if (Cost < -SLPCostThreshold) {
19248 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19249 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19250 cast<Instruction>(Ops[0]))
19251 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19252 << " and with tree size "
19253 << ore::NV("TreeSize", R.getTreeSize()));
19254
19255 R.vectorizeTree();
19256 // Move to the next bundle.
19257 I += VF - 1;
19258 NextInst = I + 1;
19259 Changed = true;
19260 }
19261 }
19262 }
19263
19264 if (!Changed && CandidateFound) {
19265 R.getORE()->emit([&]() {
19266 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19267 << "List vectorization was possible but not beneficial with cost "
19268 << ore::NV("Cost", MinCost) << " >= "
19269 << ore::NV("Treshold", -SLPCostThreshold);
19270 });
19271 } else if (!Changed) {
19272 R.getORE()->emit([&]() {
19273 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19274 << "Cannot SLP vectorize list: vectorization was impossible"
19275 << " with available vectorization factors";
19276 });
19277 }
19278 return Changed;
19279}
19280
19281bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19282 if (!I)
19283 return false;
19284
19285 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19286 return false;
19287
19288 Value *P = I->getParent();
19289
19290 // Vectorize in current basic block only.
19291 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19292 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19293 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19294 R.isDeleted(Op0) || R.isDeleted(Op1))
19295 return false;
19296
19297 // First collect all possible candidates
19298 SmallVector<std::pair<Value *, Value *>> Candidates;
19299 Candidates.emplace_back(Op0, Op1);
19300
19301 auto *A = dyn_cast<BinaryOperator>(Op0);
19302 auto *B = dyn_cast<BinaryOperator>(Op1);
19303 // Try to skip B.
19304 if (A && B && B->hasOneUse()) {
19305 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19306 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19307 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19308 Candidates.emplace_back(A, B0);
19309 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19310 Candidates.emplace_back(A, B1);
19311 }
19312 // Try to skip A.
19313 if (B && A && A->hasOneUse()) {
19314 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19315 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19316 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19317 Candidates.emplace_back(A0, B);
19318 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19319 Candidates.emplace_back(A1, B);
19320 }
19321
19322 if (Candidates.size() == 1)
19323 return tryToVectorizeList({Op0, Op1}, R);
19324
19325 // We have multiple options. Try to pick the single best.
19326 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19327 if (!BestCandidate)
19328 return false;
19329 return tryToVectorizeList(
19330 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19331}
19332
19333namespace {
19334
19335/// Model horizontal reductions.
19336///
19337/// A horizontal reduction is a tree of reduction instructions that has values
19338/// that can be put into a vector as its leaves. For example:
19339///
19340/// mul mul mul mul
19341/// \ / \ /
19342/// + +
19343/// \ /
19344/// +
19345/// This tree has "mul" as its leaf values and "+" as its reduction
19346/// instructions. A reduction can feed into a store or a binary operation
19347/// feeding a phi.
19348/// ...
19349/// \ /
19350/// +
19351/// |
19352/// phi +=
19353///
19354/// Or:
19355/// ...
19356/// \ /
19357/// +
19358/// |
19359/// *p =
19360///
19361class HorizontalReduction {
19362 using ReductionOpsType = SmallVector<Value *, 16>;
19363 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19364 ReductionOpsListType ReductionOps;
19365 /// List of possibly reduced values.
19366 SmallVector<SmallVector<Value *>> ReducedVals;
19367 /// Maps reduced value to the corresponding reduction operation.
19368 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
19369 WeakTrackingVH ReductionRoot;
19370 /// The type of reduction operation.
19371 RecurKind RdxKind;
19372 /// Checks if the optimization of original scalar identity operations on
19373 /// matched horizontal reductions is enabled and allowed.
19374 bool IsSupportedHorRdxIdentityOp = false;
19375
19376 static bool isCmpSelMinMax(Instruction *I) {
19377 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19378 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
19379 }
19380
19381 // And/or are potentially poison-safe logical patterns like:
19382 // select x, y, false
19383 // select x, true, y
19384 static bool isBoolLogicOp(Instruction *I) {
19385 return isa<SelectInst>(I) &&
19386 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19387 }
19388
19389 /// Checks if instruction is associative and can be vectorized.
19390 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19391 if (Kind == RecurKind::None)
19392 return false;
19393
19394 // Integer ops that map to select instructions or intrinsics are fine.
19395 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19396 isBoolLogicOp(I))
19397 return true;
19398
19399 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19400 // FP min/max are associative except for NaN and -0.0. We do not
19401 // have to rule out -0.0 here because the intrinsic semantics do not
19402 // specify a fixed result for it.
19403 return I->getFastMathFlags().noNaNs();
19404 }
19405
19406 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19407 return true;
19408
19409 return I->isAssociative();
19410 }
19411
19412 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19413 // Poison-safe 'or' takes the form: select X, true, Y
19414 // To make that work with the normal operand processing, we skip the
19415 // true value operand.
19416 // TODO: Change the code and data structures to handle this without a hack.
19417 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19418 return I->getOperand(2);
19419 return I->getOperand(Index);
19420 }
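 // For example, a poison-safe 'or' reduction step looks like
 //   %r = select i1 %x, i1 true, i1 %y
 // and its second reduction operand is %y (operand 2 of the select), not the
 // constant 'true' in operand 1.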
19421
19422 /// Creates reduction operation with the current opcode.
19423 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19424 Value *RHS, const Twine &Name, bool UseSelect) {
19425 switch (Kind) {
19426 case RecurKind::Or: {
19427 if (UseSelect &&
19428 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19429 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19430 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19431 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19432 Name);
19433 }
19434 case RecurKind::And: {
19435 if (UseSelect &&
19436 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19437 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19438 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19439 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19440 Name);
19441 }
19442 case RecurKind::Add:
19443 case RecurKind::Mul:
19444 case RecurKind::Xor:
19445 case RecurKind::FAdd:
19446 case RecurKind::FMul: {
19447 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19448 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19449 Name);
19450 }
19451 case RecurKind::SMax:
19452 case RecurKind::SMin:
19453 case RecurKind::UMax:
19454 case RecurKind::UMin:
19455 if (UseSelect) {
19456 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
19457 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
19458 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19459 }
19460 [[fallthrough]];
19461 case RecurKind::FMax:
19462 case RecurKind::FMin:
19463 case RecurKind::FMaximum:
19464 case RecurKind::FMinimum: {
19465 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
19466 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
19467 }
19468 default:
19469 llvm_unreachable("Unknown reduction operation.");
19470 }
19471 }
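// Illustration of the two emission modes for an integer smax step: with
// UseSelect the builder emits
//   %cmp = icmp sgt i32 %lhs, %rhs
//   %max = select i1 %cmp, i32 %lhs, i32 %rhs
// while without UseSelect it falls through to the llvm.smax intrinsic form.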
19472
19473 /// Creates reduction operation with the current opcode with the IR flags
19474 /// from \p ReductionOps, dropping nuw/nsw flags.
19475 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19476 Value *RHS, const Twine &Name,
19477 const ReductionOpsListType &ReductionOps) {
19478 bool UseSelect = ReductionOps.size() == 2 ||
19479 // Logical or/and.
19480 (ReductionOps.size() == 1 &&
19481 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19482 assert((!UseSelect || ReductionOps.size() != 2 ||
19483 isa<SelectInst>(ReductionOps[1][0])) &&
19484 "Expected cmp + select pairs for reduction");
19485 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19486 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19487 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19488 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19489 /*IncludeWrapFlags=*/false);
19490 propagateIRFlags(Op, ReductionOps[1], nullptr,
19491 /*IncludeWrapFlags=*/false);
19492 return Op;
19493 }
19494 }
19495 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19496 return Op;
19497 }
19498
19499public:
19500 static RecurKind getRdxKind(Value *V) {
19501 auto *I = dyn_cast<Instruction>(V);
19502 if (!I)
19503 return RecurKind::None;
19504 if (match(I, m_Add(m_Value(), m_Value())))
19505 return RecurKind::Add;
19506 if (match(I, m_Mul(m_Value(), m_Value())))
19507 return RecurKind::Mul;
19508 if (match(I, m_And(m_Value(), m_Value())) ||
19509 match(I, m_LogicalAnd(m_Value(), m_Value())))
19510 return RecurKind::And;
19511 if (match(I, m_Or(m_Value(), m_Value())) ||
19512 match(I, m_LogicalOr(m_Value(), m_Value())))
19513 return RecurKind::Or;
19514 if (match(I, m_Xor(m_Value(), m_Value())))
19515 return RecurKind::Xor;
19516 if (match(I, m_FAdd(m_Value(), m_Value())))
19517 return RecurKind::FAdd;
19518 if (match(I, m_FMul(m_Value(), m_Value())))
19519 return RecurKind::FMul;
19520
19521 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19522 return RecurKind::FMax;
19523 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19524 return RecurKind::FMin;
19525
19526 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19527 return RecurKind::FMaximum;
19528 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19529 return RecurKind::FMinimum;
19530 // This matches either cmp+select or intrinsics. SLP is expected to handle
19531 // either form.
19532 // TODO: If we are canonicalizing to intrinsics, we can remove several
19533 // special-case paths that deal with selects.
19534 if (match(I, m_SMax(m_Value(), m_Value())))
19535 return RecurKind::SMax;
19536 if (match(I, m_SMin(m_Value(), m_Value())))
19537 return RecurKind::SMin;
19538 if (match(I, m_UMax(m_Value(), m_Value())))
19539 return RecurKind::UMax;
19540 if (match(I, m_UMin(m_Value(), m_Value())))
19541 return RecurKind::UMin;
19542
19543 if (auto *Select = dyn_cast<SelectInst>(I)) {
19544 // Try harder: look for min/max pattern based on instructions producing
19545 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19546 // During the intermediate stages of SLP, it's very common to have
19547 // pattern like this (since optimizeGatherSequence is run only once
19548 // at the end):
19549 // %1 = extractelement <2 x i32> %a, i32 0
19550 // %2 = extractelement <2 x i32> %a, i32 1
19551 // %cond = icmp sgt i32 %1, %2
19552 // %3 = extractelement <2 x i32> %a, i32 0
19553 // %4 = extractelement <2 x i32> %a, i32 1
19554 // %select = select i1 %cond, i32 %3, i32 %4
19555 CmpPredicate Pred;
19556 Instruction *L1;
19557 Instruction *L2;
19558
19559 Value *LHS = Select->getTrueValue();
19560 Value *RHS = Select->getFalseValue();
19561 Value *Cond = Select->getCondition();
19562
19563 // TODO: Support inverse predicates.
19564 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19565 if (!isa<ExtractElementInst>(RHS) ||
19566 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19567 return RecurKind::None;
19568 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19569 if (!isa<ExtractElementInst>(LHS) ||
19570 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19571 return RecurKind::None;
19572 } else {
19573 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19574 return RecurKind::None;
19575 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19576 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19577 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19578 return RecurKind::None;
19579 }
19580
19581 switch (Pred) {
19582 default:
19583 return RecurKind::None;
19584 case CmpInst::ICMP_SGT:
19585 case CmpInst::ICMP_SGE:
19586 return RecurKind::SMax;
19587 case CmpInst::ICMP_SLT:
19588 case CmpInst::ICMP_SLE:
19589 return RecurKind::SMin;
19590 case CmpInst::ICMP_UGT:
19591 case CmpInst::ICMP_UGE:
19592 return RecurKind::UMax;
19593 case CmpInst::ICMP_ULT:
19594 case CmpInst::ICMP_ULE:
19595 return RecurKind::UMin;
19596 }
19597 }
19598 return RecurKind::None;
19599 }
19600
19601 /// Get the index of the first operand.
19602 static unsigned getFirstOperandIndex(Instruction *I) {
19603 return isCmpSelMinMax(I) ? 1 : 0;
19604 }
19605
19606private:
19607 /// Total number of operands in the reduction operation.
19608 static unsigned getNumberOfOperands(Instruction *I) {
19609 return isCmpSelMinMax(I) ? 3 : 2;
19610 }
19611
19612 /// Checks if the instruction is in basic block \p BB.
19613 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19614 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19615 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19616 auto *Sel = cast<SelectInst>(I);
19617 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19618 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19619 }
19620 return I->getParent() == BB;
19621 }
19622
19623 /// Expected number of uses for reduction operations/reduced values.
19624 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19625 if (IsCmpSelMinMax) {
19626 // SelectInst must be used twice while the condition op must have a
19627 // single use only.
19628 if (auto *Sel = dyn_cast<SelectInst>(I))
19629 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19630 return I->hasNUses(2);
19631 }
19632
19633 // Arithmetic reduction operation must be used once only.
19634 return I->hasOneUse();
19635 }
19636
19637 /// Initializes the list of reduction operations.
19638 void initReductionOps(Instruction *I) {
19639 if (isCmpSelMinMax(I))
19640 ReductionOps.assign(2, ReductionOpsType());
19641 else
19642 ReductionOps.assign(1, ReductionOpsType());
19643 }
19644
19645 /// Add all reduction operations for the reduction instruction \p I.
19646 void addReductionOps(Instruction *I) {
19647 if (isCmpSelMinMax(I)) {
19648 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19649 ReductionOps[1].emplace_back(I);
19650 } else {
19651 ReductionOps[0].emplace_back(I);
19652 }
19653 }
19654
19655 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19656 int Sz = Data.size();
19657 auto *I = dyn_cast<Instruction>(Data.front());
19658 return Sz > 1 || isConstant(Data.front()) ||
19659 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19660 }
19661
19662public:
19663 HorizontalReduction() = default;
19664
19665 /// Try to find a reduction tree.
19666 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19667 ScalarEvolution &SE, const DataLayout &DL,
19668 const TargetLibraryInfo &TLI) {
19669 RdxKind = HorizontalReduction::getRdxKind(Root);
19670 if (!isVectorizable(RdxKind, Root))
19671 return false;
19672
19673 // Analyze "regular" integer/FP types for reductions - no target-specific
19674 // types or pointers.
19675 Type *Ty = Root->getType();
19676 if (!isValidElementType(Ty) || Ty->isPointerTy())
19677 return false;
19678
19679 // Though the ultimate reduction may have multiple uses, its condition must
19680 // have only a single use.
19681 if (auto *Sel = dyn_cast<SelectInst>(Root))
19682 if (!Sel->getCondition()->hasOneUse())
19683 return false;
19684
19685 ReductionRoot = Root;
19686
19687 // Iterate through all the operands of the possible reduction tree and
19688 // gather all the reduced values, sorting them by their value id.
19689 BasicBlock *BB = Root->getParent();
19690 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19691 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19692 1, std::make_pair(Root, 0));
19693 // Checks if the operands of the \p TreeN instruction are also reduction
19694 // operations or should be treated as reduced values or an extra argument,
19695 // which is not part of the reduction.
19696 auto CheckOperands = [&](Instruction *TreeN,
19697 SmallVectorImpl<Value *> &PossibleReducedVals,
19698 SmallVectorImpl<Instruction *> &ReductionOps,
19699 unsigned Level) {
19700 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19701 getNumberOfOperands(TreeN)))) {
19702 Value *EdgeVal = getRdxOperand(TreeN, I);
19703 ReducedValsToOps[EdgeVal].push_back(TreeN);
19704 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19705 // If the edge is not an instruction, differs from the main reduction
19706 // opcode, or has too many uses, treat it as a possible reduced value.
19707 // Also, do not try to reduce constant values if the operation is not
19708 // foldable.
19709 if (!EdgeInst || Level > RecursionMaxDepth ||
19710 getRdxKind(EdgeInst) != RdxKind ||
19711 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19712 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19713 !isVectorizable(RdxKind, EdgeInst) ||
19714 (R.isAnalyzedReductionRoot(EdgeInst) &&
19715 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19716 PossibleReducedVals.push_back(EdgeVal);
19717 continue;
19718 }
19719 ReductionOps.push_back(EdgeInst);
19720 }
19721 };
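// For example, for a chain "((a + b) + c) + d" rooted at the outermost add
// (assuming each inner add has the required single use), CheckOperands pushes
// the inner adds onto the worklist as reduction ops and records a, b, c and d
// as possible reduced values.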
19722 // Try to regroup reduced values so that it gets more profitable to try to
19723 // reduce them. Values are grouped by their value ids, instructions - by
19724 // instruction op id and/or alternate op id, plus do extra analysis for
19725 // loads (grouping them by the distance between pointers) and cmp
19726 // instructions (grouping them by the predicate).
19727 SmallMapVector<
19728 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19729 8>
19730 PossibleReducedVals;
19731 initReductionOps(Root);
19732 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
19733 SmallSet<size_t, 2> LoadKeyUsed;
19734
19735 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19736 Key = hash_combine(hash_value(LI->getParent()), Key);
19737 Value *Ptr =
19738 getUnderlyingObject(LI->getPointerOperand());
19739 if (!LoadKeyUsed.insert(Key).second) {
19740 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19741 if (LIt != LoadsMap.end()) {
19742 for (LoadInst *RLI : LIt->second) {
19743 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19744 LI->getType(), LI->getPointerOperand(), DL, SE,
19745 /*StrictCheck=*/true))
19746 return hash_value(RLI->getPointerOperand());
19747 }
19748 for (LoadInst *RLI : LIt->second) {
19749 if (arePointersCompatible(RLI->getPointerOperand(),
19750 LI->getPointerOperand(), TLI)) {
19751 hash_code SubKey = hash_value(RLI->getPointerOperand());
19752 return SubKey;
19753 }
19754 }
19755 if (LIt->second.size() > 2) {
19756 hash_code SubKey =
19757 hash_value(LIt->second.back()->getPointerOperand());
19758 return SubKey;
19759 }
19760 }
19761 }
19762 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19763 .first->second.push_back(LI);
19764 return hash_value(LI->getPointerOperand());
19765 };
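// The subkey effectively groups loads whose pointers differ by a known
// constant offset (or are otherwise compatible) under one hash, so e.g.
// loads of p[0], p[1], p[2], p[3] land in the same candidate bucket and can
// later be formed into a single vector load.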
19766
19767 while (!Worklist.empty()) {
19768 auto [TreeN, Level] = Worklist.pop_back_val();
19769 SmallVector<Value *> PossibleRedVals;
19770 SmallVector<Instruction *> PossibleReductionOps;
19771 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19772 addReductionOps(TreeN);
19773 // Add reduction values. The values are sorted for better vectorization
19774 // results.
19775 for (Value *V : PossibleRedVals) {
19776 size_t Key, Idx;
19777 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19778 /*AllowAlternate=*/false);
19779 ++PossibleReducedVals[Key][Idx]
19780 .insert(std::make_pair(V, 0))
19781 .first->second;
19782 }
19783 for (Instruction *I : reverse(PossibleReductionOps))
19784 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19785 }
19786 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19787 // Sort values by the total number of value kinds so the reduction starts
19788 // from the longest possible sequences of reduced values.
19789 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19790 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19791 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19792 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19793 It != E; ++It) {
19794 PossibleRedValsVect.emplace_back();
19795 auto RedValsVect = It->second.takeVector();
19796 stable_sort(RedValsVect, llvm::less_second());
19797 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19798 PossibleRedValsVect.back().append(Data.second, Data.first);
19799 }
19800 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19801 return P1.size() > P2.size();
19802 });
19803 int NewIdx = -1;
19804 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19805 if (NewIdx < 0 ||
19806 (!isGoodForReduction(Data) &&
19807 (!isa<LoadInst>(Data.front()) ||
19808 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19809 getUnderlyingObject(
19810 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19811 getUnderlyingObject(
19812 cast<LoadInst>(ReducedVals[NewIdx].front())
19813 ->getPointerOperand())))) {
19814 NewIdx = ReducedVals.size();
19815 ReducedVals.emplace_back();
19816 }
19817 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19818 }
19819 }
19820 // Sort the reduced values by number of same/alternate opcode and/or pointer
19821 // operand.
19822 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19823 return P1.size() > P2.size();
19824 });
19825 return true;
19826 }
19827
19828 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19829 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19830 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19831 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19832 constexpr unsigned RegMaxNumber = 4;
19833 constexpr unsigned RedValsMaxNumber = 128;
19834 // If there are a sufficient number of reduction values, reduce
19835 // to a nearby power-of-2. We can safely generate oversized
19836 // vectors and rely on the backend to split them to legal sizes.
19837 if (unsigned NumReducedVals = std::accumulate(
19838 ReducedVals.begin(), ReducedVals.end(), 0,
19839 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19840 if (!isGoodForReduction(Vals))
19841 return Num;
19842 return Num + Vals.size();
19843 });
19844 NumReducedVals < ReductionLimit &&
19845 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19846 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19847 })) {
19848 for (ReductionOpsType &RdxOps : ReductionOps)
19849 for (Value *RdxOp : RdxOps)
19850 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19851 return nullptr;
19852 }
19853
19854 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19855 TargetFolder(DL));
19856 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19857
19858 // Track the reduced values in case they are replaced by extractelement
19859 // instructions because of the vectorization.
19860 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19861 ReducedVals.front().size());
19862
19863 // The compare instruction of a min/max is the insertion point for new
19864 // instructions and may be replaced with a new compare instruction.
19865 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19866 assert(isa<SelectInst>(RdxRootInst) &&
19867 "Expected min/max reduction to have select root instruction");
19868 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19869 assert(isa<Instruction>(ScalarCond) &&
19870 "Expected min/max reduction to have compare condition");
19871 return cast<Instruction>(ScalarCond);
19872 };
19873
19874 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19875 return isBoolLogicOp(cast<Instruction>(V));
19876 });
19877 // Return new VectorizedTree, based on previous value.
19878 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19879 if (VectorizedTree) {
19880 // Update the final value in the reduction.
19881 Builder.SetCurrentDebugLocation(
19882 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19883 if (AnyBoolLogicOp) {
19884 auto It = ReducedValsToOps.find(VectorizedTree);
19885 auto It1 = ReducedValsToOps.find(Res);
19886 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19887 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19888 (It != ReducedValsToOps.end() &&
19889 any_of(It->getSecond(), [&](Instruction *I) {
19890 return isBoolLogicOp(I) &&
19891 getRdxOperand(I, 0) == VectorizedTree;
19892 }))) {
19893 ;
19894 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19895 (It1 != ReducedValsToOps.end() &&
19896 any_of(It1->getSecond(), [&](Instruction *I) {
19897 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19898 }))) {
19899 std::swap(VectorizedTree, Res);
19900 } else {
19901 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19902 }
19903 }
19904
19905 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19906 ReductionOps);
19907 }
19908 // Initialize the final value in the reduction.
19909 return Res;
19910 };
19911 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19912 ReductionOps.front().size());
19913 for (ReductionOpsType &RdxOps : ReductionOps)
19914 for (Value *RdxOp : RdxOps) {
19915 if (!RdxOp)
19916 continue;
19917 IgnoreList.insert(RdxOp);
19918 }
19919 // Intersect the fast-math-flags from all reduction operations.
19920 FastMathFlags RdxFMF;
19921 RdxFMF.set();
19922 for (Value *U : IgnoreList)
19923 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19924 RdxFMF &= FPMO->getFastMathFlags();
19925 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19926
19927 // Need to track reduced vals, they may be changed during vectorization of
19928 // subvectors.
19929 for (ArrayRef<Value *> Candidates : ReducedVals)
19930 for (Value *V : Candidates)
19931 TrackedVals.try_emplace(V, V);
19932
19933 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19934 Value *V) -> unsigned & {
19935 auto *It = MV.find(V);
19936 assert(It != MV.end() && "Unable to find given key.");
19937 return It->second;
19938 };
19939
19940 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19941 // List of the values that were reduced in other trees as part of gather
19942 // nodes and thus requiring extract if fully vectorized in other trees.
19943 SmallPtrSet<Value *, 4> RequiredExtract;
19944 WeakTrackingVH VectorizedTree = nullptr;
19945 bool CheckForReusedReductionOps = false;
19946 // Try to vectorize elements based on their type.
19947 SmallVector<InstructionsState> States;
19948 for (ArrayRef<Value *> RV : ReducedVals)
19949 States.push_back(getSameOpcode(RV, TLI));
19950 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19951 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19952 InstructionsState S = States[I];
19953 SmallVector<Value *> Candidates;
19954 Candidates.reserve(2 * OrigReducedVals.size());
19955 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19956 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19957 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19958 // Check if the reduction value was not overridden by the extractelement
19959 // instruction because of the vectorization and exclude it if it is not
19960 // compatible with other values.
19961 // Also check if the instruction was folded to constant/other value.
19962 auto *Inst = dyn_cast<Instruction>(RdxVal);
19963 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19964 (!S || !S.isOpcodeOrAlt(Inst))) ||
19965 (S && !Inst))
19966 continue;
19967 Candidates.push_back(RdxVal);
19968 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19969 }
19970 bool ShuffledExtracts = false;
19971 // Try to handle shuffled extractelements.
19972 if (S && S.getOpcode() == Instruction::ExtractElement &&
19973 !S.isAltShuffle() && I + 1 < E) {
19974 SmallVector<Value *> CommonCandidates(Candidates);
19975 for (Value *RV : ReducedVals[I + 1]) {
19976 Value *RdxVal = TrackedVals.at(RV);
19977 // Check if the reduction value was not overridden by the
19978 // extractelement instruction because of the vectorization and
19979 // exclude it, if it is not compatible with other values.
19980 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19981 if (!Inst)
19982 continue;
19983 CommonCandidates.push_back(RdxVal);
19984 TrackedToOrig.try_emplace(RdxVal, RV);
19985 }
19986 SmallVector<int> Mask;
19987 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19988 ++I;
19989 Candidates.swap(CommonCandidates);
19990 ShuffledExtracts = true;
19991 }
19992 }
19993
19994 // Emit code for constant values.
19995 if (Candidates.size() > 1 && allConstant(Candidates)) {
19996 Value *Res = Candidates.front();
19997 Value *OrigV = TrackedToOrig.at(Candidates.front());
19998 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19999 for (Value *VC : ArrayRef(Candidates).drop_front()) {
20000 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
20001 Value *OrigV = TrackedToOrig.at(VC);
20002 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20003 if (auto *ResI = dyn_cast<Instruction>(Res))
20004 V.analyzedReductionRoot(ResI);
20005 }
20006 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20007 continue;
20008 }
20009
20010 unsigned NumReducedVals = Candidates.size();
20011 if (NumReducedVals < ReductionLimit &&
20012 (NumReducedVals < 2 || !isSplat(Candidates)))
20013 continue;
20014
20015 // Check if we support repeated scalar values processing (optimization of
20016 // original scalar identity operations on matched horizontal reductions).
20017 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20018 RdxKind != RecurKind::FMul &&
20019 RdxKind != RecurKind::FMulAdd;
20020 // Gather same values.
20021 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20022 if (IsSupportedHorRdxIdentityOp)
20023 for (Value *V : Candidates) {
20024 Value *OrigV = TrackedToOrig.at(V);
20025 ++SameValuesCounter.try_emplace(OrigV).first->second;
20026 }
20027 // Used to check if the reduced values are used the same number of times. In
20028 // this case the compiler may produce better code. E.g. if reduced values are
20029 // aabbccdd (8 x values), then the first node of the tree will have a node
20030 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
20031 // Plus, the final reduction will be performed on <8 x aabbccdd>.
20032 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
20033 // x abcd) * 2.
20034 // Currently it only handles add/fadd/xor. and/or/min/max do not require
20035 // this analysis, other operations may require an extra estimation of
20036 // the profitability.
20037 bool SameScaleFactor = false;
20038 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20039 SameValuesCounter.size() != Candidates.size();
20040 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20041 if (OptReusedScalars) {
20042 SameScaleFactor =
20043 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20044 RdxKind == RecurKind::Xor) &&
20045 all_of(drop_begin(SameValuesCounter),
20046 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20047 return P.second == SameValuesCounter.front().second;
20048 });
20049 Candidates.resize(SameValuesCounter.size());
20050 transform(SameValuesCounter, Candidates.begin(),
20051 [&](const auto &P) { return TrackedVals.at(P.first); });
20052 NumReducedVals = Candidates.size();
20053 // Have a reduction of the same element.
20054 if (NumReducedVals == 1) {
20055 Value *OrigV = TrackedToOrig.at(Candidates.front());
20056 unsigned Cnt = At(SameValuesCounter, OrigV);
20057 Value *RedVal =
20058 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20059 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20060 VectorizedVals.try_emplace(OrigV, Cnt);
20061 ExternallyUsedValues.insert(OrigV);
20062 continue;
20063 }
20064 }
20065
20066 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20067 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20068 const unsigned MaxElts = std::clamp<unsigned>(
20069 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20070 RegMaxNumber * RedValsMaxNumber);
20071
20072 unsigned ReduxWidth = NumReducedVals;
20073 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20074 unsigned NumParts, NumRegs;
20075 Type *ScalarTy = Candidates.front()->getType();
20076 ReduxWidth =
20077 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20078 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20079 NumParts = TTI.getNumberOfParts(Tp);
20080 NumRegs =
20081 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20082 while (NumParts > NumRegs) {
20083 ReduxWidth = bit_floor(ReduxWidth - 1);
20084 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20085 NumParts = TTI.getNumberOfParts(Tp);
20086 NumRegs =
20087 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20088 }
20089 if (NumParts > NumRegs / 2)
20090 ReduxWidth = bit_floor(ReduxWidth);
20091 return ReduxWidth;
20092 };
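// Example of the shrinking loop above: if a 24-element candidate type would
// split into more parts than the target has vector registers, ReduxWidth is
// lowered to bit_floor(23) = 16, then to 8, and so on until the widened type
// fits the register budget.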
20093 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20094 ReduxWidth = GetVectorFactor(ReduxWidth);
20095 ReduxWidth = std::min(ReduxWidth, MaxElts);
20096
20097 unsigned Start = 0;
20098 unsigned Pos = Start;
20099 // Restarts vectorization attempt with lower vector factor.
20100 unsigned PrevReduxWidth = ReduxWidth;
20101 bool CheckForReusedReductionOpsLocal = false;
20102 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20103 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20104 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20105 // Check if any of the reduction ops are gathered. If so, it is worth
20106 // trying again with a smaller number of reduction ops.
20107 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20108 }
20109 ++Pos;
20110 if (Pos < NumReducedVals - ReduxWidth + 1)
20111 return IsAnyRedOpGathered;
20112 Pos = Start;
20113 --ReduxWidth;
20114 if (ReduxWidth > 1)
20115 ReduxWidth = GetVectorFactor(ReduxWidth);
20116 return IsAnyRedOpGathered;
20117 };
20118 bool AnyVectorized = false;
20119 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20120 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20121 ReduxWidth >= ReductionLimit) {
20122 // Dependency in tree of the reduction ops - drop this attempt, try
20123 // later.
20124 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20125 Start == 0) {
20126 CheckForReusedReductionOps = true;
20127 break;
20128 }
20129 PrevReduxWidth = ReduxWidth;
20130 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20131 // Been analyzed already - skip.
20132 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20133 (!has_single_bit(ReduxWidth) &&
20134 (IgnoredCandidates.contains(
20135 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20136 IgnoredCandidates.contains(
20137 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20138 bit_floor(ReduxWidth))))) ||
20139 V.areAnalyzedReductionVals(VL)) {
20140 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20141 continue;
20142 }
20143 // Early exit if any of the reduction values were deleted during
20144 // previous vectorization attempts.
20145 if (any_of(VL, [&V](Value *RedVal) {
20146 auto *RedValI = dyn_cast<Instruction>(RedVal);
20147 if (!RedValI)
20148 return false;
20149 return V.isDeleted(RedValI);
20150 }))
20151 break;
20152 V.buildTree(VL, IgnoreList);
20153 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20154 if (!AdjustReducedVals())
20155 V.analyzedReductionVals(VL);
20156 continue;
20157 }
20158 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20159 if (!AdjustReducedVals())
20160 V.analyzedReductionVals(VL);
20161 continue;
20162 }
20163 V.reorderTopToBottom();
20164 // No need to reorder the root node at all.
20165 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20166 // Keep extracted other reduction values, if they are used in the
20167 // vectorization trees.
20168 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20169 ExternallyUsedValues);
20170 // The reduction root is used as the insertion point for new
20171 // instructions, so set it as externally used to prevent it from being
20172 // deleted.
20173 LocalExternallyUsedValues.insert(ReductionRoot);
20174 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20175 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20176 continue;
20177 for (Value *V : ReducedVals[Cnt])
20178 if (isa<Instruction>(V))
20179 LocalExternallyUsedValues.insert(TrackedVals[V]);
20180 }
20181 if (!IsSupportedHorRdxIdentityOp) {
20182 // Number of uses of the candidates in the vector of values.
20183 assert(SameValuesCounter.empty() &&
20184 "Reused values counter map is not empty");
20185 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20186 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20187 continue;
20188 Value *V = Candidates[Cnt];
20189 Value *OrigV = TrackedToOrig.at(V);
20190 ++SameValuesCounter.try_emplace(OrigV).first->second;
20191 }
20192 }
20193 V.transformNodes();
20194 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20195 // Gather externally used values.
20196 SmallPtrSet<Value *, 4> Visited;
20197 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20198 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20199 continue;
20200 Value *RdxVal = Candidates[Cnt];
20201 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20202 RdxVal = It->second;
20203 if (!Visited.insert(RdxVal).second)
20204 continue;
20205 // Check if the scalar was vectorized as part of the vectorization
20206 // tree but not the top node.
20207 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20208 LocalExternallyUsedValues.insert(RdxVal);
20209 continue;
20210 }
20211 Value *OrigV = TrackedToOrig.at(RdxVal);
20212 unsigned NumOps =
20213 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20214 if (NumOps != ReducedValsToOps.at(OrigV).size())
20215 LocalExternallyUsedValues.insert(RdxVal);
20216 }
20217 // Do not need the list of reused scalars in regular mode anymore.
20218 if (!IsSupportedHorRdxIdentityOp)
20219 SameValuesCounter.clear();
20220 for (Value *RdxVal : VL)
20221 if (RequiredExtract.contains(RdxVal))
20222 LocalExternallyUsedValues.insert(RdxVal);
20223 V.buildExternalUses(LocalExternallyUsedValues);
20224
20225 V.computeMinimumValueSizes();
20226
20227 // Estimate cost.
20228 InstructionCost TreeCost = V.getTreeCost(VL);
20229 InstructionCost ReductionCost =
20230 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20231 InstructionCost Cost = TreeCost + ReductionCost;
20232 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20233 << " for reduction\n");
20234 if (!Cost.isValid())
20235 break;
20236 if (Cost >= -SLPCostThreshold) {
20237 V.getORE()->emit([&]() {
20238 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20239 ReducedValsToOps.at(VL[0]).front())
20240 << "Vectorizing horizontal reduction is possible "
20241 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20242 << " and threshold "
20243 << ore::NV("Threshold", -SLPCostThreshold);
20244 });
20245 if (!AdjustReducedVals()) {
20246 V.analyzedReductionVals(VL);
20247 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20248 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20249 // Add subvectors of VL to the list of the analyzed values.
20250 for (unsigned VF = getFloorFullVectorNumberOfElements(
20251 *TTI, VL.front()->getType(), ReduxWidth - 1);
20252 VF >= ReductionLimit;
20253 VF = getFloorFullVectorNumberOfElements(
20254 *TTI, VL.front()->getType(), VF - 1)) {
20255 if (has_single_bit(VF) &&
20256 V.getCanonicalGraphSize() != V.getTreeSize())
20257 continue;
20258 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20259 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20260 }
20261 }
20262 }
20263 continue;
20264 }
20265
20266 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20267 << Cost << ". (HorRdx)\n");
20268 V.getORE()->emit([&]() {
20269 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20270 ReducedValsToOps.at(VL[0]).front())
20271 << "Vectorized horizontal reduction with cost "
20272 << ore::NV("Cost", Cost) << " and with tree size "
20273 << ore::NV("TreeSize", V.getTreeSize());
20274 });
20275
20276 Builder.setFastMathFlags(RdxFMF);
20277
20278 // Emit a reduction. If the root is a select (min/max idiom), the insert
20279 // point is the compare condition of that select.
20280 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20281 Instruction *InsertPt = RdxRootInst;
20282 if (IsCmpSelMinMax)
20283 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20284
20285 // Vectorize a tree.
20286 Value *VectorizedRoot =
20287 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20288 // Update TrackedToOrig mapping, since the tracked values might be
20289 // updated.
20290 for (Value *RdxVal : Candidates) {
20291 Value *OrigVal = TrackedToOrig.at(RdxVal);
20292 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20293 if (TransformedRdxVal != RdxVal)
20294 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20295 }
20296
20297 Builder.SetInsertPoint(InsertPt);
20298
20299 // To prevent poison from leaking across what used to be sequential,
20300 // safe, scalar boolean logic operations, the reduction operand must be
20301 // frozen.
20302 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20303 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20304
20305 // Emit code to correctly handle reused reduced values, if required.
20306 if (OptReusedScalars && !SameScaleFactor) {
20307 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20308 SameValuesCounter, TrackedToOrig);
20309 }
20310
20311 Value *ReducedSubTree;
20312 Type *ScalarTy = VL.front()->getType();
20313 if (isa<FixedVectorType>(ScalarTy)) {
20314 assert(SLPReVec && "FixedVectorType is not expected.");
20315 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20316 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20317 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20318 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20319 // Do reduction for each lane.
20320 // e.g., do reduce add for
20321 // VL[0] = <4 x Ty> <a, b, c, d>
20322 // VL[1] = <4 x Ty> <e, f, g, h>
20323 // Lane[0] = <2 x Ty> <a, e>
20324 // Lane[1] = <2 x Ty> <b, f>
20325 // Lane[2] = <2 x Ty> <c, g>
20326 // Lane[3] = <2 x Ty> <d, h>
20327 // result[0] = reduce add Lane[0]
20328 // result[1] = reduce add Lane[1]
20329 // result[2] = reduce add Lane[2]
20330 // result[3] = reduce add Lane[3]
20331 SmallVector<int, 16> Mask =
20332 createStrideMask(I, ScalarTyNumElements, VL.size());
20333 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20334 ReducedSubTree = Builder.CreateInsertElement(
20335 ReducedSubTree,
20336 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20337 }
20338 } else {
20339 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20340 RdxRootInst->getType());
20341 }
20342 if (ReducedSubTree->getType() != VL.front()->getType()) {
20343 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20344 "Expected different reduction type.");
20345 ReducedSubTree =
20346 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20347 V.isSignedMinBitwidthRootNode());
20348 }
20349
20350 // Improved analysis for add/fadd/xor reductions with same scale factor
20351 // for all operands of reductions. We can emit scalar ops for them
20352 // instead.
20353 if (OptReusedScalars && SameScaleFactor)
20354 ReducedSubTree = emitScaleForReusedOps(
20355 ReducedSubTree, Builder, SameValuesCounter.front().second);
20356
20357 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20358 // Count vectorized reduced values to exclude them from final reduction.
20359 for (Value *RdxVal : VL) {
20360 Value *OrigV = TrackedToOrig.at(RdxVal);
20361 if (IsSupportedHorRdxIdentityOp) {
20362 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20363 continue;
20364 }
20365 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20366 if (!V.isVectorized(RdxVal))
20367 RequiredExtract.insert(RdxVal);
20368 }
20369 Pos += ReduxWidth;
20370 Start = Pos;
20371 ReduxWidth = NumReducedVals - Pos;
20372 if (ReduxWidth > 1)
20373 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20374 AnyVectorized = true;
20375 }
20376 if (OptReusedScalars && !AnyVectorized) {
20377 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20378 Value *RdxVal = TrackedVals.at(P.first);
20379 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20380 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20381 VectorizedVals.try_emplace(P.first, P.second);
20382 }
20383 continue;
20384 }
20385 }
20386 if (VectorizedTree) {
20387 // Reorder operands of bool logical op in the natural order to avoid
20388 // possible problem with poison propagation. If not possible to reorder
20389 // (both operands are originally RHS), emit an extra freeze instruction
20390 // for the LHS operand.
20391 // I.e., if we have original code like this:
20392 // RedOp1 = select i1 ?, i1 LHS, i1 false
20393 // RedOp2 = select i1 RHS, i1 ?, i1 false
20394
20395 // Then, we swap LHS/RHS to create a new op that matches the poison
20396 // semantics of the original code.
20397
20398 // If we have original code like this and both values could be poison:
20399 // RedOp1 = select i1 ?, i1 LHS, i1 false
20400 // RedOp2 = select i1 ?, i1 RHS, i1 false
20401
20402 // Then, we must freeze LHS in the new op.
20403 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20404 Instruction *RedOp1,
20405 Instruction *RedOp2,
20406 bool InitStep) {
20407 if (!AnyBoolLogicOp)
20408 return;
20409 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20410 getRdxOperand(RedOp1, 0) == LHS ||
20411 isGuaranteedNotToBePoison(LHS, AC)))
20412 return;
20413 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20414 getRdxOperand(RedOp2, 0) == RHS ||
20415 isGuaranteedNotToBePoison(RHS, AC))) {
20416 std::swap(LHS, RHS);
20417 return;
20418 }
20419 if (LHS != VectorizedTree)
20420 LHS = Builder.CreateFreeze(LHS);
20421 };
20422 // Finish the reduction.
20423 // Need to add extra arguments and the possible reduction values that
20424 // were not vectorized.
20425 // Try to avoid dependencies between the scalar remainders after
20426 // reductions.
20427 auto FinalGen =
20428 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20429 bool InitStep) {
20430 unsigned Sz = InstVals.size();
20431 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20432 Sz % 2);
20433 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20434 Instruction *RedOp = InstVals[I + 1].first;
20435 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20436 Value *RdxVal1 = InstVals[I].second;
20437 Value *StableRdxVal1 = RdxVal1;
20438 auto It1 = TrackedVals.find(RdxVal1);
20439 if (It1 != TrackedVals.end())
20440 StableRdxVal1 = It1->second;
20441 Value *RdxVal2 = InstVals[I + 1].second;
20442 Value *StableRdxVal2 = RdxVal2;
20443 auto It2 = TrackedVals.find(RdxVal2);
20444 if (It2 != TrackedVals.end())
20445 StableRdxVal2 = It2->second;
20446 // To prevent poison from leaking across what used to be
20447 // sequential, safe, scalar boolean logic operations, the
20448 // reduction operand must be frozen.
20449 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20450 RedOp, InitStep);
20451 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20452 StableRdxVal2, "op.rdx", ReductionOps);
20453 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20454 }
20455 if (Sz % 2 == 1)
20456 ExtraReds[Sz / 2] = InstVals.back();
20457 return ExtraReds;
20458 };
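// FinalGen combines the scalar remainders pairwise with the reduction
// operation (shown here as +): [r0, r1, r2, r3, r4] becomes
// [r0+r1, r2+r3, r4], then [(r0+r1)+(r2+r3), r4], and finally a single
// value, which keeps the dependency chains between the leftovers short.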
20459 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20460 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20461 VectorizedTree);
20462 SmallPtrSet<Value *, 8> Visited;
20463 for (ArrayRef<Value *> Candidates : ReducedVals) {
20464 for (Value *RdxVal : Candidates) {
20465 if (!Visited.insert(RdxVal).second)
20466 continue;
20467 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20468 for (Instruction *RedOp :
20469 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20470 ExtraReductions.emplace_back(RedOp, RdxVal);
20471 }
20472 }
20473 // Iterate through all not-vectorized reduction values/extra arguments.
20474 bool InitStep = true;
20475 while (ExtraReductions.size() > 1) {
20476 SmallVector<std::pair<Instruction *, Value *>> NewReds =
20477 FinalGen(ExtraReductions, InitStep);
20478 ExtraReductions.swap(NewReds);
20479 InitStep = false;
20480 }
20481 VectorizedTree = ExtraReductions.front().second;
20482
20483 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20484
20485 // The original scalar reduction is expected to have no remaining
20486 // uses outside the reduction tree itself. Assert that we got this
20487 // correct, replace internal uses with poison, and mark for eventual
20488 // deletion.
20489#ifndef NDEBUG
20490 SmallSet<Value *, 4> IgnoreSet;
20491 for (ArrayRef<Value *> RdxOps : ReductionOps)
20492 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20493#endif
20494 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20495 for (Value *Ignore : RdxOps) {
20496 if (!Ignore)
20497 continue;
20498#ifndef NDEBUG
20499 for (auto *U : Ignore->users()) {
20500 assert(IgnoreSet.count(U) &&
20501 "All users must be either in the reduction ops list.");
20502 }
20503#endif
20504 if (!Ignore->use_empty()) {
20505 Value *P = PoisonValue::get(Ignore->getType());
20506 Ignore->replaceAllUsesWith(P);
20507 }
20508 }
20509 V.removeInstructionsAndOperands(RdxOps);
20510 }
20511 } else if (!CheckForReusedReductionOps) {
20512 for (ReductionOpsType &RdxOps : ReductionOps)
20513 for (Value *RdxOp : RdxOps)
20514 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20515 }
20516 return VectorizedTree;
20517 }
20518
20519private:
20520 /// Calculate the cost of a reduction.
20521 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20522 ArrayRef<Value *> ReducedVals,
20523 bool IsCmpSelMinMax, FastMathFlags FMF,
20524 const BoUpSLP &R) {
20525 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20526 Type *ScalarTy = ReducedVals.front()->getType();
20527 unsigned ReduxWidth = ReducedVals.size();
20528 FixedVectorType *VectorTy = R.getReductionType();
20529 InstructionCost VectorCost = 0, ScalarCost;
20530 // If all of the reduced values are constant, the vector cost is 0, since
20531 // the reduction value can be calculated at compile time.
20532 bool AllConsts = allConstant(ReducedVals);
20533 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20534 InstructionCost Cost = 0;
20535 // Scalar cost is repeated for N-1 elements.
20536 int Cnt = ReducedVals.size();
20537 for (Value *RdxVal : ReducedVals) {
20538 if (Cnt == 1)
20539 break;
20540 --Cnt;
20541 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20542 Cost += GenCostFn();
20543 continue;
20544 }
20545 InstructionCost ScalarCost = 0;
20546 for (User *U : RdxVal->users()) {
20547 auto *RdxOp = cast<Instruction>(U);
20548 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20549 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20550 continue;
20551 }
20552 ScalarCost = InstructionCost::getInvalid();
20553 break;
20554 }
20555 if (ScalarCost.isValid())
20556 Cost += ScalarCost;
20557 else
20558 Cost += GenCostFn();
20559 }
20560 return Cost;
20561 };
20562 switch (RdxKind) {
20563 case RecurKind::Add:
20564 case RecurKind::Mul:
20565 case RecurKind::Or:
20566 case RecurKind::And:
20567 case RecurKind::Xor:
20568 case RecurKind::FAdd:
20569 case RecurKind::FMul: {
20570 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20571 if (!AllConsts) {
20572 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20573 assert(SLPReVec && "FixedVectorType is not expected.");
20574 unsigned ScalarTyNumElements = VecTy->getNumElements();
20575 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20576 VectorCost += TTI->getShuffleCost(
20577 TTI::SK_PermuteSingleSrc, VectorTy,
20578 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20579 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20580 CostKind);
20581 }
20582 VectorCost += TTI->getScalarizationOverhead(
20583 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20584 /*Extract*/ false, TTI::TCK_RecipThroughput);
20585 } else {
20586 Type *RedTy = VectorTy->getElementType();
20587 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20588 std::make_pair(RedTy, true));
20589 if (RType == RedTy) {
20590 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20591 FMF, CostKind);
20592 } else {
20593 VectorCost = TTI->getExtendedReductionCost(
20594 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20595 FMF, CostKind);
20596 }
20597 }
20598 }
20599 ScalarCost = EvaluateScalarCost([&]() {
20600 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20601 });
20602 break;
20603 }
20604 case RecurKind::FMax:
20605 case RecurKind::FMin:
20606 case RecurKind::FMaximum:
20607 case RecurKind::FMinimum:
20608 case RecurKind::SMax:
20609 case RecurKind::SMin:
20610 case RecurKind::UMax:
20611 case RecurKind::UMin: {
20612 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
20613 if (!AllConsts)
20614 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20615 ScalarCost = EvaluateScalarCost([&]() {
20616 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20617 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20618 });
20619 break;
20620 }
20621 default:
20622 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20623 }
20624
20625 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20626 << " for reduction of " << shortBundleName(ReducedVals)
20627 << " (It is a splitting reduction)\n");
20628 return VectorCost - ScalarCost;
20629 }
20630
20631 /// Emit a horizontal reduction of the vectorized value.
20632 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20633 const TargetTransformInfo *TTI, Type *DestTy) {
20634 assert(VectorizedValue && "Need to have a vectorized tree node");
20635 assert(RdxKind != RecurKind::FMulAdd &&
20636 "A call to the llvm.fmuladd intrinsic is not handled yet");
20637
20638 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20639 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20640 RdxKind == RecurKind::Add &&
20641 DestTy->getScalarType() != FTy->getScalarType()) {
20642 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20643 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
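// For example, an i1 add-reduction of <8 x i1> %v becomes roughly:
//   %bits = bitcast <8 x i1> %v to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %bits)
// with the zext/trunc to DestTy left to the caller.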
20644 Value *V = Builder.CreateBitCast(
20645 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20646 ++NumVectorInstructions;
20647 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20648 }
20649 ++NumVectorInstructions;
20650 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20651 }
20652
20653 /// Emits optimized code for unique scalar value reused \p Cnt times.
20654 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20655 unsigned Cnt) {
20656 assert(IsSupportedHorRdxIdentityOp &&
20657 "The optimization of matched scalar identity horizontal reductions "
20658 "must be supported.");
20659 if (Cnt == 1)
20660 return VectorizedValue;
20661 switch (RdxKind) {
20662 case RecurKind::Add: {
20663 // res = mul vv, n
20664 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20665 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20666 << VectorizedValue << ". (HorRdx)\n");
20667 return Builder.CreateMul(VectorizedValue, Scale);
20668 }
20669 case RecurKind::Xor: {
20670 // res = n % 2 ? 0 : vv
20671 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20672 << ". (HorRdx)\n");
20673 if (Cnt % 2 == 0)
20674 return Constant::getNullValue(VectorizedValue->getType());
20675 return VectorizedValue;
20676 }
20677 case RecurKind::FAdd: {
20678 // res = fmul v, n
20679 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20680 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20681 << VectorizedValue << ". (HorRdx)\n");
20682 return Builder.CreateFMul(VectorizedValue, Scale);
20683 }
20684 case RecurKind::And:
20685 case RecurKind::Or:
20686 case RecurKind::SMax:
20687 case RecurKind::SMin:
20688 case RecurKind::UMax:
20689 case RecurKind::UMin:
20690 case RecurKind::FMax:
20691 case RecurKind::FMin:
20692 case RecurKind::FMaximum:
20693 case RecurKind::FMinimum:
20694 // res = vv
20695 return VectorizedValue;
20696 case RecurKind::Mul:
20697 case RecurKind::FMul:
20698 case RecurKind::FMulAdd:
20699 case RecurKind::IAnyOf:
20700 case RecurKind::FAnyOf:
20701 case RecurKind::IFindLastIV:
20702 case RecurKind::FFindLastIV:
20703 case RecurKind::None:
20704 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20705 }
20706 return nullptr;
20707 }
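// Example: an add reduction of the single value %x repeated 5 times is
// emitted as "mul %x, 5"; for xor, an even repeat count folds to 0 and an
// odd count folds to %x itself.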
20708
20709 /// Emits actual operation for the scalar identity values, found during
20710 /// horizontal reduction analysis.
20711 Value *
20712 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20713 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20714 const DenseMap<Value *, Value *> &TrackedToOrig) {
20715 assert(IsSupportedHorRdxIdentityOp &&
20716 "The optimization of matched scalar identity horizontal reductions "
20717 "must be supported.");
20718 ArrayRef<Value *> VL = R.getRootNodeScalars();
20719 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20720 if (VTy->getElementType() != VL.front()->getType()) {
20721 VectorizedValue = Builder.CreateIntCast(
20722 VectorizedValue,
20723 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20724 R.isSignedMinBitwidthRootNode());
20725 }
20726 switch (RdxKind) {
20727 case RecurKind::Add: {
20728 // root = mul prev_root, <1, 1, n, 1>
20729 SmallVector<Constant *> Vals;
20730 for (Value *V : VL) {
20731 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20732 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20733 }
20734 auto *Scale = ConstantVector::get(Vals);
20735 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20736 << VectorizedValue << ". (HorRdx)\n");
20737 return Builder.CreateMul(VectorizedValue, Scale);
20738 }
20739 case RecurKind::And:
20740 case RecurKind::Or:
20741 // No need for multiple or/and(s).
20742 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20743 << ". (HorRdx)\n");
20744 return VectorizedValue;
20745 case RecurKind::SMax:
20746 case RecurKind::SMin:
20747 case RecurKind::UMax:
20748 case RecurKind::UMin:
20749 case RecurKind::FMax:
20750 case RecurKind::FMin:
20751 case RecurKind::FMaximum:
20752 case RecurKind::FMinimum:
20753 // No need for multiple min/max(s) of the same value.
20754 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20755 << ". (HorRdx)\n");
20756 return VectorizedValue;
20757 case RecurKind::Xor: {
20758 // Replace values with an even number of repeats with 0, since
20759 // x xor x = 0.
20760 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20761 // 7>, if the 4th and 6th elements have an even number of repeats.
20762 SmallVector<int> Mask(
20763 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20764 PoisonMaskElem);
20765 std::iota(Mask.begin(), Mask.end(), 0);
20766 bool NeedShuffle = false;
20767 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20768 Value *V = VL[I];
20769 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20770 if (Cnt % 2 == 0) {
20771 Mask[I] = VF;
20772 NeedShuffle = true;
20773 }
20774 }
20775 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20776 : Mask) dbgs()
20777 << I << " ";
20778 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20779 if (NeedShuffle)
20780 VectorizedValue = Builder.CreateShuffleVector(
20781 VectorizedValue,
20782 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20783 return VectorizedValue;
20784 }
20785 case RecurKind::FAdd: {
20786 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20787 SmallVector<Constant *> Vals;
20788 for (Value *V : VL) {
20789 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20790 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20791 }
20792 auto *Scale = ConstantVector::get(Vals);
20793 return Builder.CreateFMul(VectorizedValue, Scale);
20794 }
20795 case RecurKind::Mul:
20796 case RecurKind::FMul:
20797 case RecurKind::FMulAdd:
20798 case RecurKind::IAnyOf:
20799 case RecurKind::FAnyOf:
20800 case RecurKind::IFindLastIV:
20801 case RecurKind::FFindLastIV:
20802 case RecurKind::None:
20803 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20804 }
20805 return nullptr;
20806 }
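// Example: for an add reduction whose scalars are <a, a, b, c>, the tree is
// built over <a, b, c> and the vectorized result is multiplied element-wise
// by <2, 1, 1> here before the final reduction is emitted.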
20807};
20808} // end anonymous namespace
20809
20810 /// Gets recurrence kind from the specified value.
20811 static RecurKind getRdxKind(Value *V) {
20812 return HorizontalReduction::getRdxKind(V);
20813}
20814static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20815 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20816 return cast<FixedVectorType>(IE->getType())->getNumElements();
20817
20818 unsigned AggregateSize = 1;
20819 auto *IV = cast<InsertValueInst>(InsertInst);
20820 Type *CurrentType = IV->getType();
20821 do {
20822 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20823 for (auto *Elt : ST->elements())
20824 if (Elt != ST->getElementType(0)) // check homogeneity
20825 return std::nullopt;
20826 AggregateSize *= ST->getNumElements();
20827 CurrentType = ST->getElementType(0);
20828 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20829 AggregateSize *= AT->getNumElements();
20830 CurrentType = AT->getElementType();
20831 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20832 AggregateSize *= VT->getNumElements();
20833 return AggregateSize;
20834 } else if (CurrentType->isSingleValueType()) {
20835 return AggregateSize;
20836 } else {
20837 return std::nullopt;
20838 }
20839 } while (true);
20840}
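// For example, an insertvalue chain building {<2 x float>, <2 x float>}
// yields an aggregate size of 2 * 2 = 4, while a struct whose element types
// differ returns std::nullopt.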
20841
20842 static void findBuildAggregate_rec(Instruction *LastInsertInst,
20843 TargetTransformInfo *TTI,
20844 SmallVectorImpl<Value *> &BuildVectorOpds,
20845 SmallVectorImpl<Value *> &InsertElts,
20846 unsigned OperandOffset, const BoUpSLP &R) {
20847 do {
20848 Value *InsertedOperand = LastInsertInst->getOperand(1);
20849 std::optional<unsigned> OperandIndex =
20850 getElementIndex(LastInsertInst, OperandOffset);
20851 if (!OperandIndex || R.isDeleted(LastInsertInst))
20852 return;
20853 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20854 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20855 BuildVectorOpds, InsertElts, *OperandIndex, R);
20856
20857 } else {
20858 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20859 InsertElts[*OperandIndex] = LastInsertInst;
20860 }
20861 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20862 } while (LastInsertInst != nullptr &&
20863 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20864 LastInsertInst->hasOneUse());
20865}
20866
20867/// Recognize construction of vectors like
20868/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20869/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20870/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20871/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20872/// starting from the last insertelement or insertvalue instruction.
20873///
20874/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20875/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20876/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20877///
20878/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20879///
20880/// \return true if it matches.
20881 static bool findBuildAggregate(Instruction *LastInsertInst,
20882 TargetTransformInfo *TTI,
20883 SmallVectorImpl<Value *> &BuildVectorOpds,
20884 SmallVectorImpl<Value *> &InsertElts,
20885 const BoUpSLP &R) {
20886
20887 assert((isa<InsertElementInst>(LastInsertInst) ||
20888 isa<InsertValueInst>(LastInsertInst)) &&
20889 "Expected insertelement or insertvalue instruction!");
20890
20891 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20892 "Expected empty result vectors!");
20893
20894 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20895 if (!AggregateSize)
20896 return false;
20897 BuildVectorOpds.resize(*AggregateSize);
20898 InsertElts.resize(*AggregateSize);
20899
20900 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20901 R);
20902 llvm::erase(BuildVectorOpds, nullptr);
20903 llvm::erase(InsertElts, nullptr);
20904 if (BuildVectorOpds.size() >= 2)
20905 return true;
20906
20907 return false;
20908}
20909
20910/// Try and get a reduction instruction from a phi node.
20911///
20912/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20913/// if they come from either \p ParentBB or a containing loop latch.
20914///
20915/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20916/// if not possible.
20917 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20918 BasicBlock *ParentBB, LoopInfo *LI) {
20919 // There are situations where the reduction value is not dominated by the
20920 // reduction phi. Vectorizing such cases has been reported to cause
20921 // miscompiles. See PR25787.
20922 auto DominatedReduxValue = [&](Value *R) {
20923 return isa<Instruction>(R) &&
20924 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20925 };
20926
20927 Instruction *Rdx = nullptr;
20928
20929 // Return the incoming value if it comes from the same BB as the phi node.
20930 if (P->getIncomingBlock(0) == ParentBB) {
20931 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20932 } else if (P->getIncomingBlock(1) == ParentBB) {
20933 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20934 }
20935
20936 if (Rdx && DominatedReduxValue(Rdx))
20937 return Rdx;
20938
20939 // Otherwise, check whether we have a loop latch to look at.
20940 Loop *BBL = LI->getLoopFor(ParentBB);
20941 if (!BBL)
20942 return nullptr;
20943 BasicBlock *BBLatch = BBL->getLoopLatch();
20944 if (!BBLatch)
20945 return nullptr;
20946
20947 // There is a loop latch, return the incoming value if it comes from
20948 // that. This reduction pattern occasionally turns up.
20949 if (P->getIncomingBlock(0) == BBLatch) {
20950 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20951 } else if (P->getIncomingBlock(1) == BBLatch) {
20952 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20953 }
20954
20955 if (Rdx && DominatedReduxValue(Rdx))
20956 return Rdx;
20957
20958 return nullptr;
20959}
20960
20961static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20962 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20963 return true;
20964 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20965 return true;
20966 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20967 return true;
20968 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20969 return true;
20970 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20971 return true;
20972 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20973 return true;
20974 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20975 return true;
20976 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20977 return true;
20978 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20979 return true;
20980 return false;
20981}
20982
20983/// We could have an initial reduction that is not an add.
20984/// r *= v1 + v2 + v3 + v4
20985/// In such a case start looking for a tree rooted in the first '+'.
20986 /// \returns the new root if found, which may be nullptr if not an instruction.
20987 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20988 Instruction *Root) {
20989 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20990 isa<IntrinsicInst>(Root)) &&
20991 "Expected binop, select, or intrinsic for reduction matching");
20992 Value *LHS =
20993 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20994 Value *RHS =
20995 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20996 if (LHS == Phi)
20997 return dyn_cast<Instruction>(RHS);
20998 if (RHS == Phi)
20999 return dyn_cast<Instruction>(LHS);
21000 return nullptr;
21001}
21002
21003 /// \returns the first operand of \p I that does not match \p Phi. If the
21004 /// operand is not an instruction, it returns nullptr.
21005 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
21006 Value *Op0 = nullptr;
21007 Value *Op1 = nullptr;
21008 if (!matchRdxBop(I, Op0, Op1))
21009 return nullptr;
21010 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21011}
21012
21013 /// \returns true if \p I is a candidate instruction for reduction vectorization.
21014 static bool isReductionCandidate(Instruction *I) {
21015 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
21016 Value *B0 = nullptr, *B1 = nullptr;
21017 bool IsBinop = matchRdxBop(I, B0, B1);
21018 return IsBinop || IsSelect;
21019}
21020
21021bool SLPVectorizerPass::vectorizeHorReduction(
21022 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
21023 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21024 if (!ShouldVectorizeHor)
21025 return false;
21026 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21027
21028 if (Root->getParent() != BB || isa<PHINode>(Root))
21029 return false;
21030
21031 // If we can find a secondary reduction root, use that instead.
21032 auto SelectRoot = [&]() {
21033 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
21034 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
21035 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
21036 return NewRoot;
21037 return Root;
21038 };
21039
21040 // Start the analysis from the Root instruction. If a horizontal reduction is
21041 // found, try to vectorize it. If it is not a horizontal reduction, or
21042 // vectorization is not possible or not effective, and the currently analyzed
21043 // instruction is a binary operation, try to vectorize the operands using
21044 // pre-order DFS traversal order. If the operands were not vectorized, repeat
21045 // the same procedure considering each operand as a possible root of the
21046 // horizontal reduction.
21047 // Interrupt the process if the Root instruction itself was vectorized or all
21048 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
21049 // If a horizontal reduction was not matched or vectorized, we collect
21050 // instructions for possible later vectorization attempts.
21051 std::queue<std::pair<Instruction *, unsigned>> Stack;
21052 Stack.emplace(SelectRoot(), 0);
21053 SmallPtrSet<Value *, 8> VisitedInstrs;
21054 bool Res = false;
21055 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21056 if (R.isAnalyzedReductionRoot(Inst))
21057 return nullptr;
21058 if (!isReductionCandidate(Inst))
21059 return nullptr;
21060 HorizontalReduction HorRdx;
21061 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21062 return nullptr;
21063 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21064 };
21065 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21066 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21067 FutureSeed = getNonPhiOperand(Root, P);
21068 if (!FutureSeed)
21069 return false;
21070 }
21071 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21072 // analysis is done separately.
21073 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21074 PostponedInsts.push_back(FutureSeed);
21075 return true;
21076 };
21077
21078 while (!Stack.empty()) {
21079 Instruction *Inst;
21080 unsigned Level;
21081 std::tie(Inst, Level) = Stack.front();
21082 Stack.pop();
21083 // Do not try to analyze instruction that has already been vectorized.
21084 // This may happen when we vectorize instruction operands on a previous
21085 // iteration, since the stack was populated before that happened.
21086 if (R.isDeleted(Inst))
21087 continue;
21088 if (Value *VectorizedV = TryToReduce(Inst)) {
21089 Res = true;
21090 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21091 // Try to find another reduction.
21092 Stack.emplace(I, Level);
21093 continue;
21094 }
21095 if (R.isDeleted(Inst))
21096 continue;
21097 } else {
21098 // We could not vectorize `Inst` so try to use it as a future seed.
21099 if (!TryAppendToPostponedInsts(Inst)) {
21100 assert(Stack.empty() && "Expected empty stack");
21101 break;
21102 }
21103 }
21104
21105 // Try to vectorize operands.
21106 // Continue analysis for the instruction from the same basic block only to
21107 // save compile time.
21108 if (++Level < RecursionMaxDepth)
21109 for (auto *Op : Inst->operand_values())
21110 if (VisitedInstrs.insert(Op).second)
21111 if (auto *I = dyn_cast<Instruction>(Op))
21112 // Do not try to vectorize CmpInst operands, this is done
21113 // separately.
21114 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21115 !R.isDeleted(I) && I->getParent() == BB)
21116 Stack.emplace(I, Level);
21117 }
21118 return Res;
21119}
21120
21121bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21122 BasicBlock *BB, BoUpSLP &R) {
21123 SmallVector<WeakTrackingVH> PostponedInsts;
21124 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21125 Res |= tryToVectorize(PostponedInsts, R);
21126 return Res;
21127}
21128
21129bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21130 BoUpSLP &R) {
21131 bool Res = false;
21132 for (Value *V : Insts)
21133 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21134 Res |= tryToVectorize(Inst, R);
21135 return Res;
21136}
21137
21138bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21139 BasicBlock *BB, BoUpSLP &R,
21140 bool MaxVFOnly) {
21141 if (!R.canMapToVector(IVI->getType()))
21142 return false;
21143
21144 SmallVector<Value *, 16> BuildVectorOpds;
21145 SmallVector<Value *, 16> BuildVectorInsts;
21146 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21147 return false;
21148
21149 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21150 R.getORE()->emit([&]() {
21151 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21152 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21153 "trying reduction first.";
21154 });
21155 return false;
21156 }
21157 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21158 // Aggregate value is unlikely to be processed in a vector register.
21159 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21160}
21161
21162bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21163 BasicBlock *BB, BoUpSLP &R,
21164 bool MaxVFOnly) {
21165 SmallVector<Value *, 16> BuildVectorInsts;
21166 SmallVector<Value *, 16> BuildVectorOpds;
21168 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21169 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21170 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21171 return false;
21172
21173 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21174 R.getORE()->emit([&]() {
21175 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21176 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21177 "trying reduction first.";
21178 });
21179 return false;
21180 }
21181 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21182 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21183}
21184
21185template <typename T>
21187 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21188 function_ref<bool(T *, T *)> AreCompatible,
21189 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21190 bool MaxVFOnly, BoUpSLP &R) {
21191 bool Changed = false;
21192 // Sort by type, parent, operands.
21193 stable_sort(Incoming, Comparator);
21194
21195 // Try to vectorize elements based on their type.
21196 SmallVector<T *> Candidates;
21197 SmallVector<T *> VL;
21198 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21199 VL.clear()) {
21200 // Look for the next elements with the same type, parent and operand
21201 // kinds.
21202 auto *I = dyn_cast<Instruction>(*IncIt);
21203 if (!I || R.isDeleted(I)) {
21204 ++IncIt;
21205 continue;
21206 }
21207 auto *SameTypeIt = IncIt;
21208 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21209 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21210 AreCompatible(*SameTypeIt, *IncIt))) {
21211 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21212 ++SameTypeIt;
21213 if (I && !R.isDeleted(I))
21214 VL.push_back(cast<T>(I));
21215 }
21216
21217 // Try to vectorize them.
21218 unsigned NumElts = VL.size();
21219 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21220 << NumElts << ")\n");
21221 // The vectorization is a 3-stage attempt:
21222 // 1. Try to vectorize instructions with the same/alternate opcodes, using the
21223 // size of the maximal register at first.
21224 // 2. Try to vectorize the remaining instructions with the same type, if
21225 // possible. This may give better vectorization results than trying to
21226 // vectorize only instructions with the same/alternate opcodes.
21227 // 3. Finally, try to vectorize all instructions with the same/alternate
21228 // opcodes only; this may result in some extra final
21229 // vectorization.
21230 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21231 // Success, start over because instructions might have been changed.
21232 Changed = true;
21233 VL.swap(Candidates);
21234 Candidates.clear();
21235 for (T *V : VL) {
21236 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21237 Candidates.push_back(V);
21238 }
21239 } else {
21240 /// \returns the minimum number of elements that we will attempt to
21241 /// vectorize.
21242 auto GetMinNumElements = [&R](Value *V) {
21243 unsigned EltSize = R.getVectorElementSize(V);
21244 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21245 };
21246 if (NumElts < GetMinNumElements(*IncIt) &&
21247 (Candidates.empty() ||
21248 Candidates.front()->getType() == (*IncIt)->getType())) {
21249 for (T *V : VL) {
21250 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21251 Candidates.push_back(V);
21252 }
21253 }
21254 }
21255 // Final attempt to vectorize instructions with the same types.
21256 if (Candidates.size() > 1 &&
21257 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21258 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21259 // Success, start over because instructions might have been changed.
21260 Changed = true;
21261 } else if (MaxVFOnly) {
21262 // Try to vectorize using small vectors.
21263 SmallVector<T *> VL;
21264 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21265 VL.clear()) {
21266 auto *I = dyn_cast<Instruction>(*It);
21267 if (!I || R.isDeleted(I)) {
21268 ++It;
21269 continue;
21270 }
21271 auto *SameTypeIt = It;
21272 while (SameTypeIt != End &&
21273 (!isa<Instruction>(*SameTypeIt) ||
21274 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21275 AreCompatible(*SameTypeIt, *It))) {
21276 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21277 ++SameTypeIt;
21278 if (I && !R.isDeleted(I))
21279 VL.push_back(cast<T>(I));
21280 }
21281 unsigned NumElts = VL.size();
21282 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21283 /*MaxVFOnly=*/false))
21284 Changed = true;
21285 It = SameTypeIt;
21286 }
21287 }
21288 Candidates.clear();
21289 }
21290
21291 // Start over at the next instruction of a different type (or the end).
21292 IncIt = SameTypeIt;
21293 }
21294 return Changed;
21295}
21296
21297 /// Compare two cmp instructions. If IsCompatibility is true, the function
21298 /// returns true if the 2 cmps have the same/swapped predicates and compatible
21299 /// corresponding operands. If IsCompatibility is false, the function implements
21300 /// a strict weak ordering relation between the two cmp instructions, returning
21301 /// true if the first instruction is "less" than the second, i.e. its predicate
21302 /// is less than the predicate of the second or its operand IDs are less than
21303 /// the operand IDs of the second cmp instruction.
21304template <bool IsCompatibility>
21305static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21306 const DominatorTree &DT) {
21307 assert(isValidElementType(V->getType()) &&
21308 isValidElementType(V2->getType()) &&
21309 "Expected valid element types only.");
21310 if (V == V2)
21311 return IsCompatibility;
21312 auto *CI1 = cast<CmpInst>(V);
21313 auto *CI2 = cast<CmpInst>(V2);
21314 if (CI1->getOperand(0)->getType()->getTypeID() <
21315 CI2->getOperand(0)->getType()->getTypeID())
21316 return !IsCompatibility;
21317 if (CI1->getOperand(0)->getType()->getTypeID() >
21318 CI2->getOperand(0)->getType()->getTypeID())
21319 return false;
21320 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21321 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21322 return !IsCompatibility;
21323 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21324 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21325 return false;
21326 CmpInst::Predicate Pred1 = CI1->getPredicate();
21327 CmpInst::Predicate Pred2 = CI2->getPredicate();
21328 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21329 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21330 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21331 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21332 if (BasePred1 < BasePred2)
21333 return !IsCompatibility;
21334 if (BasePred1 > BasePred2)
21335 return false;
21336 // Compare operands.
21337 bool CI1Preds = Pred1 == BasePred1;
21338 bool CI2Preds = Pred2 == BasePred1;
21339 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21340 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21341 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21342 if (Op1 == Op2)
21343 continue;
21344 if (Op1->getValueID() < Op2->getValueID())
21345 return !IsCompatibility;
21346 if (Op1->getValueID() > Op2->getValueID())
21347 return false;
21348 if (auto *I1 = dyn_cast<Instruction>(Op1))
21349 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21350 if (IsCompatibility) {
21351 if (I1->getParent() != I2->getParent())
21352 return false;
21353 } else {
21354 // Try to compare nodes with same parent.
21355 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21356 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21357 if (!NodeI1)
21358 return NodeI2 != nullptr;
21359 if (!NodeI2)
21360 return false;
21361 assert((NodeI1 == NodeI2) ==
21362 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21363 "Different nodes should have different DFS numbers");
21364 if (NodeI1 != NodeI2)
21365 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21366 }
21367 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21368 if (S && (IsCompatibility || !S.isAltShuffle()))
21369 continue;
21370 if (IsCompatibility)
21371 return false;
21372 if (I1->getOpcode() != I2->getOpcode())
21373 return I1->getOpcode() < I2->getOpcode();
21374 }
21375 }
21376 return IsCompatibility;
21377}
21378
21379template <typename ItT>
21380bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21381 BasicBlock *BB, BoUpSLP &R) {
21382 bool Changed = false;
21383 // Try to find reductions first.
21384 for (CmpInst *I : CmpInsts) {
21385 if (R.isDeleted(I))
21386 continue;
21387 for (Value *Op : I->operands())
21388 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21389 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21390 if (R.isDeleted(I))
21391 break;
21392 }
21393 }
21394 // Try to vectorize operands as vector bundles.
21395 for (CmpInst *I : CmpInsts) {
21396 if (R.isDeleted(I))
21397 continue;
21398 Changed |= tryToVectorize(I, R);
21399 }
21400 // Try to vectorize list of compares.
21401 // Sort by type, compare predicate, etc.
21402 auto CompareSorter = [&](Value *V, Value *V2) {
21403 if (V == V2)
21404 return false;
21405 return compareCmp<false>(V, V2, *TLI, *DT);
21406 };
21407
21408 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21409 if (V1 == V2)
21410 return true;
21411 return compareCmp<true>(V1, V2, *TLI, *DT);
21412 };
21413
21414 SmallVector<Value *> Vals;
21415 for (Instruction *V : CmpInsts)
21416 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21417 Vals.push_back(V);
21418 if (Vals.size() <= 1)
21419 return Changed;
21420 Changed |= tryToVectorizeSequence<Value>(
21421 Vals, CompareSorter, AreCompatibleCompares,
21422 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21423 // Exclude possible reductions from other blocks.
21424 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21425 return any_of(V->users(), [V](User *U) {
21426 auto *Select = dyn_cast<SelectInst>(U);
21427 return Select &&
21428 Select->getParent() != cast<Instruction>(V)->getParent();
21429 });
21430 });
21431 if (ArePossiblyReducedInOtherBlock)
21432 return false;
21433 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21434 },
21435 /*MaxVFOnly=*/true, R);
21436 return Changed;
21437}
21438
21439bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21440 BasicBlock *BB, BoUpSLP &R) {
21441 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21442 "This function only accepts Insert instructions");
21443 bool OpsChanged = false;
21444 SmallVector<WeakTrackingVH> PostponedInsts;
21445 for (auto *I : reverse(Instructions)) {
21446 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21447 if (R.isDeleted(I) || isa<CmpInst>(I))
21448 continue;
21449 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21450 OpsChanged |=
21451 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21452 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21453 OpsChanged |=
21454 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21455 }
21456 // pass2 - try to vectorize reductions only
21457 if (R.isDeleted(I))
21458 continue;
21459 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21460 if (R.isDeleted(I) || isa<CmpInst>(I))
21461 continue;
21462 // pass3 - try to match and vectorize a buildvector sequence.
21463 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21464 OpsChanged |=
21465 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21466 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21467 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21468 /*MaxVFOnly=*/false);
21469 }
21470 }
21471 // Now try to vectorize postponed instructions.
21472 OpsChanged |= tryToVectorize(PostponedInsts, R);
21473
21474 Instructions.clear();
21475 return OpsChanged;
21476}
21477
21478bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21479 bool Changed = false;
21480 SmallVector<Value *, 4> Incoming;
21481 SmallPtrSet<Value *, 16> VisitedInstrs;
21482 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21483 // node. This allows us to better identify the chains that can be
21484 // vectorized.
21485 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21486 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21487 assert(isValidElementType(V1->getType()) &&
21488 isValidElementType(V2->getType()) &&
21489 "Expected vectorizable types only.");
21490 // It is fine to compare type IDs here, since we expect only vectorizable
21491 // types, like ints, floats and pointers; we don't care about other types.
21492 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21493 return true;
21494 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21495 return false;
21496 if (V1->getType()->getScalarSizeInBits() <
21497 V2->getType()->getScalarSizeInBits())
21498 return true;
21499 if (V1->getType()->getScalarSizeInBits() >
21500 V2->getType()->getScalarSizeInBits())
21501 return false;
21502 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21503 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21504 if (Opcodes1.size() < Opcodes2.size())
21505 return true;
21506 if (Opcodes1.size() > Opcodes2.size())
21507 return false;
21508 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21509 {
21510 // Instructions come first.
21511 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21512 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21513 if (I1 && I2) {
21514 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21515 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21516 if (!NodeI1)
21517 return NodeI2 != nullptr;
21518 if (!NodeI2)
21519 return false;
21520 assert((NodeI1 == NodeI2) ==
21521 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21522 "Different nodes should have different DFS numbers");
21523 if (NodeI1 != NodeI2)
21524 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21525 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21526 if (S && !S.isAltShuffle())
21527 continue;
21528 return I1->getOpcode() < I2->getOpcode();
21529 }
21530 if (I1)
21531 return true;
21532 if (I2)
21533 return false;
21534 }
21535 {
21536 // Non-undef constants come next.
21537 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21538 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21539 if (C1 && C2)
21540 continue;
21541 if (C1)
21542 return true;
21543 if (C2)
21544 return false;
21545 }
21546 bool U1 = isa<UndefValue>(Opcodes1[I]);
21547 bool U2 = isa<UndefValue>(Opcodes2[I]);
21548 {
21549 // Non-constant non-instructions come next.
21550 if (!U1 && !U2) {
21551 auto ValID1 = Opcodes1[I]->getValueID();
21552 auto ValID2 = Opcodes2[I]->getValueID();
21553 if (ValID1 == ValID2)
21554 continue;
21555 if (ValID1 < ValID2)
21556 return true;
21557 if (ValID1 > ValID2)
21558 return false;
21559 }
21560 if (!U1)
21561 return true;
21562 if (!U2)
21563 return false;
21564 }
21565 // Undefs come last.
21566 assert(U1 && U2 && "The only thing left should be undef & undef.");
21567 }
21568 return false;
21569 };
21570 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21571 if (V1 == V2)
21572 return true;
21573 if (V1->getType() != V2->getType())
21574 return false;
21575 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21576 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21577 if (Opcodes1.size() != Opcodes2.size())
21578 return false;
21579 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21580 // Undefs are compatible with any other value.
21581 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21582 continue;
21583 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21584 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21585 if (R.isDeleted(I1) || R.isDeleted(I2))
21586 return false;
21587 if (I1->getParent() != I2->getParent())
21588 return false;
21589 if (getSameOpcode({I1, I2}, *TLI))
21590 continue;
21591 return false;
21592 }
21593 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21594 continue;
21595 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21596 return false;
21597 }
21598 return true;
21599 };
21600
21601 bool HaveVectorizedPhiNodes = false;
21602 do {
21603 // Collect the incoming values from the PHIs.
21604 Incoming.clear();
21605 for (Instruction &I : *BB) {
21606 auto *P = dyn_cast<PHINode>(&I);
21607 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21608 break;
21609
21610 // No need to analyze deleted, vectorized and non-vectorizable
21611 // instructions.
21612 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21613 isValidElementType(P->getType()))
21614 Incoming.push_back(P);
21615 }
21616
21617 if (Incoming.size() <= 1)
21618 break;
21619
21620 // Find the corresponding non-phi nodes for better matching when trying to
21621 // build the tree.
21622 for (Value *V : Incoming) {
21623 SmallVectorImpl<Value *> &Opcodes =
21624 PHIToOpcodes.try_emplace(V).first->getSecond();
21625 if (!Opcodes.empty())
21626 continue;
21627 SmallVector<Value *, 4> Nodes(1, V);
21628 SmallPtrSet<PHINode *, 4> Visited;
21629 while (!Nodes.empty()) {
21630 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21631 if (!Visited.insert(PHI).second)
21632 continue;
21633 for (Value *V : PHI->incoming_values()) {
21634 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21635 Nodes.push_back(PHI1);
21636 continue;
21637 }
21638 Opcodes.emplace_back(V);
21639 }
21640 }
21641 }
21642
21643 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21644 Incoming, PHICompare, AreCompatiblePHIs,
21645 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21646 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21647 },
21648 /*MaxVFOnly=*/true, R);
21649 Changed |= HaveVectorizedPhiNodes;
21650 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21651 auto *PHI = dyn_cast<PHINode>(P.first);
21652 return !PHI || R.isDeleted(PHI);
21653 }))
21654 PHIToOpcodes.clear();
21655 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21656 } while (HaveVectorizedPhiNodes);
21657
21658 VisitedInstrs.clear();
21659
21660 InstSetVector PostProcessInserts;
21661 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21662 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
21663 // also vectorizes `PostProcessCmps`.
21664 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21665 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21666 if (VectorizeCmps) {
21667 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21668 PostProcessCmps.clear();
21669 }
21670 PostProcessInserts.clear();
21671 return Changed;
21672 };
21673 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21674 auto IsInPostProcessInstrs = [&](Instruction *I) {
21675 if (auto *Cmp = dyn_cast<CmpInst>(I))
21676 return PostProcessCmps.contains(Cmp);
21677 return isa<InsertElementInst, InsertValueInst>(I) &&
21678 PostProcessInserts.contains(I);
21679 };
21680 // Returns true if `I` is an instruction without users, like a terminator, a
21681 // store, or a function call with an ignored return value. Unused instructions
21682 // are detected based on the instruction type, except for CallInst and InvokeInst.
21683 auto HasNoUsers = [](Instruction *I) {
21684 return I->use_empty() &&
21685 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21686 };
21687 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21688 // Skip instructions with scalable type. The number of elements is unknown at
21689 // compile-time for scalable types.
21690 if (isa<ScalableVectorType>(It->getType()))
21691 continue;
21692
21693 // Skip instructions marked for deletion.
21694 if (R.isDeleted(&*It))
21695 continue;
21696 // We may go through BB multiple times, so skip the ones we have already checked.
21697 if (!VisitedInstrs.insert(&*It).second) {
21698 if (HasNoUsers(&*It) &&
21699 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21700 // We would like to start over since some instructions are deleted
21701 // and the iterator may have become invalid.
21702 Changed = true;
21703 It = BB->begin();
21704 E = BB->end();
21705 }
21706 continue;
21707 }
21708
21709 if (isa<DbgInfoIntrinsic>(It))
21710 continue;
21711
21712 // Try to vectorize reductions that use PHINodes.
21713 if (PHINode *P = dyn_cast<PHINode>(It)) {
21714 // Check that the PHI is a reduction PHI.
21715 if (P->getNumIncomingValues() == 2) {
21716 // Try to match and vectorize a horizontal reduction.
21717 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21718 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21719 Changed = true;
21720 It = BB->begin();
21721 E = BB->end();
21722 continue;
21723 }
21724 }
21725 // Try to vectorize the incoming values of the PHI, to catch reductions
21726 // that feed into PHIs.
21727 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21728 // Skip if the incoming block is the current BB for now. Also, bypass
21729 // unreachable IR for efficiency and to avoid crashing.
21730 // TODO: Collect the skipped incoming values and try to vectorize them
21731 // after processing BB.
21732 if (BB == P->getIncomingBlock(I) ||
21733 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21734 continue;
21735
21736 // Postponed instructions should not be vectorized here, delay their
21737 // vectorization.
21738 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21739 PI && !IsInPostProcessInstrs(PI)) {
21740 bool Res =
21741 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21742 Changed |= Res;
21743 if (Res && R.isDeleted(P)) {
21744 It = BB->begin();
21745 E = BB->end();
21746 break;
21747 }
21748 }
21749 }
21750 continue;
21751 }
21752
21753 if (HasNoUsers(&*It)) {
21754 bool OpsChanged = false;
21755 auto *SI = dyn_cast<StoreInst>(It);
21756 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21757 if (SI) {
21758 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21759 // Try to vectorize the chain in the store, if this is the only store to the
21760 // address in the block.
21761 // TODO: This is just a temporary solution to save compile time. Need
21762 // to investigate if we can safely turn on slp-vectorize-hor-store
21763 // instead to allow lookup for reduction chains in all non-vectorized
21764 // stores (need to check side effects and compile time).
21765 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21766 SI->getValueOperand()->hasOneUse();
21767 }
21768 if (TryToVectorizeRoot) {
21769 for (auto *V : It->operand_values()) {
21770 // Postponed instructions should not be vectorized here, delay their
21771 // vectorization.
21772 if (auto *VI = dyn_cast<Instruction>(V);
21773 VI && !IsInPostProcessInstrs(VI))
21774 // Try to match and vectorize a horizontal reduction.
21775 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21776 }
21777 }
21778 // Start vectorization of post-process list of instructions from the
21779 // top-tree instructions to try to vectorize as many instructions as
21780 // possible.
21781 OpsChanged |=
21782 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21783 if (OpsChanged) {
21784 // We would like to start over since some instructions are deleted
21785 // and the iterator may have become invalid.
21786 Changed = true;
21787 It = BB->begin();
21788 E = BB->end();
21789 continue;
21790 }
21791 }
21792
21793 if (isa<InsertElementInst, InsertValueInst>(It))
21794 PostProcessInserts.insert(&*It);
21795 else if (isa<CmpInst>(It))
21796 PostProcessCmps.insert(cast<CmpInst>(&*It));
21797 }
21798
21799 return Changed;
21800}
21801
21802bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21803 auto Changed = false;
21804 for (auto &Entry : GEPs) {
21805 // If the getelementptr list has fewer than two elements, there's nothing
21806 // to do.
21807 if (Entry.second.size() < 2)
21808 continue;
21809
21810 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21811 << Entry.second.size() << ".\n");
21812
21813 // Process the GEP list in chunks suitable for the target's supported
21814 // vector size. If a vector register can't hold 1 element, we are done. We
21815 // are trying to vectorize the index computations, so the maximum number of
21816 // elements is based on the size of the index expression, rather than the
21817 // size of the GEP itself (the target's pointer size).
21818 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21819 return !R.isDeleted(GEP);
21820 });
21821 if (It == Entry.second.end())
21822 continue;
21823 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21824 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21825 if (MaxVecRegSize < EltSize)
21826 continue;
21827
21828 unsigned MaxElts = MaxVecRegSize / EltSize;
21829 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21830 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21831 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21832
21833 // Initialize a set of candidate getelementptrs. Note that we use a
21834 // SetVector here to preserve program order. If the index computations
21835 // are vectorizable and begin with loads, we want to minimize the chance
21836 // of having to reorder them later.
21837 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21838
21839 // Some of the candidates may have already been vectorized after we
21840 // initially collected them, or their index was optimized to a constant value.
21841 // If so, they are marked as deleted, so remove them from the set of
21842 // candidates.
21843 Candidates.remove_if([&R](Value *I) {
21844 return R.isDeleted(cast<Instruction>(I)) ||
21845 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21846 });
21847
21848 // Remove from the set of candidates all pairs of getelementptrs with
21849 // constant differences. Such getelementptrs are likely not good
21850 // candidates for vectorization in a bottom-up phase since one can be
21851 // computed from the other. We also ensure all candidate getelementptr
21852 // indices are unique.
21853 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21854 auto *GEPI = GEPList[I];
21855 if (!Candidates.count(GEPI))
21856 continue;
21857 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21858 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21859 auto *GEPJ = GEPList[J];
21860 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21861 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21862 Candidates.remove(GEPI);
21863 Candidates.remove(GEPJ);
21864 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21865 Candidates.remove(GEPJ);
21866 }
21867 }
21868 }
21869
21870 // We break out of the above computation as soon as we know there are
21871 // fewer than two candidates remaining.
21872 if (Candidates.size() < 2)
21873 continue;
21874
21875 // Add the single, non-constant index of each candidate to the bundle. We
21876 // ensured the indices met these constraints when we originally collected
21877 // the getelementptrs.
21878 SmallVector<Value *, 16> Bundle(Candidates.size());
21879 auto BundleIndex = 0u;
21880 for (auto *V : Candidates) {
21881 auto *GEP = cast<GetElementPtrInst>(V);
21882 auto *GEPIdx = GEP->idx_begin()->get();
21883 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21884 Bundle[BundleIndex++] = GEPIdx;
21885 }
21886
21887 // Try and vectorize the indices. We are currently only interested in
21888 // gather-like cases of the form:
21889 //
21890 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21891 //
21892 // where the loads of "a", the loads of "b", and the subtractions can be
21893 // performed in parallel. It's likely that detecting this pattern in a
21894 // bottom-up phase will be simpler and less costly than building a
21895 // full-blown top-down phase beginning at the consecutive loads.
21896 Changed |= tryToVectorizeList(Bundle, R);
21897 }
21898 }
21899 return Changed;
21900}
21901
21902bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21903 bool Changed = false;
21904 // Sort by type, base pointer and value operand. Value operands must be
21905 // compatible (have the same opcode, same parent), otherwise it is
21906 // definitely not profitable to try to vectorize them.
21907 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21908 if (V->getValueOperand()->getType()->getTypeID() <
21909 V2->getValueOperand()->getType()->getTypeID())
21910 return true;
21911 if (V->getValueOperand()->getType()->getTypeID() >
21912 V2->getValueOperand()->getType()->getTypeID())
21913 return false;
21914 if (V->getPointerOperandType()->getTypeID() <
21915 V2->getPointerOperandType()->getTypeID())
21916 return true;
21917 if (V->getPointerOperandType()->getTypeID() >
21918 V2->getPointerOperandType()->getTypeID())
21919 return false;
21920 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21921 V2->getValueOperand()->getType()->getScalarSizeInBits())
21922 return true;
21923 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21924 V2->getValueOperand()->getType()->getScalarSizeInBits())
21925 return false;
21926 // UndefValues are compatible with all other values.
21927 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21928 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21929 DomTreeNodeBase<BasicBlock> *NodeI1 =
21930 DT->getNode(I1->getParent());
21931 DomTreeNodeBase<BasicBlock> *NodeI2 =
21932 DT->getNode(I2->getParent());
21933 assert(NodeI1 && "Should only process reachable instructions");
21934 assert(NodeI2 && "Should only process reachable instructions");
21935 assert((NodeI1 == NodeI2) ==
21936 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21937 "Different nodes should have different DFS numbers");
21938 if (NodeI1 != NodeI2)
21939 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21940 return I1->getOpcode() < I2->getOpcode();
21941 }
21942 return V->getValueOperand()->getValueID() <
21943 V2->getValueOperand()->getValueID();
21944 };
21945
21946 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21947 if (V1 == V2)
21948 return true;
21949 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21950 return false;
21951 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21952 return false;
21953 // Undefs are compatible with any other value.
21954 if (isa<UndefValue>(V1->getValueOperand()) ||
21955 isa<UndefValue>(V2->getValueOperand()))
21956 return true;
21957 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21958 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21959 if (I1->getParent() != I2->getParent())
21960 return false;
21961 return getSameOpcode({I1, I2}, *TLI).valid();
21962 }
21963 if (isa<Constant>(V1->getValueOperand()) &&
21964 isa<Constant>(V2->getValueOperand()))
21965 return true;
21966 return V1->getValueOperand()->getValueID() ==
21967 V2->getValueOperand()->getValueID();
21968 };
21969
21970 // Attempt to sort and vectorize each of the store-groups.
21971 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
21972 for (auto &Pair : Stores) {
21973 if (Pair.second.size() < 2)
21974 continue;
21975
21976 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21977 << Pair.second.size() << ".\n");
21978
21979 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21980 continue;
21981
21982 // Reverse stores to do bottom-to-top analysis. This is important if the
21983 // values are stored to the same addresses several times; in this case we need
21984 // to follow the store order (reversed to meet the memory dependencies).
21985 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21986 Pair.second.rend());
21987 Changed |= tryToVectorizeSequence<StoreInst>(
21988 ReversedStores, StoreSorter, AreCompatibleStores,
21989 [&](ArrayRef<StoreInst *> Candidates, bool) {
21990 return vectorizeStores(Candidates, R, Attempted);
21991 },
21992 /*MaxVFOnly=*/false, R);
21993 }
21994 return Changed;
21995}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
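The slp-* flags listed above all follow the same cl::opt declaration/read pattern. A minimal, self-contained sketch of that pattern; the flag name and helper below are hypothetical and not part of this pass:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical knob, shown only to illustrate the declaration/read idiom.
static cl::opt<unsigned> SketchMaxDepth(
    "sketch-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Illustrative recursion limit; not a real SLP flag"));

static bool withinDepthLimit(unsigned Depth) {
  // A cl::opt converts implicitly to its underlying type when read.
  return Depth <= SketchMaxDepth;
}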
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
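The APInt helpers above are typically used as fixed-width lane masks. A small sketch, assuming a hypothetical helper that tracks which of NumLanes lanes still need to be produced:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

// Hypothetical: one bit per lane; clear a bit once that lane is covered.
static APInt remainingLanes(unsigned NumLanes, ArrayRef<unsigned> Covered) {
  APInt Demanded = APInt::getAllOnes(NumLanes); // every lane demanded initially
  for (unsigned Lane : Covered)
    Demanded.clearBit(Lane);                    // lane already has a value
  return Demanded;                              // Demanded.isZero() => done
}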
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
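ArrayRef views are how bundles of scalars are passed around without copying; take_front/drop_front/slice all return sub-views of the same storage. A minimal sketch (splitBundle is a hypothetical helper):

#include "llvm/ADT/ArrayRef.h"
#include <utility>
using namespace llvm;

// Hypothetical: split a bundle view into two halves; no elements are copied.
template <typename T>
static std::pair<ArrayRef<T>, ArrayRef<T>> splitBundle(ArrayRef<T> VL) {
  size_t Half = VL.size() / 2;
  return {VL.take_front(Half), VL.drop_front(Half)};
}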
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1975
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1870
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2112
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1969
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1966
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
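getSwappedPredicate and getInversePredicate also exist as static overloads taking a Predicate, which is convenient when only the predicates of two compares are at hand. A hedged sketch of a compatibility check in that style (the helper name is made up):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Hypothetical: two predicates are "compatible" if they match directly or
// after swapping the operands of one compare.
static bool predicatesCompatible(CmpInst::Predicate P0, CmpInst::Predicate P1) {
  return P0 == P1 || P0 == CmpInst::getSwappedPredicate(P1);
}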
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
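The DenseMap operations above form the usual cache idiom: try_emplace inserts a default slot exactly once and reports whether it did. A small sketch, assuming a hypothetical per-value cache:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Hypothetical cache: compute a value once per key, reuse it afterwards.
static unsigned getOrComputeWidth(DenseMap<Value *, unsigned> &Cache, Value *V) {
  auto [It, Inserted] = Cache.try_emplace(V, 0); // inserts {V, 0} if V is new
  if (Inserted)
    It->second = V->getType()->getScalarSizeInBits(); // placeholder computation
  return It->second;
}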
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1071
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2510
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:530
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1079
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2498
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1814
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2573
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2185
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1873
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:866
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1760
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2403
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:871
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2532
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2448
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1670
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2224
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1833
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2379
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1613
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1403
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2704
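CreateInsertElement and friends are the building blocks of the gather path: scalars that could not be vectorized directly are packed into a vector one lane at a time. A simplified, self-contained sketch of that idiom (buildVectorFromScalars is hypothetical; the real emission goes through the shuffle builder listed earlier):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical sketch of the gather idiom: start from poison and fill one
// lane per scalar with insertelement.
static Value *buildVectorFromScalars(IRBuilderBase &Builder,
                                     ArrayRef<Value *> Scalars) {
  auto *VecTy =
      FixedVectorType::get(Scalars.front()->getType(), Scalars.size());
  Value *Vec = PoisonValue::get(VecTy);
  for (unsigned I = 0, E = Scalars.size(); I != E; ++I)
    Vec = Builder.CreateInsertElement(Vec, Scalars[I], Builder.getInt32(I));
  return Vec;
}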
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:283
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:763
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
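MemoryLocation::get plus a (Batch)AAResults query is the basic dependence test behind the scheduler: may this instruction modify the memory a load reads? A minimal sketch (mayClobberLoad is a hypothetical helper):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical: true if I may write the bytes that LI reads.
static bool mayClobberLoad(BatchAAResults &BatchAA, Instruction *I,
                           LoadInst *LI) {
  MemoryLocation Loc = MemoryLocation::get(LI);
  return isModSet(BatchAA.getModRefInfo(I, Loc));
}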
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
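getSCEV/getMinusSCEV are how pointer distances are reasoned about symbolically: the difference of two pointer SCEVs folding to a constant is the core of any consecutive-access check. A sketch under that assumption (constantPointerDiff is hypothetical):

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <optional>
using namespace llvm;

// Hypothetical: return PtrB - PtrA if SCEV can fold it to a constant.
static std::optional<APInt> constantPointerDiff(ScalarEvolution &SE,
                                                Value *PtrA, Value *PtrB) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff))
    return C->getAPInt();
  return std::nullopt;
}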
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
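The static mask predicates above classify a shuffle mask without materializing a shufflevector, which is how many "is this shuffle actually cheap?" questions are answered. A small sketch (isTrivialMask is a hypothetical name):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical: identity masks need no shuffle at all, reverse masks map to a
// single well-known pattern.
static bool isTrivialMask(ArrayRef<int> Mask, int NumSrcElts) {
  return ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts) ||
         ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
}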
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
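find_first/find_next form the usual set-bit walk over a SmallBitVector, e.g. over an opcode mask for alternate-opcode nodes. A sketch spelling the idiom out (the helper is hypothetical; count() already gives the same answer directly):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

// Hypothetical: visit each set bit in turn; -1 terminates the walk.
static unsigned countSetLanes(const SmallBitVector &Mask) {
  unsigned N = 0;
  for (int I = Mask.find_first(); I != -1; I = Mask.find_next(I))
    ++N;
  return N;
}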
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided loads/stores.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
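The cost-model entry points above all answer the same shape of question: what does this operation cost at TCK_RecipThroughput, on scalars versus on a vector of width VF? A simplified sketch of that comparison (vectorAddLooksProfitable is hypothetical; the real decision also accounts for gathers, extracts and the slp-threshold flag):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical: compare VF scalar adds against one <VF x Ty> vector add.
static bool vectorAddLooksProfitable(const TargetTransformInfo &TTI,
                                     Type *ScalarTy, unsigned VF) {
  auto Kind = TargetTransformInfo::TCK_RecipThroughput;
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, Kind) * VF;
  InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, Kind);
  return VectorCost < ScalarCost;
}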
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed as a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, i.e. the pair deemed to have the best chance of being vectorized.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
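A hedged illustration of the PatternMatch helpers listed above; the helper name and the matched shape are assumptions for the example, not code from this pass.

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Does V look like (zext(A) << C) | zext(B), i.e. two zero-extended values
// glued together with a shift and an or?
static bool looksLikeByteGlue(Value *V) {
  Value *A = nullptr, *B = nullptr;
  const APInt *ShAmt = nullptr;
  return match(V, m_Or(m_Shl(m_ZExt(m_Value(A)), m_APInt(ShAmt)),
                       m_ZExt(m_Value(B))));
}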
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
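A small sketch of this helper in isolation; the two loads, DL and SE are assumed inputs, and the wrapper function is hypothetical.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Are LdA and LdB adjacent accesses of the same element type? A distance of
// exactly 1 element means LdB reads the element right after LdA.
static bool areConsecutive(LoadInst *LdA, LoadInst *LdB, const DataLayout &DL,
                           ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(LdA->getType(), LdA->getPointerOperand(),
                      LdB->getType(), LdB->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}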
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
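For instance, emitting a horizontal integer-add reduction of a vector with this helper might look like the following sketch; the wrapper name is an assumption.

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

// Reduce an <N x i32> vector to a single i32 with vector.reduce.add semantics.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *Vec) {
  return createSimpleReduction(Builder, Vec, RecurKind::Add);
}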
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, ..., Z), where A is the 0-based index of the item in the sequence and B, ..., Z are the values from the original input ranges.
Definition: STLExtras.h:2448
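A short sketch of these range helpers on a plain container; the function and its values are arbitrary examples, not code from this pass.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

static unsigned sumIndicesOfEvenValues(ArrayRef<int> Vals) {
  // all_of/any_of/count_if take the range directly, no begin()/end() needed.
  if (!all_of(Vals, [](int V) { return V >= 0; }))
    return 0;
  unsigned Sum = 0;
  // enumerate pairs each element with its 0-based index.
  for (auto [Idx, V] : enumerate(Vals))
    if (V % 2 == 0)
      Sum += Idx;
  return Sum;
}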
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s.
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value, returning the original object being addressed.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
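These integer helpers show up when choosing vectorization factors; a couple of concrete values as an illustration, with NumScalars as a hypothetical input.

// PowerOf2Ceil(6) == 8, bit_floor(6u) == 4, divideCeil(10, 4) == 3,
// has_single_bit(8u) == true. A typical use: round a scalar count up to the
// next power of two before trying it as a vectorization factor.
unsigned CandidateVF = PowerOf2Ceil(NumScalars);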
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
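Two tiny examples of the mask builders above, with the masks they produce spelled out.

// Select every second element of an 8-wide source: {0, 2, 4, 6}.
SmallVector<int, 16> Strided = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
// Duplicate each of 3 elements twice: {0, 0, 1, 1, 2, 2}.
SmallVector<int, 16> Repl = createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);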
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:255
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that is supposed to be merged into a vector value.
Definition: LoopUtils.cpp:1368
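A sketch of how the two propagation helpers are typically used together once a vector instruction V has been emitted for a scalar bundle VL; V and VL are assumed to be in scope.

// Keep only the IR flags (nsw/nuw/fast-math, ...) and the metadata kinds that
// hold for every scalar in the bundle.
propagateIRFlags(V, VL);
if (auto *VecI = dyn_cast<Instruction>(V))
  propagateMetadata(VecI, VL);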
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1054
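For example, expanding an smax reduction step without a dedicated intrinsic needs the compare predicate this helper provides; a sketch, with Builder, LHS and RHS assumed to be in scope.

// RecurKind::SMax -> ICMP_SGT: smax(a, b) == select(a > b, a, b).
CmpInst::Predicate Pred = getMinMaxReductionPredicate(RecurKind::SMax);
Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS);
Value *Max = Builder.CreateSelect(Cmp, LHS, RHS);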
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
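A sketch of the kind of query the minimum-bitwidth analysis relies on; V and DL are assumed to be in scope and the i16 target width is an arbitrary example.

// N replicated sign bits mean the value fits in (BitWidth - N + 1) signed
// bits, so it can be narrowed to i16 when SignBits >= BitWidth - 15.
unsigned BitWidth = V->getType()->getIntegerBitWidth();
unsigned SignBits = ComputeNumSignBits(V, DL);
bool FitsInI16 = BitWidth <= 16 || SignBits >= BitWidth - 15;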
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the type of C.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
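A sketch of the standard call-widening check built from these two helpers; CI and TLI are assumed to be in scope.

// A call can be widened as a vector intrinsic if it maps to an intrinsic ID
// and that intrinsic has element-wise (trivially vectorizable) semantics.
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
bool CanWidenAsIntrinsic =
    ID != Intrinsic::not_intrinsic && isTriviallyVectorizable(ID);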
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
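A small sketch of combining several keys into one hash, e.g. for a map key made of an opcode plus an operand list; Opcode and Ops are assumed inputs.

// Hash the opcode together with every operand value in Ops.
hash_code Key = hash_combine(Opcode,
                             hash_combine_range(Ops.begin(), Ops.end()));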
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2133
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
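With these GraphTraits/DOTGraphTraits specializations in place, an SLP graph instance R can be rendered directly, roughly what the pass's tree-viewing debug support does; R and F are assumptions here.

// Emits a .dot file for the current tree and opens it with the configured
// graph viewer.
ViewGraph(&R, "slp-tree-" + F.getName());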
Incoming for a lane mask phi as a machine instruction; the incoming register Reg and incoming block Block are taken from the machine instruction.
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector.
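A sketch of querying the vector-function ABI for a 4-lane variant of a call site; the wrapper function and the fixed width of 4 are assumptions for illustration.

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static Function *findFourLaneVariant(CallInst &CI) {
  // Describe the shape we want: same signature, 4 lanes, no global predicate.
  VFShape Shape = VFShape::get(CI.getFunctionType(), ElementCount::getFixed(4),
                               /*HasGlobalPred=*/false);
  // Returns the matching vector variant, or nullptr if none is available.
  return VFDatabase(CI).getVectorizedFunction(Shape);
}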
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.