SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107using namespace std::placeholders;
108
109#define SV_NAME "slp-vectorizer"
110#define DEBUG_TYPE "SLP"
111
112STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
113
114DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
115 "Controls which SLP graphs should be vectorized.");
116
117static cl::opt<bool>
118 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
119 cl::desc("Run the SLP vectorization passes"));
120
121static cl::opt<bool>
122 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
123 cl::desc("Enable vectorization for wider vector utilization"));
124
125static cl::opt<int>
126 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
127 cl::desc("Only vectorize if you gain more than this "
128 "number "));
129
131 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
132 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
134
135static cl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
137 cl::desc("Attempt to vectorize horizontal reductions"));
138
140 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
141 cl::desc(
142 "Attempt to vectorize horizontal reductions feeding into a store"));
143
144static cl::opt<int>
146 cl::desc("Attempt to vectorize for this register size in bits"));
147
150 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
151
152/// Limits the size of scheduling regions in a block.
153/// It avoids long compile times for _very_ large blocks where vector
154/// instructions are spread over a wide range.
155/// This limit is way higher than needed by real-world functions.
156static cl::opt<int>
157ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
158 cl::desc("Limit the size of the SLP scheduling region per block"));
159
161 "slp-min-reg-size", cl::init(128), cl::Hidden,
162 cl::desc("Attempt to vectorize for this register size in bits"));
163
165 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
166 cl::desc("Limit the recursion depth when building a vectorizable tree"));
167
169 "slp-min-tree-size", cl::init(3), cl::Hidden,
170 cl::desc("Only vectorize small trees if they are fully vectorizable"));
171
172// The maximum depth that the look-ahead score heuristic will explore.
173// The higher this value, the higher the compilation time overhead.
175 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for operand reordering scores"));
177
178// The maximum depth that the look-ahead score heuristic will explore
179// when it is probing among candidates for vectorization tree roots.
180// The higher this value, the higher the compilation time overhead. Unlike the
181// similar limit for operand ordering, this limit is used less frequently, so
182// the impact of a higher value is less noticeable.
184 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
185 cl::desc("The maximum look-ahead depth for searching best rooting option"));
186
188 "slp-min-strided-loads", cl::init(2), cl::Hidden,
189 cl::desc("The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
191
193 "slp-max-stride", cl::init(8), cl::Hidden,
194 cl::desc("The maximum stride, considered to be profitable."));
195
196static cl::opt<bool>
197 ViewSLPTree("view-slp-tree", cl::Hidden,
198 cl::desc("Display the SLP trees with Graphviz"));
199
201 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
202 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
203
204// Limit the number of alias checks. The limit is chosen so that
205// it has no negative effect on the llvm benchmarks.
206static const unsigned AliasedCheckLimit = 10;
207
208// Limit of the number of uses for potentially transformed instructions/values,
209// used in checks to avoid compile-time explosion.
210static constexpr int UsesLimit = 64;
211
212// Another limit for the alias checks: The maximum distance between load/store
213// instructions where alias checks are done.
214// This limit is useful for very large basic blocks.
215static const unsigned MaxMemDepDistance = 160;
216
217/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
218/// regions to be handled.
219static const int MinScheduleRegionSize = 16;
220
221/// Maximum allowed number of operands in the PHI nodes.
222static const unsigned MaxPHINumOperands = 128;
223
224/// Predicate for the element types that the SLP vectorizer supports.
225///
226/// The most important thing to filter here are types which are invalid in LLVM
227/// vectors. We also filter target specific types which have absolutely no
228/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
229/// avoids spending time checking the cost model and realizing that they will
230/// be inevitably scalarized.
231static bool isValidElementType(Type *Ty) {
232 // TODO: Support ScalableVectorType.
233 if (SLPReVec && isa<FixedVectorType>(Ty))
234 Ty = Ty->getScalarType();
235 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
236 !Ty->isPPC_FP128Ty();
237}
238
239/// Returns the type of the given value/instruction \p V. If it is a store,
240/// returns the type of its value operand; for a Cmp - the type of the compare
241/// operands; and for an insertelement - the type of the inserted operand.
242/// Otherwise, just the type of the value is returned.
243static Type *getValueType(Value *V) {
244 if (auto *SI = dyn_cast<StoreInst>(V))
245 return SI->getValueOperand()->getType();
246 if (auto *CI = dyn_cast<CmpInst>(V))
247 return CI->getOperand(0)->getType();
248 if (auto *IE = dyn_cast<InsertElementInst>(V))
249 return IE->getOperand(1)->getType();
250 return V->getType();
251}
252
253/// \returns the number of elements for Ty.
254static unsigned getNumElements(Type *Ty) {
255 assert(!isa<ScalableVectorType>(Ty) &&
256 "ScalableVectorType is not supported.");
257 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
258 return VecTy->getNumElements();
259 return 1;
260}
261
262/// \returns the vector type of ScalarTy based on vectorization factor.
263static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
264 return FixedVectorType::get(ScalarTy->getScalarType(),
265 VF * getNumElements(ScalarTy));
266}
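// A quick worked example of the arithmetic above: getWidenedType(<2 x i16>, 4)
// returns <8 x i16>, since the scalar element type is i16 and the widened
// element count is VF * getNumElements(ScalarTy) = 4 * 2.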
267
268/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
269/// which forms a type that \p TTI splits into whole vector types during
270/// legalization.
271static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
272 Type *Ty, unsigned Sz) {
273 if (!isValidElementType(Ty))
274 return bit_ceil(Sz);
275 // Find the number of elements, which forms full vectors.
276 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
277 if (NumParts == 0 || NumParts >= Sz)
278 return bit_ceil(Sz);
279 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
280}
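// Worked example of the rounding above, assuming a hypothetical target where
// <7 x i32> legalizes into NumParts = 2 registers: for Sz = 7 the result is
// bit_ceil(divideCeil(7, 2)) * 2 = bit_ceil(4) * 2 = 8, i.e. two whole
// 4-element registers.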
281
282/// Returns the number of elements of the given type \p Ty, not greater than \p
283/// Sz, which forms a type that \p TTI splits into whole vector types during
284/// legalization.
285static unsigned
286getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
287 unsigned Sz) {
288 if (!isValidElementType(Ty))
289 return bit_floor(Sz);
290 // Find the number of elements, which forms full vectors.
291 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
292 if (NumParts == 0 || NumParts >= Sz)
293 return bit_floor(Sz);
294 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
295 if (RegVF > Sz)
296 return bit_floor(Sz);
297 return (Sz / RegVF) * RegVF;
298}
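// Worked example for the floor variant, under the same hypothetical target as
// above (NumParts = 2 for 7 x i32 elements): RegVF = bit_ceil(divideCeil(7, 2))
// = 4, so the result is (7 / 4) * 4 = 4, the largest whole-register multiple
// that does not exceed 7.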
299
300static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
301 SmallVectorImpl<int> &Mask) {
302 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
303 // But the element has a different meaning for SLP (scalar) and REVEC
304 // (vector). We need to expand Mask into masks which shufflevector can use
305 // directly.
306 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
307 for (unsigned I : seq<unsigned>(Mask.size()))
308 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
309 I * VecTyNumElements, VecTyNumElements)))
310 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
311 : Mask[I] * VecTyNumElements + J;
312 Mask.swap(NewMask);
313}
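// Example of the expansion above: with VecTyNumElements = 2 and Mask = {1, 0},
// NewMask becomes {2, 3, 0, 1}; every scalar index I is widened into the
// element range [Mask[I] * 2, Mask[I] * 2 + 1] of the underlying vectors.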
314
315/// \returns the number of groups of shufflevectors.
316/// A group has the following features:
317/// 1. All of the values in a group are shufflevectors.
318/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
319/// 3. The masks of the shufflevectors together use all of the elements of the source.
320/// e.g., it is 1 group (%0)
321/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
322/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
323/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
324/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325/// it is 2 groups (%3 and %4)
326/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
327/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
328/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
329/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
330/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
331/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
333/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
334/// it is 0 group
335/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
336/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
340 if (VL.empty())
341 return 0;
342 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
343 return 0;
344 auto *SV = cast<ShuffleVectorInst>(VL.front());
345 unsigned SVNumElements =
346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
347 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
348 if (SVNumElements % ShuffleMaskSize != 0)
349 return 0;
350 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
351 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
352 return 0;
353 unsigned NumGroup = 0;
354 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
355 auto *SV = cast<ShuffleVectorInst>(VL[I]);
356 Value *Src = SV->getOperand(0);
357 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
358 SmallBitVector ExpectedIndex(GroupSize);
359 if (!all_of(Group, [&](Value *V) {
360 auto *SV = cast<ShuffleVectorInst>(V);
361 // From the same source.
362 if (SV->getOperand(0) != Src)
363 return false;
364 int Index;
365 if (!SV->isExtractSubvectorMask(Index))
366 return false;
367 ExpectedIndex.set(Index / ShuffleMaskSize);
368 return true;
369 }))
370 return 0;
371 if (!ExpectedIndex.all())
372 return 0;
373 ++NumGroup;
374 }
375 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
376 return NumGroup;
377}
378
379/// \returns a shufflevector mask which is used to vectorize shufflevectors
380/// e.g.,
381/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
382/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
383/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
384/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
385/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
386/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
388/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
389/// the result is
390/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
391static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
392 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
393 auto *SV = cast<ShuffleVectorInst>(VL.front());
394 unsigned SVNumElements =
395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396 SmallVector<int> Mask;
397 unsigned AccumulateLength = 0;
398 for (Value *V : VL) {
399 auto *SV = cast<ShuffleVectorInst>(V);
400 for (int M : SV->getShuffleMask())
401 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
402 : AccumulateLength + M);
403 AccumulateLength += SVNumElements;
404 }
405 return Mask;
406}
407
408/// \returns True if the value is a constant (but not globals/constant
409/// expressions).
410static bool isConstant(Value *V) {
411 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
412}
413
414/// Checks if \p V is one of vector-like instructions, i.e. undef,
415/// insertelement/extractelement with constant indices for fixed vector type or
416/// extractvalue instruction.
417static bool isVectorLikeInstWithConstOps(Value *V) {
418 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
419 !isa<ExtractValueInst, UndefValue>(V))
420 return false;
421 auto *I = dyn_cast<Instruction>(V);
422 if (!I || isa<ExtractValueInst>(I))
423 return true;
424 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
425 return false;
426 if (isa<ExtractElementInst>(I))
427 return isConstant(I->getOperand(1));
428 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
429 return isConstant(I->getOperand(2));
430}
431
432/// Returns power-of-2 number of elements in a single register (part), given the
433/// total number of elements \p Size and number of registers (parts) \p
434/// NumParts.
435static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
436 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
437}
438
439/// Returns correct remaining number of elements, considering total amount \p
440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
441/// and current register (part) \p Part.
442static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
443 unsigned Part) {
444 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
445}
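// Worked example for the two helpers above: splitting Size = 7 elements into
// NumParts = 2 registers gives PartNumElems = min(7, bit_ceil(divideCeil(7, 2)))
// = 4, so part 0 covers getNumElems(7, 4, 0) = 4 elements and part 1 covers
// getNumElems(7, 4, 1) = min(4, 7 - 4) = 3 elements.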
446
447#if !defined(NDEBUG)
448/// Print a short descriptor of the instruction bundle suitable for debug output.
449static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
450 std::string Result;
451 raw_string_ostream OS(Result);
452 if (Idx >= 0)
453 OS << "Idx: " << Idx << ", ";
454 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
455 return Result;
456}
457#endif
458
459/// \returns true if all of the instructions in \p VL are in the same block or
460/// false otherwise.
461static bool allSameBlock(ArrayRef<Value *> VL) {
462 auto *It = find_if(VL, IsaPred<Instruction>);
463 if (It == VL.end())
464 return false;
465 Instruction *I0 = cast<Instruction>(*It);
466 if (all_of(VL, isVectorLikeInstWithConstOps))
467 return true;
468
469 BasicBlock *BB = I0->getParent();
470 for (Value *V : iterator_range(It, VL.end())) {
471 if (isa<PoisonValue>(V))
472 continue;
473 auto *II = dyn_cast<Instruction>(V);
474 if (!II)
475 return false;
476
477 if (BB != II->getParent())
478 return false;
479 }
480 return true;
481}
482
483/// \returns True if all of the values in \p VL are constants (but not
484/// globals/constant expressions).
485static bool allConstant(ArrayRef<Value *> VL) {
486 // Constant expressions and globals can't be vectorized like normal integer/FP
487 // constants.
488 return all_of(VL, isConstant);
489}
490
491/// \returns True if all of the values in \p VL are identical or some of them
492/// are UndefValue.
493static bool isSplat(ArrayRef<Value *> VL) {
494 Value *FirstNonUndef = nullptr;
495 for (Value *V : VL) {
496 if (isa<UndefValue>(V))
497 continue;
498 if (!FirstNonUndef) {
499 FirstNonUndef = V;
500 continue;
501 }
502 if (V != FirstNonUndef)
503 return false;
504 }
505 return FirstNonUndef != nullptr;
506}
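// For example, {%x, undef, %x} is a splat, {%x, %y} is not, and {undef, undef}
// is not either, because no non-undef value was seen.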
507
508/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
509static bool isCommutative(Instruction *I) {
510 if (auto *Cmp = dyn_cast<CmpInst>(I))
511 return Cmp->isCommutative();
512 if (auto *BO = dyn_cast<BinaryOperator>(I))
513 return BO->isCommutative() ||
514 (BO->getOpcode() == Instruction::Sub &&
515 !BO->hasNUsesOrMore(UsesLimit) &&
516 all_of(
517 BO->uses(),
518 [](const Use &U) {
519 // Commutative, if icmp eq/ne sub, 0
520 CmpPredicate Pred;
521 if (match(U.getUser(),
522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
524 return true;
525 // Commutative, if abs(sub nsw, true) or abs(sub, false).
526 ConstantInt *Flag;
527 return match(U.getUser(),
528 m_Intrinsic<Intrinsic::abs>(
529 m_Specific(U.get()), m_ConstantInt(Flag))) &&
530 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
531 Flag->isOne());
532 })) ||
533 (BO->getOpcode() == Instruction::FSub &&
534 !BO->hasNUsesOrMore(UsesLimit) &&
535 all_of(BO->uses(), [](const Use &U) {
536 return match(U.getUser(),
537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 }));
539 return I->isCommutative();
540}
541
542template <typename T>
543static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
544 unsigned Offset) {
545 static_assert(std::is_same_v<T, InsertElementInst> ||
546 std::is_same_v<T, ExtractElementInst>,
547 "unsupported T");
548 int Index = Offset;
549 if (const auto *IE = dyn_cast<T>(Inst)) {
550 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
551 if (!VT)
552 return std::nullopt;
553 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
554 if (!CI)
555 return std::nullopt;
556 if (CI->getValue().uge(VT->getNumElements()))
557 return std::nullopt;
558 Index *= VT->getNumElements();
559 Index += CI->getZExtValue();
560 return Index;
561 }
562 return std::nullopt;
563}
564
565/// \returns inserting or extracting index of InsertElement, ExtractElement or
566/// InsertValue instruction, using Offset as base offset for index.
567/// \returns std::nullopt if the index is not an immediate.
568static std::optional<unsigned> getElementIndex(const Value *Inst,
569 unsigned Offset = 0) {
570 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
571 return Index;
572 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
573 return Index;
574
575 int Index = Offset;
576
577 const auto *IV = dyn_cast<InsertValueInst>(Inst);
578 if (!IV)
579 return std::nullopt;
580
581 Type *CurrentType = IV->getType();
582 for (unsigned I : IV->indices()) {
583 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
584 Index *= ST->getNumElements();
585 CurrentType = ST->getElementType(I);
586 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
587 Index *= AT->getNumElements();
588 CurrentType = AT->getElementType();
589 } else {
590 return std::nullopt;
591 }
592 Index += I;
593 }
594 return Index;
595}
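// Worked example for the aggregate case above: for
//   insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the returned index is (0 * 2 + 1) * 2 + 0 = 2, i.e. the position of the
// inserted scalar in the row-major flattening of the aggregate.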
596
597namespace {
598/// Specifies the way the mask should be analyzed for undefs/poisonous elements
599/// in the shuffle mask.
600enum class UseMask {
601 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
602 ///< check for the mask elements for the first argument (mask
603 ///< indices are in range [0:VF)).
604 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
605 ///< for the mask elements for the second argument (mask indices
606 ///< are in range [VF:2*VF))
607 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
608 ///< future shuffle elements and mark them as ones as being used
609 ///< in future. Non-undef elements are considered as unused since
610 ///< they're already marked as used in the mask.
611};
612} // namespace
613
614/// Prepares a use bitset for the given mask either for the first argument or
615/// for the second.
616static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
617 UseMask MaskArg) {
618 SmallBitVector UseMask(VF, true);
619 for (auto [Idx, Value] : enumerate(Mask)) {
620 if (Value == PoisonMaskElem) {
621 if (MaskArg == UseMask::UndefsAsMask)
622 UseMask.reset(Idx);
623 continue;
624 }
625 if (MaskArg == UseMask::FirstArg && Value < VF)
626 UseMask.reset(Value);
627 else if (MaskArg == UseMask::SecondArg && Value >= VF)
628 UseMask.reset(Value - VF);
629 }
630 return UseMask;
631}
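// Mechanical example of the loop above: buildUseMask(4, {0, 5, PoisonMaskElem, 2},
// UseMask::FirstArg) clears bits 0 and 2 (those elements of the first operand
// are referenced by the mask), leaves bits 1 and 3 set, skips the poison
// element, and ignores index 5 because it addresses the second operand.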
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
637static SmallBitVector isUndefVector(const Value *V,
638 const SmallBitVector &UseMask = {}) {
639 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
640 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641 if (isa<T>(V))
642 return Res;
643 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644 if (!VecTy)
645 return Res.reset();
646 auto *C = dyn_cast<Constant>(V);
647 if (!C) {
648 if (!UseMask.empty()) {
649 const Value *Base = V;
650 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651 Base = II->getOperand(0);
652 if (isa<T>(II->getOperand(1)))
653 continue;
654 std::optional<unsigned> Idx = getElementIndex(II);
655 if (!Idx) {
656 Res.reset();
657 return Res;
658 }
659 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662 // TODO: Add analysis for shuffles here too.
663 if (V == Base) {
664 Res.reset();
665 } else {
666 SmallBitVector SubMask(UseMask.size(), false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 } else {
670 Res.reset();
671 }
672 return Res;
673 }
674 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
675 if (Constant *Elem = C->getAggregateElement(I))
676 if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680 return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
706isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
707 AssumptionCache *AC) {
708 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
709 if (It == VL.end())
710 return std::nullopt;
711 unsigned Size =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722 Value *Vec1 = nullptr;
723 Value *Vec2 = nullptr;
724 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
725 auto *EE = dyn_cast<ExtractElementInst>(V);
726 if (!EE)
727 return false;
728 Value *Vec = EE->getVectorOperand();
729 if (isa<UndefValue>(Vec))
730 return false;
731 return isGuaranteedNotToBePoison(Vec, AC);
732 });
733 enum ShuffleMode { Unknown, Select, Permute };
734 ShuffleMode CommonShuffleMode = Unknown;
735 Mask.assign(VL.size(), PoisonMaskElem);
736 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
737 // Undef can be represented as an undef element in a vector.
738 if (isa<UndefValue>(VL[I]))
739 continue;
740 auto *EI = cast<ExtractElementInst>(VL[I]);
741 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742 return std::nullopt;
743 auto *Vec = EI->getVectorOperand();
744 // We can extractelement from undef or poison vector.
745 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746 continue;
747 // All vector operands must have the same number of vector elements.
748 if (isa<UndefValue>(Vec)) {
749 Mask[I] = I;
750 } else {
751 if (isa<UndefValue>(EI->getIndexOperand()))
752 continue;
753 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754 if (!Idx)
755 return std::nullopt;
756 // Undefined behavior if Idx is negative or >= Size.
757 if (Idx->getValue().uge(Size))
758 continue;
759 unsigned IntIdx = Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762 if (isUndefVector(Vec).all() && HasNonUndefVec)
763 continue;
764 // For correct shuffling we have to have at most 2 different vector operands
765 // in all extractelement instructions.
766 if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 } else if (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] += Size;
771 } else {
772 return std::nullopt;
773 }
774 if (CommonShuffleMode == Permute)
775 continue;
776 // If the extract index is not the same as the operation number, it is a
777 // permutation.
778 if (Mask[I] % Size != I) {
779 CommonShuffleMode = Permute;
780 continue;
781 }
782 CommonShuffleMode = Select;
783 }
784 // If we're not crossing lanes in different vectors, consider it as blending.
785 if (CommonShuffleMode == Select && Vec2)
786 return TargetTransformInfo::SK_Select;
787 // If Vec2 was never used, we have a permutation of a single vector, otherwise
788 // we have permutation of 2 vectors.
789 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
790 : TargetTransformInfo::SK_PermuteSingleSrc;
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned> getExtractIndex(Instruction *E) {
795 unsigned Opcode = E->getOpcode();
796 assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798 "Expected extractelement or extractvalue instruction.");
799 if (Opcode == Instruction::ExtractElement) {
800 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801 if (!CI)
802 return std::nullopt;
803 return CI->getZExtValue();
804 }
805 auto *EI = cast<ExtractValueInst>(E);
806 if (EI->getNumIndices() != 1)
807 return std::nullopt;
808 return *EI->idx_begin();
809}
810
811namespace {
812
813/// Main data required for vectorization of instructions.
814class InstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816 Instruction *MainOp = nullptr;
817 Instruction *AltOp = nullptr;
818
819public:
820 Instruction *getMainOp() const {
821 assert(valid() && "InstructionsState is invalid.");
822 return MainOp;
823 }
824
825 Instruction *getAltOp() const {
826 assert(valid() && "InstructionsState is invalid.");
827 return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
832
833 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
837
838 bool isOpcodeOrAlt(Instruction *I) const {
839 unsigned CheckedOpcode = I->getOpcode();
840 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp
844 bool valid() const { return MainOp && AltOp; }
845
846 explicit operator bool() const { return valid(); }
847
848 InstructionsState() = delete;
849 InstructionsState(Instruction *MainOp, Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851 static InstructionsState invalid() { return {nullptr, nullptr}; }
852};
853
854} // end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861static bool isValidForAlternation(unsigned Opcode) {
862 if (Instruction::isIntDivRem(Opcode))
863 return false;
864
865 return true;
866}
867
868static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
869 const TargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
874 Value *Op1, const TargetLibraryInfo &TLI) {
875 return (isConstant(BaseOp0) && isConstant(Op0)) ||
876 (isConstant(BaseOp1) && isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880 getSameOpcode({BaseOp0, Op0}, TLI) ||
881 getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
888 const TargetLibraryInfo &TLI) {
889 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890 "Assessing comparisons of different types?");
891 CmpInst::Predicate BasePred = BaseCI->getPredicate();
892 CmpInst::Predicate Pred = CI->getPredicate();
893 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
894
895 Value *BaseOp0 = BaseCI->getOperand(0);
896 Value *BaseOp1 = BaseCI->getOperand(1);
897 Value *Op0 = CI->getOperand(0);
898 Value *Op1 = CI->getOperand(1);
899
900 return (BasePred == Pred &&
901 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns analysis of the Instructions in \p VL described in
907/// InstructionsState, the Opcode that we suppose the whole list
908/// could be vectorized even if its structure is diverse.
909static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
910 const TargetLibraryInfo &TLI) {
911 // Make sure these are all Instructions.
912 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913 return InstructionsState::invalid();
914
915 auto *It = find_if(VL, IsaPred<Instruction>);
916 if (It == VL.end())
917 return InstructionsState::invalid();
918
919 Instruction *MainOp = cast<Instruction>(*It);
920 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923 return InstructionsState::invalid();
924
925 bool IsCastOp = isa<CastInst>(MainOp);
926 bool IsBinOp = isa<BinaryOperator>(MainOp);
927 bool IsCmpOp = isa<CmpInst>(MainOp);
928 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
929 : CmpInst::BAD_ICMP_PREDICATE;
930 Instruction *AltOp = MainOp;
931 unsigned Opcode = MainOp->getOpcode();
932 unsigned AltOpcode = Opcode;
933
934 bool SwappedPredsCompatible = IsCmpOp && [&]() {
935 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938 for (Value *V : VL) {
939 auto *I = dyn_cast<CmpInst>(V);
940 if (!I)
941 return false;
942 CmpInst::Predicate CurrentPred = I->getPredicate();
943 CmpInst::Predicate SwappedCurrentPred =
944 CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946 if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950 // Total number of predicates > 2, but if consider swapped predicates
951 // compatible only 2, consider swappable predicates as compatible opcodes,
952 // not alternate.
953 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955 // Check for one alternate opcode from another BinaryOperator.
956 // TODO - generalize to support all operators (types, calls etc.).
957 Intrinsic::ID BaseID = 0;
958 SmallVector<VFInfo> BaseMappings;
959 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
960 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
961 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
962 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963 return InstructionsState::invalid();
964 }
965 bool AnyPoison = InstCnt != VL.size();
966 // Skip MainOp.
967 for (Value *V : iterator_range(It + 1, VL.end())) {
968 auto *I = dyn_cast<Instruction>(V);
969 if (!I)
970 continue;
971
972 // Cannot combine poison and divisions.
973 // TODO: do some smart analysis of the CallInsts to exclude divide-like
974 // intrinsics/functions only.
975 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
976 return InstructionsState::invalid();
977 unsigned InstOpcode = I->getOpcode();
978 if (IsBinOp && isa<BinaryOperator>(I)) {
979 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
980 continue;
981 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
982 isValidForAlternation(Opcode)) {
983 AltOpcode = InstOpcode;
984 AltOp = I;
985 continue;
986 }
987 } else if (IsCastOp && isa<CastInst>(I)) {
988 Value *Op0 = MainOp->getOperand(0);
989 Type *Ty0 = Op0->getType();
990 Value *Op1 = I->getOperand(0);
991 Type *Ty1 = Op1->getType();
992 if (Ty0 == Ty1) {
993 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
994 continue;
995 if (Opcode == AltOpcode) {
996 assert(isValidForAlternation(Opcode) &&
997 isValidForAlternation(InstOpcode) &&
998 "Cast isn't safe for alternation, logic needs to be updated!");
999 AltOpcode = InstOpcode;
1000 AltOp = I;
1001 continue;
1002 }
1003 }
1004 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1005 auto *BaseInst = cast<CmpInst>(MainOp);
1006 Type *Ty0 = BaseInst->getOperand(0)->getType();
1007 Type *Ty1 = Inst->getOperand(0)->getType();
1008 if (Ty0 == Ty1) {
1009 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1010 assert(InstOpcode == AltOpcode &&
1011 "Alternate instructions are only supported by BinaryOperator "
1012 "and CastInst.");
1013 // Check for compatible operands. If the corresponding operands are not
1014 // compatible - need to perform alternate vectorization.
1015 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1016 CmpInst::Predicate SwappedCurrentPred =
1017 CmpInst::getSwappedPredicate(CurrentPred);
1018
1019 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1020 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1021 continue;
1022
1023 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1024 continue;
1025 auto *AltInst = cast<CmpInst>(AltOp);
1026 if (MainOp != AltOp) {
1027 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1028 continue;
1029 } else if (BasePred != CurrentPred) {
1030 assert(
1031 isValidForAlternation(InstOpcode) &&
1032 "CmpInst isn't safe for alternation, logic needs to be updated!");
1033 AltOp = I;
1034 continue;
1035 }
1036 CmpInst::Predicate AltPred = AltInst->getPredicate();
1037 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1038 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1039 continue;
1040 }
1041 } else if (InstOpcode == Opcode) {
1042 assert(InstOpcode == AltOpcode &&
1043 "Alternate instructions are only supported by BinaryOperator and "
1044 "CastInst.");
1045 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1046 if (Gep->getNumOperands() != 2 ||
1047 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1048 return InstructionsState::invalid();
1049 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1050 if (!isVectorLikeInstWithConstOps(EI))
1051 return InstructionsState::invalid();
1052 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1053 auto *BaseLI = cast<LoadInst>(MainOp);
1054 if (!LI->isSimple() || !BaseLI->isSimple())
1055 return InstructionsState::invalid();
1056 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1057 auto *CallBase = cast<CallInst>(MainOp);
1058 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1059 return InstructionsState::invalid();
1060 if (Call->hasOperandBundles() &&
1061 (!CallBase->hasOperandBundles() ||
1062 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1063 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1064 CallBase->op_begin() +
1065 CallBase->getBundleOperandsStartIndex())))
1066 return InstructionsState::invalid();
1067 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1068 if (ID != BaseID)
1069 return InstructionsState::invalid();
1070 if (!ID) {
1071 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1072 if (Mappings.size() != BaseMappings.size() ||
1073 Mappings.front().ISA != BaseMappings.front().ISA ||
1074 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1075 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1076 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1077 Mappings.front().Shape.Parameters !=
1078 BaseMappings.front().Shape.Parameters)
1079 return InstructionsState::invalid();
1080 }
1081 }
1082 continue;
1083 }
1084 return InstructionsState::invalid();
1085 }
1086
1087 return InstructionsState(MainOp, AltOp);
1088}
1089
1090/// \returns true if all of the values in \p VL have the same type or false
1091/// otherwise.
1092static bool allSameType(ArrayRef<Value *> VL) {
1093 Type *Ty = VL.front()->getType();
1094 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1095}
1096
1097/// \returns True if in-tree use also needs extract. This refers to
1098/// possible scalar operand in vectorized instruction.
1099static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1100 TargetLibraryInfo *TLI,
1101 const TargetTransformInfo *TTI) {
1102 if (!UserInst)
1103 return false;
1104 unsigned Opcode = UserInst->getOpcode();
1105 switch (Opcode) {
1106 case Instruction::Load: {
1107 LoadInst *LI = cast<LoadInst>(UserInst);
1108 return (LI->getPointerOperand() == Scalar);
1109 }
1110 case Instruction::Store: {
1111 StoreInst *SI = cast<StoreInst>(UserInst);
1112 return (SI->getPointerOperand() == Scalar);
1113 }
1114 case Instruction::Call: {
1115 CallInst *CI = cast<CallInst>(UserInst);
1116 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1117 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1118 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1119 Arg.value().get() == Scalar;
1120 });
1121 }
1122 default:
1123 return false;
1124 }
1125}
1126
1127/// \returns the AA location that is being accessed by the instruction.
1128static MemoryLocation getLocation(Instruction *I) {
1129 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1130 return MemoryLocation::get(SI);
1131 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1132 return MemoryLocation::get(LI);
1133 return MemoryLocation();
1134}
1135
1136/// \returns True if the instruction is not a volatile or atomic load/store.
1137static bool isSimple(Instruction *I) {
1138 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1139 return LI->isSimple();
1140 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1141 return SI->isSimple();
1142 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1143 return !MI->isVolatile();
1144 return true;
1145}
1146
1147/// Shuffles \p Mask in accordance with the given \p SubMask.
1148/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1149/// one but two input vectors.
1150static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1151 bool ExtendingManyInputs = false) {
1152 if (SubMask.empty())
1153 return;
1154 assert(
1155 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1156 // Check if input scalars were extended to match the size of other node.
1157 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1158 "SubMask with many inputs support must be larger than the mask.");
1159 if (Mask.empty()) {
1160 Mask.append(SubMask.begin(), SubMask.end());
1161 return;
1162 }
1163 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1164 int TermValue = std::min(Mask.size(), SubMask.size());
1165 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1166 if (SubMask[I] == PoisonMaskElem ||
1167 (!ExtendingManyInputs &&
1168 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1169 continue;
1170 NewMask[I] = Mask[SubMask[I]];
1171 }
1172 Mask.swap(NewMask);
1173}
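// Composition example for the non-extending case above: Mask = {3, 2, 1, 0}
// combined with SubMask = {1, 0, 3, 2} yields {2, 3, 0, 1}, since each new
// element I is Mask[SubMask[I]].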
1174
1175/// Order may have elements assigned the special value (size) which is out of
1176/// bounds. Such indices only appear in places which correspond to undef values
1177/// (see canReuseExtract for details) and are used to prevent undef values from
1178/// affecting the operand ordering.
1179/// The first loop below simply finds all unused indices and then the next loop
1180/// nest assigns these indices for undef values positions.
1181/// As an example below Order has two undef positions and they have assigned
1182/// values 3 and 7 respectively:
1183/// before: 6 9 5 4 9 2 1 0
1184/// after: 6 3 5 4 7 2 1 0
1185static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1186 const unsigned Sz = Order.size();
1187 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1188 SmallBitVector MaskedIndices(Sz);
1189 for (unsigned I = 0; I < Sz; ++I) {
1190 if (Order[I] < Sz)
1191 UnusedIndices.reset(Order[I]);
1192 else
1193 MaskedIndices.set(I);
1194 }
1195 if (MaskedIndices.none())
1196 return;
1197 assert(UnusedIndices.count() == MaskedIndices.count() &&
1198 "Non-synced masked/available indices.");
1199 int Idx = UnusedIndices.find_first();
1200 int MIdx = MaskedIndices.find_first();
1201 while (MIdx >= 0) {
1202 assert(Idx >= 0 && "Indices must be synced.");
1203 Order[MIdx] = Idx;
1204 Idx = UnusedIndices.find_next(Idx);
1205 MIdx = MaskedIndices.find_next(MIdx);
1206 }
1207}
1208
1209/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1210/// Opcode1.
1211static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1212 unsigned Opcode1) {
1213 Type *ScalarTy = VL[0]->getType();
1214 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1215 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1216 for (unsigned Lane : seq<unsigned>(VL.size())) {
1217 if (isa<PoisonValue>(VL[Lane]))
1218 continue;
1219 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1220 OpcodeMask.set(Lane * ScalarTyNumElements,
1221 Lane * ScalarTyNumElements + ScalarTyNumElements);
1222 }
1223 return OpcodeMask;
1224}
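// For example, for scalar VL = {add, sub, add, sub} with Opcode0 = Add and
// Opcode1 = Sub, the returned bitset is {0, 1, 0, 1}: only the lanes whose
// opcode matches Opcode1 are set.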
1225
1226namespace llvm {
1227
1228static void inversePermutation(ArrayRef<unsigned> Indices,
1229 SmallVectorImpl<int> &Mask) {
1230 Mask.clear();
1231 const unsigned E = Indices.size();
1232 Mask.resize(E, PoisonMaskElem);
1233 for (unsigned I = 0; I < E; ++I)
1234 Mask[Indices[I]] = I;
1235}
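// For example, Indices = {2, 0, 1} produces Mask = {1, 2, 0}: element 2 of the
// source is emitted first, so Mask[2] = 0, and so on.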
1236
1237/// Reorders the list of scalars in accordance with the given \p Mask.
1238static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1239 ArrayRef<int> Mask) {
1240 assert(!Mask.empty() && "Expected non-empty mask.");
1241 SmallVector<Value *> Prev(Scalars.size(),
1242 PoisonValue::get(Scalars.front()->getType()));
1243 Prev.swap(Scalars);
1244 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1245 if (Mask[I] != PoisonMaskElem)
1246 Scalars[Mask[I]] = Prev[I];
1247}
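// For example, Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a}:
// each original element Prev[I] is placed at position Mask[I].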
1248
1249/// Checks if the provided value does not require scheduling. It does not
1250/// require scheduling if this is not an instruction or it is an instruction
1251/// that does not read/write memory and all operands are either not instructions
1252/// or phi nodes or instructions from different blocks.
1253static bool areAllOperandsNonInsts(Value *V) {
1254 auto *I = dyn_cast<Instruction>(V);
1255 if (!I)
1256 return true;
1257 return !mayHaveNonDefUseDependency(*I) &&
1258 all_of(I->operands(), [I](Value *V) {
1259 auto *IO = dyn_cast<Instruction>(V);
1260 if (!IO)
1261 return true;
1262 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1263 });
1264}
1265
1266/// Checks if the provided value does not require scheduling. It does not
1267/// require scheduling if this is not an instruction or it is an instruction
1268/// that does not read/write memory and all users are phi nodes or instructions
1269/// from the different blocks.
1270static bool isUsedOutsideBlock(Value *V) {
1271 auto *I = dyn_cast<Instruction>(V);
1272 if (!I)
1273 return true;
1274 // Limits the number of uses to save compile time.
1275 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1276 all_of(I->users(), [I](User *U) {
1277 auto *IU = dyn_cast<Instruction>(U);
1278 if (!IU)
1279 return true;
1280 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1281 });
1282}
1283
1284/// Checks if the specified value does not require scheduling. It does not
1285/// require scheduling if all operands and all users do not need to be scheduled
1286/// in the current basic block.
1287static bool doesNotNeedToBeScheduled(Value *V) {
1288 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1289}
1290
1291/// Checks if the specified array of instructions does not require scheduling.
1292/// It is so if all either instructions have operands that do not require
1293/// scheduling or their users do not require scheduling since they are phis or
1294/// in other basic blocks.
1295static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1296 return !VL.empty() &&
1297 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1298}
1299
1300/// Returns true if widened type of \p Ty elements with size \p Sz represents
1301/// full vector type, i.e. adding extra element results in extra parts upon type
1302/// legalization.
1303static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1304 unsigned Sz) {
1305 if (Sz <= 1)
1306 return false;
1307 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1308 return false;
1309 if (has_single_bit(Sz))
1310 return true;
1311 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1312 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1313 Sz % NumParts == 0;
1314}
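// Worked example, assuming a hypothetical target with 2 i32 elements per
// register: Sz = 6 is accepted because <6 x i32> splits into 3 whole parts and
// 6 / 3 = 2 is a power of two, whereas Sz = 7 is rejected (7 is not a power of
// two and does not split evenly into the resulting parts).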
1315
1316namespace slpvectorizer {
1317
1318/// Bottom Up SLP Vectorizer.
1319class BoUpSLP {
1320 struct TreeEntry;
1321 struct ScheduleData;
1322 class ShuffleCostEstimator;
1323 class ShuffleInstructionBuilder;
1324
1325public:
1326 /// Tracks the state we can represent the loads in the given sequence.
1327 enum class LoadsState {
1328 Gather,
1329 Vectorize,
1330 ScatterVectorize,
1331 StridedVectorize
1332 };
1333
1340
1341 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1342 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1343 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1344 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1345 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1346 AC(AC), DB(DB), DL(DL), ORE(ORE),
1347 Builder(Se->getContext(), TargetFolder(*DL)) {
1348 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1349 // Use the vector register size specified by the target unless overridden
1350 // by a command-line option.
1351 // TODO: It would be better to limit the vectorization factor based on
1352 // data type rather than just register size. For example, x86 AVX has
1353 // 256-bit registers, but it does not support integer operations
1354 // at that width (that requires AVX2).
1355 if (MaxVectorRegSizeOption.getNumOccurrences())
1356 MaxVecRegSize = MaxVectorRegSizeOption;
1357 else
1358 MaxVecRegSize =
1359 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1360 .getFixedValue();
1361
1362 if (MinVectorRegSizeOption.getNumOccurrences())
1363 MinVecRegSize = MinVectorRegSizeOption;
1364 else
1365 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1366 }
1367
1368 /// Vectorize the tree that starts with the elements in \p VL.
1369 /// Returns the vectorized root.
1370 Value *vectorizeTree();
1371
1372 /// Vectorize the tree but with the list of externally used values \p
1373 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1374 /// generated extractvalue instructions.
1375 Value *
1376 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1377 Instruction *ReductionRoot = nullptr);
1378
1379 /// \returns the cost incurred by unwanted spills and fills, caused by
1380 /// holding live values over call sites.
1381 InstructionCost getSpillCost() const;
1382
1383 /// \returns the vectorization cost of the subtree that starts at \p VL.
1384 /// A negative number means that this is profitable.
1385 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1386
1387 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1388 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1389 void buildTree(ArrayRef<Value *> Roots,
1390 const SmallDenseSet<Value *> &UserIgnoreLst);
1391
1392 /// Construct a vectorizable tree that starts at \p Roots.
1393 void buildTree(ArrayRef<Value *> Roots);
1394
1395 /// Returns whether the root node has in-tree uses.
1396 bool doesRootHaveInTreeUses() const {
1397 return !VectorizableTree.empty() &&
1398 !VectorizableTree.front()->UserTreeIndices.empty();
1399 }
1400
1401 /// Return the scalars of the root node.
1402 ArrayRef<Value *> getRootNodeScalars() const {
1403 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1404 return VectorizableTree.front()->Scalars;
1405 }
1406
1407 /// Returns the type/is-signed info for the root node in the graph without
1408 /// casting.
1409 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1410 const TreeEntry &Root = *VectorizableTree.front().get();
1411 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1412 !Root.Scalars.front()->getType()->isIntegerTy())
1413 return std::nullopt;
1414 auto It = MinBWs.find(&Root);
1415 if (It != MinBWs.end())
1416 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1417 It->second.first),
1418 It->second.second);
1419 if (Root.getOpcode() == Instruction::ZExt ||
1420 Root.getOpcode() == Instruction::SExt)
1421 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1422 Root.getOpcode() == Instruction::SExt);
1423 return std::nullopt;
1424 }
1425
1426 /// Checks if the root graph node can be emitted with narrower bitwidth at
1427 /// codegen and returns its signedness, if so.
1428 bool isSignedMinBitwidthRootNode() const {
1429 return MinBWs.at(VectorizableTree.front().get()).second;
1430 }
1431
1432 /// Returns reduction type after minbitwidth analysis.
1433 FixedVectorType *getReductionType() const {
1434 if (ReductionBitWidth == 0 ||
1435 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1436 ReductionBitWidth >=
1437 DL->getTypeSizeInBits(
1438 VectorizableTree.front()->Scalars.front()->getType()))
1439 return getWidenedType(
1440 VectorizableTree.front()->Scalars.front()->getType(),
1441 VectorizableTree.front()->getVectorFactor());
1442 return getWidenedType(
1443 IntegerType::get(
1444 VectorizableTree.front()->Scalars.front()->getContext(),
1445 ReductionBitWidth),
1446 VectorizableTree.front()->getVectorFactor());
1447 }
1448
1449 /// Builds external uses of the vectorized scalars, i.e. the list of
1450 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1451 /// ExternallyUsedValues contains additional list of external uses to handle
1452 /// vectorization of reductions.
1453 void
1454 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1455
1456 /// Transforms graph nodes to target specific representations, if profitable.
1457 void transformNodes();
1458
1459 /// Clear the internal data structures that are created by 'buildTree'.
1460 void deleteTree() {
1461 VectorizableTree.clear();
1462 ScalarToTreeEntry.clear();
1463 MultiNodeScalars.clear();
1464 MustGather.clear();
1465 NonScheduledFirst.clear();
1466 EntryToLastInstruction.clear();
1467 LoadEntriesToVectorize.clear();
1468 IsGraphTransformMode = false;
1469 GatheredLoadsEntriesFirst.reset();
1470 ExternalUses.clear();
1471 ExternalUsesAsOriginalScalar.clear();
1472 for (auto &Iter : BlocksSchedules) {
1473 BlockScheduling *BS = Iter.second.get();
1474 BS->clear();
1475 }
1476 MinBWs.clear();
1477 ReductionBitWidth = 0;
1478 BaseGraphSize = 1;
1479 CastMaxMinBWSizes.reset();
1480 ExtraBitWidthNodes.clear();
1481 InstrElementSize.clear();
1482 UserIgnoreList = nullptr;
1483 PostponedGathers.clear();
1484 ValueToGatherNodes.clear();
1485 }
1486
1487 unsigned getTreeSize() const { return VectorizableTree.size(); }
1488
1489 /// Returns the base graph size, before any transformations.
1490 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1491
1492 /// Perform LICM and CSE on the newly generated gather sequences.
1493 void optimizeGatherSequence();
1494
1495 /// Does this non-empty order represent an identity order? Identity
1496 /// should be represented as an empty order, so this is used to
1497 /// decide if we can canonicalize a computed order. Undef elements
1498 /// (represented as size) are ignored.
1499 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
1500 assert(!Order.empty() && "expected non-empty order");
1501 const unsigned Sz = Order.size();
1502 return all_of(enumerate(Order), [&](const auto &P) {
1503 return P.value() == P.index() || P.value() == Sz;
1504 });
1505 }
1506
1507 /// Checks if the specified gather tree entry \p TE can be represented as a
1508 /// shuffled vector entry + (possibly) permutation with other gathers. It
1509 /// implements the checks only for possibly ordered scalars (Loads,
1510 /// ExtractElement, ExtractValue), which can be part of the graph.
1511 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1512
1513 /// Sort loads into increasing pointers offsets to allow greater clustering.
1514 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1515
1516 /// Gets reordering data for the given tree entry. If the entry is vectorized
1517 /// - just return ReorderIndices, otherwise check if the scalars can be
1518 /// reordered and return the most optimal order.
1519 /// \return std::nullopt if ordering is not important, empty order, if
1520 /// identity order is important, or the actual order.
1521 /// \param TopToBottom If true, include the order of vectorized stores and
1522 /// insertelement nodes, otherwise skip them.
1523 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1524 bool TopToBottom);
1525
1526 /// Reorders the current graph to the most profitable order starting from the
1527 /// root node to the leaf nodes. The best order is chosen only from the nodes
1528 /// of the same size (vectorization factor). Smaller nodes are considered
1529 /// parts of a subgraph with smaller VF and they are reordered independently. We
1530 /// can do this because we still need to extend smaller nodes to the wider VF
1531 /// and we can merge reordering shuffles with the widening shuffles.
1532 void reorderTopToBottom();
1533
1534 /// Reorders the current graph to the most profitable order starting from
1535 /// leaves to the root. It allows rotating small subgraphs and reducing the
1536 /// number of reshuffles if the leaf nodes use the same order. In this case we
1537 /// can merge the orders and just shuffle the user node instead of shuffling its
1538 /// operands. Plus, even if the leaf nodes have different orders, it allows
1539 /// sinking reordering in the graph closer to the root node and merging it later
1540 /// during analysis.
1540 /// during analysis.
1541 void reorderBottomToTop(bool IgnoreReorder = false);
1542
1543 /// \return The vector element size in bits to use when vectorizing the
1544 /// expression tree ending at \p V. If V is a store, the size is the width of
1545 /// the stored value. Otherwise, the size is the width of the largest loaded
1546 /// value reaching V. This method is used by the vectorizer to calculate
1547 /// vectorization factors.
1548 unsigned getVectorElementSize(Value *V);
1549
1550 /// Compute the minimum type sizes required to represent the entries in a
1551 /// vectorizable tree.
1552 void computeMinimumValueSizes();
1553
1554 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1555 unsigned getMaxVecRegSize() const {
1556 return MaxVecRegSize;
1557 }
1558
1559 // \returns minimum vector register size as set by cl::opt.
1560 unsigned getMinVecRegSize() const {
1561 return MinVecRegSize;
1562 }
1563
1564 unsigned getMinVF(unsigned Sz) const {
1565 return std::max(2U, getMinVecRegSize() / Sz);
1566 }
1567
1568 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1569 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1570 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1571 return MaxVF ? MaxVF : UINT_MAX;
1572 }
1573
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1578 ///
1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1580 unsigned canMapToVector(Type *T) const;
1581
1582 /// \returns True if the VectorizableTree is both tiny and not fully
1583 /// vectorizable. We do not vectorize such trees.
1584 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1585
1586 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1587  /// It may happen if all gather nodes are loads and they cannot be
1588 /// "clusterized". In this case even subgraphs cannot be vectorized more
1589 /// effectively than the base graph.
1590 bool isTreeNotExtendable() const;
1591
1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1593 /// can be load combined in the backend. Load combining may not be allowed in
1594 /// the IR optimizer, so we do not want to alter the pattern. For example,
1595 /// partially transforming a scalar bswap() pattern into vector code is
1596 /// effectively impossible for the backend to undo.
1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1598 /// may not be necessary.
1599 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1600
1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1602 /// can be load combined in the backend. Load combining may not be allowed in
1603 /// the IR optimizer, so we do not want to alter the pattern. For example,
1604 /// partially transforming a scalar bswap() pattern into vector code is
1605 /// effectively impossible for the backend to undo.
1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1607 /// may not be necessary.
1608 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1609
1610  /// Checks if the given array of loads can be represented as a vectorized
1611  /// load, a scatter (masked gather), or just a simple gather.
1612 /// \param VL list of loads.
1613 /// \param VL0 main load value.
1614 /// \param Order returned order of load instructions.
1615 /// \param PointerOps returned list of pointer operands.
1616 /// \param BestVF return best vector factor, if recursive check found better
1617 /// vectorization sequences rather than masked gather.
1618 /// \param TryRecursiveCheck used to check if long masked gather can be
1619  /// represented as a series of loads/insert-subvector operations, if profitable.
1620  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1621                               SmallVectorImpl<unsigned> &Order,
1622                               SmallVectorImpl<Value *> &PointerOps,
1623 unsigned *BestVF = nullptr,
1624 bool TryRecursiveCheck = true) const;
1625
1626  /// Registers a non-vectorizable sequence of loads.
1627 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1628 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1629 }
1630
1631  /// Checks if the given load sequence is already known to be non-vectorizable.
1632  template <typename T>
1633  bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1634    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1635 }
1636
1638
1639 /// This structure holds any data we need about the edges being traversed
1640 /// during buildTree_rec(). We keep track of:
1641 /// (i) the user TreeEntry index, and
1642 /// (ii) the index of the edge.
1643 struct EdgeInfo {
1644 EdgeInfo() = default;
1645    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1646        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1647 /// The user TreeEntry.
1648 TreeEntry *UserTE = nullptr;
1649 /// The operand index of the use.
1650 unsigned EdgeIdx = UINT_MAX;
1651#ifndef NDEBUG
1652    friend inline raw_ostream &operator<<(raw_ostream &OS,
1653                                          const BoUpSLP::EdgeInfo &EI) {
1654 EI.dump(OS);
1655 return OS;
1656 }
1657 /// Debug print.
1658 void dump(raw_ostream &OS) const {
1659 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1660 << " EdgeIdx:" << EdgeIdx << "}";
1661 }
1662 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1663#endif
1664 bool operator == (const EdgeInfo &Other) const {
1665 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1666 }
1667 };
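  // Example (illustrative): an EdgeInfo value {UserTE, /*EdgeIdx=*/1} records
  // that the current bundle feeds operand number 1 of the instructions in the
  // user tree entry; dump() prints it as "{User:<Idx> EdgeIdx:1}".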
1668
1669 /// A helper class used for scoring candidates for two consecutive lanes.
1670  class LookAheadHeuristics {
1671    const TargetLibraryInfo &TLI;
1672 const DataLayout &DL;
1673 ScalarEvolution &SE;
1674 const BoUpSLP &R;
1675 int NumLanes; // Total number of lanes (aka vectorization factor).
1676 int MaxLevel; // The maximum recursion depth for accumulating score.
1677
1678 public:
1679    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1680                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1681 int MaxLevel)
1682 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1684
1685    // The hard-coded scores listed here are not very important, though they
1686    // should be higher for better matches to improve the resulting cost. When
1687 // computing the scores of matching one sub-tree with another, we are
1688 // basically counting the number of values that are matching. So even if all
1689 // scores are set to 1, we would still get a decent matching result.
1690 // However, sometimes we have to break ties. For example we may have to
1691 // choose between matching loads vs matching opcodes. This is what these
1692 // scores are helping us with: they provide the order of preference. Also,
1693 // this is important if the scalar is externally used or used in another
1694 // tree entry node in the different lane.
1695
1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1697 static const int ScoreConsecutiveLoads = 4;
1698 /// The same load multiple times. This should have a better score than
1699    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1700    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1701    /// for a vector load plus 1.0 for a broadcast.
1702 static const int ScoreSplatLoads = 3;
1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1704 static const int ScoreReversedLoads = 3;
1705 /// A load candidate for masked gather.
1706 static const int ScoreMaskedGatherCandidate = 1;
1707 /// ExtractElementInst from same vector and consecutive indexes.
1708 static const int ScoreConsecutiveExtracts = 4;
1709 /// ExtractElementInst from same vector and reversed indices.
1710 static const int ScoreReversedExtracts = 3;
1711 /// Constants.
1712 static const int ScoreConstants = 2;
1713 /// Instructions with the same opcode.
1714 static const int ScoreSameOpcode = 2;
1715    /// Instructions with alt opcodes (e.g., add + sub).
1716 static const int ScoreAltOpcodes = 1;
1717 /// Identical instructions (a.k.a. splat or broadcast).
1718 static const int ScoreSplat = 1;
1719 /// Matching with an undef is preferable to failing.
1720 static const int ScoreUndef = 1;
1721 /// Score for failing to find a decent match.
1722 static const int ScoreFail = 0;
1723 /// Score if all users are vectorized.
1724 static const int ScoreAllUserVectorized = 1;
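    // For intuition (illustrative note): when the heuristics must choose
    // between pairing two consecutive loads (ScoreConsecutiveLoads == 4) and
    // pairing two instructions that merely share an opcode
    // (ScoreSameOpcode == 2), the consecutive loads win the tie-break.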
1725
1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1729 /// MainAltOps.
1730    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1731                        ArrayRef<Value *> MainAltOps) const {
1732 if (!isValidElementType(V1->getType()) ||
1733 !isValidElementType(V2->getType()))
1734        return LookAheadHeuristics::ScoreFail;
1735
1736 if (V1 == V2) {
1737 if (isa<LoadInst>(V1)) {
1738          // Returns true if the users of V1 and V2 won't need to be extracted.
1739 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1740 // Bail out if we have too many uses to save compilation time.
1741 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1742 return false;
1743
1744 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1745 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1747 });
1748 };
1749 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1750 };
1751 // A broadcast of a load can be cheaper on some targets.
1752 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1753 ElementCount::getFixed(NumLanes)) &&
1754 ((int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1756            return LookAheadHeuristics::ScoreSplatLoads;
1757        }
1758        return LookAheadHeuristics::ScoreSplat;
1759      }
1760
1761 auto CheckSameEntryOrFail = [&]() {
1762 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1764          return LookAheadHeuristics::ScoreSplatLoads;
1765        return LookAheadHeuristics::ScoreFail;
1766      };
1767
1768 auto *LI1 = dyn_cast<LoadInst>(V1);
1769 auto *LI2 = dyn_cast<LoadInst>(V2);
1770 if (LI1 && LI2) {
1771 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1772 !LI2->isSimple())
1773 return CheckSameEntryOrFail();
1774
1775 std::optional<int> Dist = getPointersDiff(
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1778 if (!Dist || *Dist == 0) {
1779 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1780 getUnderlyingObject(LI2->getPointerOperand()) &&
1781 R.TTI->isLegalMaskedGather(
1782 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1783            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1784          return CheckSameEntryOrFail();
1785 }
1786 // The distance is too large - still may be profitable to use masked
1787 // loads/gathers.
1788 if (std::abs(*Dist) > NumLanes / 2)
1789          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1790        // This still will detect consecutive loads, but we might have "holes"
1791        // in some cases. It is ok for non-power-of-2 vectorization and may produce
1792        // better results. It should not affect current vectorization.
1793        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1794                           : LookAheadHeuristics::ScoreReversedLoads;
1795      }
1796
1797 auto *C1 = dyn_cast<Constant>(V1);
1798 auto *C2 = dyn_cast<Constant>(V2);
1799 if (C1 && C2)
1800        return LookAheadHeuristics::ScoreConstants;
1801
1802 // Extracts from consecutive indexes of the same vector better score as
1803 // the extracts could be optimized away.
1804 Value *EV1;
1805 ConstantInt *Ex1Idx;
1806 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1807 // Undefs are always profitable for extractelements.
1808 // Compiler can easily combine poison and extractelement <non-poison> or
1809 // undef and extractelement <poison>. But combining undef +
1810 // extractelement <non-poison-but-may-produce-poison> requires some
1811 // extra operations.
1812 if (isa<UndefValue>(V2))
1813 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1814                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
1815                     : LookAheadHeuristics::ScoreSameOpcode;
1816        Value *EV2 = nullptr;
1817 ConstantInt *Ex2Idx = nullptr;
1818 if (match(V2,
1819                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1820                                                         m_Undef())))) {
1821 // Undefs are always profitable for extractelements.
1822 if (!Ex2Idx)
1823            return LookAheadHeuristics::ScoreConsecutiveExtracts;
1824          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1825            return LookAheadHeuristics::ScoreConsecutiveExtracts;
1826          if (EV2 == EV1) {
1827 int Idx1 = Ex1Idx->getZExtValue();
1828 int Idx2 = Ex2Idx->getZExtValue();
1829 int Dist = Idx2 - Idx1;
1830 // The distance is too large - still may be profitable to use
1831 // shuffles.
1832            if (std::abs(Dist) == 0)
1833              return LookAheadHeuristics::ScoreSplat;
1834            if (std::abs(Dist) > NumLanes / 2)
1835              return LookAheadHeuristics::ScoreSameOpcode;
1836            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1837                              : LookAheadHeuristics::ScoreReversedExtracts;
1838          }
1839          return LookAheadHeuristics::ScoreAltOpcodes;
1840        }
1841 return CheckSameEntryOrFail();
1842 }
1843
1844 auto *I1 = dyn_cast<Instruction>(V1);
1845 auto *I2 = dyn_cast<Instruction>(V2);
1846 if (I1 && I2) {
1847 if (I1->getParent() != I2->getParent())
1848 return CheckSameEntryOrFail();
1849 SmallVector<Value *, 4> Ops(MainAltOps);
1850 Ops.push_back(I1);
1851 Ops.push_back(I2);
1852 InstructionsState S = getSameOpcode(Ops, TLI);
1853 // Note: Only consider instructions with <= 2 operands to avoid
1854 // complexity explosion.
1855 if (S &&
1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1857 !S.isAltShuffle()) &&
1858 all_of(Ops, [&S](Value *V) {
1859 return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1862 }))
1863 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1864                                  : LookAheadHeuristics::ScoreSameOpcode;
1865      }
1866
1867 if (I1 && isa<PoisonValue>(V2))
1868        return LookAheadHeuristics::ScoreSameOpcode;
1869
1870 if (isa<UndefValue>(V2))
1871        return LookAheadHeuristics::ScoreUndef;
1872
1873 return CheckSameEntryOrFail();
1874 }
1875
1876 /// Go through the operands of \p LHS and \p RHS recursively until
1877    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1879 /// of \p U1 and \p U2), except at the beginning of the recursion where
1880 /// these are set to nullptr.
1881 ///
1882 /// For example:
1883 /// \verbatim
1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1885 /// \ / \ / \ / \ /
1886 /// + + + +
1887 /// G1 G2 G3 G4
1888 /// \endverbatim
1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1890 /// each level recursively, accumulating the score. It starts from matching
1891 /// the additions at level 0, then moves on to the loads (level 1). The
1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1895 /// Please note that the order of the operands does not matter, as we
1896 /// evaluate the score of all profitable combinations of operands. In
1897 /// other words the score of G1 and G4 is the same as G1 and G2. This
1898 /// heuristic is based on ideas described in:
1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1901 /// Luís F. W. Góes
1902    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1903                           Instruction *U2, int CurrLevel,
1904 ArrayRef<Value *> MainAltOps) const {
1905
1906 // Get the shallow score of V1 and V2.
1907 int ShallowScoreAtThisLevel =
1908 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1909
1910 // If reached MaxLevel,
1911      // or if LHS and RHS are not instructions,
1912      // or if they are SPLAT,
1913      // or if they are not consecutive,
1914      // or if it is profitable to vectorize loads or extractelements, early
1915      // return the current cost.
1916 auto *I1 = dyn_cast<Instruction>(LHS);
1917 auto *I2 = dyn_cast<Instruction>(RHS);
1918 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1919 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924 return ShallowScoreAtThisLevel;
1925 assert(I1 && I2 && "Should have early exited.");
1926
1927 // Contains the I2 operand indexes that got matched with I1 operands.
1928 SmallSet<unsigned, 4> Op2Used;
1929
1930 // Recursion towards the operands of I1 and I2. We are trying all possible
1931 // operand pairs, and keeping track of the best score.
1932 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934        // Try to pair the operand at OpIdx1 of I1 with the best operand of I2.
1935 int MaxTmpScore = 0;
1936 unsigned MaxOpIdx2 = 0;
1937 bool FoundBest = false;
1938 // If I2 is commutative try all combinations.
1939 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1940 unsigned ToIdx = isCommutative(I2)
1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943 assert(FromIdx <= ToIdx && "Bad index");
1944 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945 // Skip operands already paired with OpIdx1.
1946 if (Op2Used.count(OpIdx2))
1947 continue;
1948 // Recursively calculate the cost at each level
1949 int TmpScore =
1950 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1951 I1, I2, CurrLevel + 1, {});
1952 // Look for the best score.
1953 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1956 MaxOpIdx2 = OpIdx2;
1957 FoundBest = true;
1958 }
1959 }
1960 if (FoundBest) {
1961 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1962 Op2Used.insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1964 }
1965 }
1966 return ShallowScoreAtThisLevel;
1967 }
1968 };
1969 /// A helper data structure to hold the operands of a vector of instructions.
1970 /// This supports a fixed vector length for all operand vectors.
1971  class VLOperands {
1972    /// For each operand we need (i) the value, and (ii) the opcode that it
1973 /// would be attached to if the expression was in a left-linearized form.
1974 /// This is required to avoid illegal operand reordering.
1975 /// For example:
1976 /// \verbatim
1977 /// 0 Op1
1978 /// |/
1979 /// Op1 Op2 Linearized + Op2
1980 /// \ / ----------> |/
1981 /// - -
1982 ///
1983 /// Op1 - Op2 (0 + Op1) - Op2
1984 /// \endverbatim
1985 ///
1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1987 ///
1988 /// Another way to think of this is to track all the operations across the
1989 /// path from the operand all the way to the root of the tree and to
1990 /// calculate the operation that corresponds to this path. For example, the
1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1992 /// corresponding operation is a '-' (which matches the one in the
1993 /// linearized tree, as shown above).
1994 ///
1995 /// For lack of a better term, we refer to this operation as Accumulated
1996 /// Path Operation (APO).
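    /// Concrete illustration (added note): for a lane computing A = B - C,
    /// operand B (index 0) gets APO = false and operand C (index 1) gets
    /// APO = true, because C sits on the inverse ('-') side of the
    /// left-linearized form.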
1997 struct OperandData {
1998 OperandData() = default;
1999 OperandData(Value *V, bool APO, bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value.
2002 Value *V = nullptr;
2003 /// TreeEntries only allow a single opcode, or an alternate sequence of
2004      /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2007 /// (e.g., Add/Mul)
2008 bool APO = false;
2009 /// Helper data for the reordering function.
2010 bool IsUsed = false;
2011 };
2012
2013 /// During operand reordering, we are trying to select the operand at lane
2014 /// that matches best with the operand at the neighboring lane. Our
2015 /// selection is based on the type of value we are looking for. For example,
2016 /// if the neighboring lane has a load, we need to look for a load that is
2017 /// accessing a consecutive address. These strategies are summarized in the
2018 /// 'ReorderingMode' enumerator.
2019 enum class ReorderingMode {
2020 Load, ///< Matching loads to consecutive memory addresses
2021 Opcode, ///< Matching instructions based on opcode (same or alternate)
2022 Constant, ///< Matching constants
2023 Splat, ///< Matching the same instruction multiple times (broadcast)
2024 Failed, ///< We failed to create a vectorizable group
2025 };
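    // Illustrative note: if the operand of the first visited lane is a
    // LoadInst, the strategy for that operand index becomes
    // ReorderingMode::Load, and candidates in the remaining lanes are scored
    // by how close they come to forming consecutive memory accesses.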
2026
2027    using OperandDataVec = SmallVector<OperandData, 2>;
2028
2029    /// A vector of operand vectors.
2030    SmallVector<OperandDataVec, 4> OpsVec;
2031    /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2033 unsigned ArgSize = 0;
2034
2035 const TargetLibraryInfo &TLI;
2036 const DataLayout &DL;
2037 ScalarEvolution &SE;
2038 const BoUpSLP &R;
2039 const Loop *L = nullptr;
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane.
2042 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2043 return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2047 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2048 return OpsVec[OpIdx][Lane];
2049 }
2050
2051 /// Clears the used flag for all entries.
2052 void clearUsed() {
2053 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2056 ++Lane)
2057 OpsVec[OpIdx][Lane].IsUsed = false;
2058 }
2059
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2061 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2062 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2063 }
2064
2065 /// \param Lane lane of the operands under analysis.
2066    /// \param OpIdx operand index in lane \p Lane for which we're looking for
2067    /// the best candidate.
2068    /// \param Idx operand index of the current candidate value.
2069    /// \returns The additional score due to possible broadcasting of the
2070    /// elements in the lane. It is more profitable to have a power-of-2 number
2071    /// of unique elements in the lane; they will be vectorized with higher
2072    /// probability after removing duplicates. Currently the SLP vectorizer
2073    /// supports only vectorization of a power-of-2 number of unique scalars.
2074 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2075 const SmallBitVector &UsedLanes) const {
2076 Value *IdxLaneV = getData(Idx, Lane).V;
2077 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2079 return 0;
2080      SmallDenseMap<Value *, unsigned, 4> Uniques;
2081      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2082 if (Ln == Lane)
2083 continue;
2084 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085 if (!isa<Instruction>(OpIdxLnV))
2086 return 0;
2087 Uniques.try_emplace(OpIdxLnV, Ln);
2088 }
2089 unsigned UniquesCount = Uniques.size();
2090 auto IdxIt = Uniques.find(IdxLaneV);
2091 unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2093 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2095 unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2097 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2098 return 0;
2099 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2102 bit_floor(UniquesCntWithOpIdxLaneV)) -
2103 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2105 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2106 }
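    // Worked example (illustrative): if the other lanes contribute 3 unique
    // instruction operands, the current operand is already among them (count
    // stays 3) and the candidate is new (count becomes 4, a power of two),
    // the result is
    //   min(bit_ceil(3) - 3, 3 - bit_floor(3)) - (bit_ceil(4) - 4) == 1 - 0 == 1,
    // a small bonus for the candidate that yields a power-of-2 number of
    // unique scalars.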
2107
2108 /// \param Lane lane of the operands under analysis.
2109    /// \param OpIdx operand index in lane \p Lane for which we're looking for
2110    /// the best candidate.
2111 /// \param Idx operand index of the current candidate value.
2112 /// \returns The additional score for the scalar which users are all
2113 /// vectorized.
2114 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2115 Value *IdxLaneV = getData(Idx, Lane).V;
2116 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117      // Do not care about the number of uses for vector-like instructions
2118      // (extractelement/extractvalue with constant indices), since they are
2119      // extracts themselves and already externally used. Vectorization of such
2120      // instructions does not add an extra extractelement instruction, it may
2121      // only remove one.
2122 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2123 isVectorLikeInstWithConstOps(OpIdxLaneV))
2124        return LookAheadHeuristics::ScoreAllUserVectorized;
2125      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2127 return 0;
2128 return R.areAllUsersVectorized(IdxLaneI)
2129                 ? LookAheadHeuristics::ScoreAllUserVectorized
2130                 : 0;
2131 }
2132
2133 /// Score scaling factor for fully compatible instructions but with
2134 /// different number of external uses. Allows better selection of the
2135 /// instructions with less external uses.
2136 static const int ScoreScaleFactor = 10;
2137
2138 /// \Returns the look-ahead score, which tells us how much the sub-trees
2139 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2140 /// score. This helps break ties in an informed way when we cannot decide on
2141 /// the order of the operands by just considering the immediate
2142 /// predecessors.
2143 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2144 int Lane, unsigned OpIdx, unsigned Idx,
2145 bool &IsUsed, const SmallBitVector &UsedLanes) {
2146 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2147                                    LookAheadMaxDepth);
2148      // Keep track of the instruction stack as we recurse into the operands
2149 // during the look-ahead score exploration.
2150 int Score =
2151 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2152 /*CurrLevel=*/1, MainAltOps);
2153 if (Score) {
2154 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2155 if (Score <= -SplatScore) {
2156 // Failed score.
2157 Score = 0;
2158 } else {
2159 Score += SplatScore;
2160 // Scale score to see the difference between different operands
2161 // and similar operands but all vectorized/not all vectorized
2162 // uses. It does not affect actual selection of the best
2163 // compatible operand in general, just allows to select the
2164 // operand with all vectorized uses.
2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx, Idx);
2167 IsUsed = true;
2168 }
2169 }
2170 return Score;
2171 }
2172
2173 /// Best defined scores per lanes between the passes. Used to choose the
2174 /// best operand (with the highest score) between the passes.
2175 /// The key - {Operand Index, Lane}.
2176 /// The value - the best score between the passes for the lane and the
2177 /// operand.
2178    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2179        BestScoresPerLanes;
2180
2181 // Search all operands in Ops[*][Lane] for the one that matches best
2182    // Ops[OpIdx][LastLane] and return its operand index.
2183 // If no good match can be found, return std::nullopt.
2184 std::optional<unsigned>
2185 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2186 ArrayRef<ReorderingMode> ReorderingModes,
2187 ArrayRef<Value *> MainAltOps,
2188 const SmallBitVector &UsedLanes) {
2189 unsigned NumOperands = getNumOperands();
2190
2191 // The operand of the previous lane at OpIdx.
2192 Value *OpLastLane = getData(OpIdx, LastLane).V;
2193
2194 // Our strategy mode for OpIdx.
2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196 if (RMode == ReorderingMode::Failed)
2197 return std::nullopt;
2198
2199 // The linearized opcode of the operand at OpIdx, Lane.
2200 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2201
2202 // The best operand index and its score.
2203 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2204 // are using the score to differentiate between the two.
2205 struct BestOpData {
2206 std::optional<unsigned> Idx;
2207 unsigned Score = 0;
2208 } BestOp;
2209 BestOp.Score =
2210 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2211 .first->second;
2212
2213 // Track if the operand must be marked as used. If the operand is set to
2214      // Score 1 explicitly (because of a non-power-of-2 number of unique scalars,
2215      // we may want to reestimate the operands again on the following iterations).
2216 bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219 // Iterate through all unused operands and look for the best.
2220 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2221 // Get the operand at Idx and Lane.
2222 OperandData &OpData = getData(Idx, Lane);
2223 Value *Op = OpData.V;
2224 bool OpAPO = OpData.APO;
2225
2226 // Skip already selected operands.
2227 if (OpData.IsUsed)
2228 continue;
2229
2230 // Skip if we are trying to move the operand to a position with a
2231 // different opcode in the linearized tree form. This would break the
2232 // semantics.
2233 if (OpAPO != OpIdxAPO)
2234 continue;
2235
2236 // Look for an operand that matches the current mode.
2237 switch (RMode) {
2238 case ReorderingMode::Load:
2239 case ReorderingMode::Opcode: {
2240 bool LeftToRight = Lane > LastLane;
2241 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2242 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2243 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx, Idx, IsUsed, UsedLanes);
2245 if (Score > static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2247 Idx == OpIdx)) {
2248 BestOp.Idx = Idx;
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2251 }
2252 break;
2253 }
2254 case ReorderingMode::Constant:
2255 if (isa<Constant>(Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2257 BestOp.Idx = Idx;
2258 if (isa<Constant>(Op)) {
2259              BestOp.Score = LookAheadHeuristics::ScoreConstants;
2260              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2261                  LookAheadHeuristics::ScoreConstants;
2262            }
2263 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2264 IsUsed = false;
2265 }
2266 break;
2267 case ReorderingMode::Splat:
2268 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2269 IsUsed = Op == OpLastLane;
2270 if (Op == OpLastLane) {
2271 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2273                  LookAheadHeuristics::ScoreSplat;
2274            }
2275 BestOp.Idx = Idx;
2276 }
2277 break;
2278 case ReorderingMode::Failed:
2279 llvm_unreachable("Not expected Failed reordering mode.");
2280 }
2281 }
2282
2283 if (BestOp.Idx) {
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2285 return BestOp.Idx;
2286 }
2287 // If we could not find a good match return std::nullopt.
2288 return std::nullopt;
2289 }
2290
2291 /// Helper for reorderOperandVecs.
2292 /// \returns the lane that we should start reordering from. This is the one
2293    /// which has the least number of operands that can freely move about, or is
2294    /// least profitable because it already has the most optimal set of operands.
2295 unsigned getBestLaneToStartReordering() const {
2296 unsigned Min = UINT_MAX;
2297 unsigned SameOpNumber = 0;
2298 // std::pair<unsigned, unsigned> is used to implement a simple voting
2299 // algorithm and choose the lane with the least number of operands that
2300      // can freely move about, or is least profitable because it already has the
2301 // most optimal set of operands. The first unsigned is a counter for
2302 // voting, the second unsigned is the counter of lanes with instructions
2303 // with same/alternate opcodes and same parent basic block.
2304      SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2305      // Try to be closer to the original results, if we have multiple lanes
2306 // with same cost. If 2 lanes have the same cost, use the one with the
2307 // highest index.
2308 for (int I = getNumLanes(); I > 0; --I) {
2309 unsigned Lane = I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312 // Compare the number of operands that can move and choose the one with
2313 // the least number.
2314 if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2317 HashMap.clear();
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321 // Select the most optimal lane in terms of number of operands that
2322 // should be moved around.
2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327 auto [It, Inserted] =
2328 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2329 if (!Inserted)
2330 ++It->second.first;
2331 }
2332 }
2333 // Select the lane with the minimum counter.
2334 unsigned BestLane = 0;
2335 unsigned CntMin = UINT_MAX;
2336 for (const auto &Data : reverse(HashMap)) {
2337 if (Data.second.first < CntMin) {
2338 CntMin = Data.second.first;
2339 BestLane = Data.second.second;
2340 }
2341 }
2342 return BestLane;
2343 }
2344
2345 /// Data structure that helps to reorder operands.
2346 struct OperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be
2348 /// reordered.
2349 unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and
2351 /// parent.
2352 unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering.
2354 /// Used to count operands, actually their position id and opcode
2355 /// value. It is used in the voting mechanism to find the lane with the
2356 /// least number of operands that can freely move about or less profitable
2357 /// because it already has the most optimal set of operands. Can be
2358 /// replaced with SmallVector<unsigned> instead but hash code is faster
2359 /// and requires less memory.
2360 unsigned Hash = 0;
2361 };
2362 /// \returns the maximum number of operands that are allowed to be reordered
2363    /// for \p Lane and the number of compatible instructions (with the same
2364 /// parent/opcode). This is used as a heuristic for selecting the first lane
2365 /// to start operand reordering.
2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2367 unsigned CntTrue = 0;
2368 unsigned NumOperands = getNumOperands();
2369 // Operands with the same APO can be reordered. We therefore need to count
2370 // how many of them we have for each APO, like this: Cnt[APO] = x.
2371 // Since we only have two APOs, namely true and false, we can avoid using
2372 // a map. Instead we can simply count the number of operands that
2373 // correspond to one of them (in this case the 'true' APO), and calculate
2374 // the other by subtracting it from the total number of operands.
2375 // Operands with the same instruction opcode and parent are more
2376 // profitable since we don't need to move them in many cases, with a high
2377 // probability such lane already can be vectorized effectively.
2378 bool AllUndefs = true;
2379 unsigned NumOpsWithSameOpcodeParent = 0;
2380 Instruction *OpcodeI = nullptr;
2381 BasicBlock *Parent = nullptr;
2382 unsigned Hash = 0;
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 const OperandData &OpData = getData(OpIdx, Lane);
2385 if (OpData.APO)
2386 ++CntTrue;
2387 // Use Boyer-Moore majority voting for finding the majority opcode and
2388 // the number of times it occurs.
2389 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2390 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2391 I->getParent() != Parent) {
2392 if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2394 OpcodeI = I;
2395 Parent = I->getParent();
2396 } else {
2397 --NumOpsWithSameOpcodeParent;
2398 }
2399 } else {
2400 ++NumOpsWithSameOpcodeParent;
2401 }
2402 }
2403 Hash = hash_combine(
2404 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2406 }
2407 if (AllUndefs)
2408 return {};
2409 OperandsOrderData Data;
2410 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2412 Data.Hash = Hash;
2413 return Data;
2414 }
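    // Worked example (illustrative): in a lane computing a 'sub', operand 0
    // has APO == false and operand 1 has APO == true, so CntTrue == 1 and
    // NumOfAPOs == std::max(1u, 2u - 1u) == 1; in an 'add' lane both APOs are
    // false, giving NumOfAPOs == 2. The lane with the smaller value is
    // preferred as the starting lane for reordering.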
2415
2416 /// Go through the instructions in VL and append their operands.
2417 void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
2418 assert(!VL.empty() && "Bad VL");
2419 assert((empty() || VL.size() == getNumLanes()) &&
2420 "Expected same number of lanes");
2421 assert(S.valid() && "InstructionsState is invalid.");
2422 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2423 // arguments to the intrinsic produces the same result.
2424 constexpr unsigned IntrinsicNumOperands = 2;
2425 Instruction *MainOp = S.getMainOp();
2426 unsigned NumOperands = MainOp->getNumOperands();
2427 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2428 OpsVec.resize(NumOperands);
2429 unsigned NumLanes = VL.size();
2430 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2431 OpsVec[OpIdx].resize(NumLanes);
2432 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2433 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2434 "Expected instruction or poison value");
2435 // Our tree has just 3 nodes: the root and two operands.
2436 // It is therefore trivial to get the APO. We only need to check the
2437 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2438 // RHS operand. The LHS operand of both add and sub is never attached
2439          // to an inverse operation in the linearized form, therefore its APO
2440 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2441
2442 // Since operand reordering is performed on groups of commutative
2443 // operations or alternating sequences (e.g., +, -), we can safely
2444 // tell the inverse operations by checking commutativity.
2445 if (isa<PoisonValue>(VL[Lane])) {
2446 if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2447 if (OpIdx == 0) {
2448 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
2449 continue;
2450 }
2451 } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2452 if (OpIdx == 0) {
2453 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
2454 continue;
2455 }
2456 }
2457 OpsVec[OpIdx][Lane] = {
2458 PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
2459 false};
2460 continue;
2461 }
2462 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2463 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2464 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2465 APO, false};
2466 }
2467 }
2468 }
2469
2470 /// \returns the number of operands.
2471 unsigned getNumOperands() const { return ArgSize; }
2472
2473 /// \returns the number of lanes.
2474 unsigned getNumLanes() const { return OpsVec[0].size(); }
2475
2476 /// \returns the operand value at \p OpIdx and \p Lane.
2477 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2478 return getData(OpIdx, Lane).V;
2479 }
2480
2481 /// \returns true if the data structure is empty.
2482 bool empty() const { return OpsVec.empty(); }
2483
2484 /// Clears the data.
2485 void clear() { OpsVec.clear(); }
2486
2487 /// \Returns true if there are enough operands identical to \p Op to fill
2488    /// the whole vector, possibly mixed with constants or loop-invariant values.
2489 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2490 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2491 assert(Op == getValue(OpIdx, Lane) &&
2492 "Op is expected to be getValue(OpIdx, Lane).");
2493 // Small number of loads - try load matching.
2494 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2495 return false;
2496 bool OpAPO = getData(OpIdx, Lane).APO;
2497 bool IsInvariant = L && L->isLoopInvariant(Op);
2498 unsigned Cnt = 0;
2499 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2500 if (Ln == Lane)
2501 continue;
2502 // This is set to true if we found a candidate for broadcast at Lane.
2503 bool FoundCandidate = false;
2504 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2505 OperandData &Data = getData(OpI, Ln);
2506 if (Data.APO != OpAPO || Data.IsUsed)
2507 continue;
2508 Value *OpILane = getValue(OpI, Lane);
2509 bool IsConstantOp = isa<Constant>(OpILane);
2510 // Consider the broadcast candidate if:
2511 // 1. Same value is found in one of the operands.
2512 if (Data.V == Op ||
2513 // 2. The operand in the given lane is not constant but there is a
2514 // constant operand in another lane (which can be moved to the
2515 // given lane). In this case we can represent it as a simple
2516 // permutation of constant and broadcast.
2517 (!IsConstantOp &&
2518 ((Lns > 2 && isa<Constant>(Data.V)) ||
2519 // 2.1. If we have only 2 lanes, need to check that value in the
2520 // next lane does not build same opcode sequence.
2521 (Lns == 2 &&
2522 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2523 isa<Constant>(Data.V)))) ||
2524 // 3. The operand in the current lane is loop invariant (can be
2525 // hoisted out) and another operand is also a loop invariant
2526 // (though not a constant). In this case the whole vector can be
2527 // hoisted out.
2528 // FIXME: need to teach the cost model about this case for better
2529 // estimation.
2530 (IsInvariant && !isa<Constant>(Data.V) &&
2531 !getSameOpcode({Op, Data.V}, TLI) &&
2532 L->isLoopInvariant(Data.V))) {
2533 FoundCandidate = true;
2534 Data.IsUsed = Data.V == Op;
2535 if (Data.V == Op)
2536 ++Cnt;
2537 break;
2538 }
2539 }
2540 if (!FoundCandidate)
2541 return false;
2542 }
2543 return getNumLanes() == 2 || Cnt > 1;
2544 }
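    // Illustrative example: for the bundle
    //   %a0 = add i32 %x, %p0
    //   %a1 = add i32 %x, %p1
    //   %a2 = add i32 %x, %p2
    //   %a3 = add i32 %x, %p3
    // the value %x is found in every other lane, so shouldBroadcast(%x, ...)
    // returns true and the Splat strategy (a broadcast of %x) is used for that
    // operand index.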
2545
2546    /// Checks if there is at least a single compatible operand in lanes other
2547 /// than \p Lane, compatible with the operand \p Op.
2548 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2549 assert(Op == getValue(OpIdx, Lane) &&
2550 "Op is expected to be getValue(OpIdx, Lane).");
2551 bool OpAPO = getData(OpIdx, Lane).APO;
2552 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2553 if (Ln == Lane)
2554 continue;
2555 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2556 const OperandData &Data = getData(OpI, Ln);
2557 if (Data.APO != OpAPO || Data.IsUsed)
2558 return true;
2559 Value *OpILn = getValue(OpI, Ln);
2560 return (L && L->isLoopInvariant(OpILn)) ||
2561 (getSameOpcode({Op, OpILn}, TLI) &&
2562 allSameBlock({Op, OpILn}));
2563 }))
2564 return true;
2565 }
2566 return false;
2567 }
2568
2569 public:
2570 /// Initialize with all the operands of the instruction vector \p RootVL.
2571 VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
2572 const BoUpSLP &R)
2573 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2574 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
2575 // Append all the operands of RootVL.
2576 appendOperandsOfVL(RootVL, S);
2577 }
2578
2579 /// \Returns a value vector with the operands across all lanes for the
2580    /// operand at \p OpIdx.
2581 ValueList getVL(unsigned OpIdx) const {
2582 ValueList OpVL(OpsVec[OpIdx].size());
2583 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2584 "Expected same num of lanes across all operands");
2585 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2586 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2587 return OpVL;
2588 }
2589
2590 // Performs operand reordering for 2 or more operands.
2591 // The original operands are in OrigOps[OpIdx][Lane].
2592 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2593 void reorder() {
2594 unsigned NumOperands = getNumOperands();
2595 unsigned NumLanes = getNumLanes();
2596 // Each operand has its own mode. We are using this mode to help us select
2597 // the instructions for each lane, so that they match best with the ones
2598 // we have selected so far.
2599 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2600
2601 // This is a greedy single-pass algorithm. We are going over each lane
2602 // once and deciding on the best order right away with no back-tracking.
2603 // However, in order to increase its effectiveness, we start with the lane
2604 // that has operands that can move the least. For example, given the
2605 // following lanes:
2606 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2607 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2608 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2609 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2610 // we will start at Lane 1, since the operands of the subtraction cannot
2611 // be reordered. Then we will visit the rest of the lanes in a circular
2612 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2613
2614 // Find the first lane that we will start our search from.
2615 unsigned FirstLane = getBestLaneToStartReordering();
2616
2617 // Initialize the modes.
2618 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2619 Value *OpLane0 = getValue(OpIdx, FirstLane);
2620 // Keep track if we have instructions with all the same opcode on one
2621 // side.
2622 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2623 // Check if OpLane0 should be broadcast.
2624 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2625 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2626 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2627 else if (isa<LoadInst>(OpILane0))
2628 ReorderingModes[OpIdx] = ReorderingMode::Load;
2629 else
2630 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2631 } else if (isa<Constant>(OpLane0)) {
2632 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2633 } else if (isa<Argument>(OpLane0)) {
2634 // Our best hope is a Splat. It may save some cost in some cases.
2635 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2636 } else {
2637 llvm_unreachable("Unexpected value kind.");
2638 }
2639 }
2640
2641 // Check that we don't have same operands. No need to reorder if operands
2642      // are just a perfect diamond or shuffled diamond match. Do not do it only
2643      // for possible broadcasts or a non-power-of-2 number of scalars (just for
2644      // now).
2645 auto &&SkipReordering = [this]() {
2646 SmallPtrSet<Value *, 4> UniqueValues;
2647 ArrayRef<OperandData> Op0 = OpsVec.front();
2648 for (const OperandData &Data : Op0)
2649 UniqueValues.insert(Data.V);
2650        for (ArrayRef<OperandData> Op :
2651             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2652 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2653 return !UniqueValues.contains(Data.V);
2654 }))
2655 return false;
2656 }
2657 // TODO: Check if we can remove a check for non-power-2 number of
2658 // scalars after full support of non-power-2 vectorization.
2659 return UniqueValues.size() != 2 &&
2660 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2661 UniqueValues.size());
2662 };
2663
2664 // If the initial strategy fails for any of the operand indexes, then we
2665 // perform reordering again in a second pass. This helps avoid assigning
2666 // high priority to the failed strategy, and should improve reordering for
2667 // the non-failed operand indexes.
2668 for (int Pass = 0; Pass != 2; ++Pass) {
2669        // Check if there is no need to reorder operands since they are a perfect
2670        // or shuffled diamond match.
2671 // Need to do it to avoid extra external use cost counting for
2672 // shuffled matches, which may cause regressions.
2673 if (SkipReordering())
2674 break;
2675 // Skip the second pass if the first pass did not fail.
2676 bool StrategyFailed = false;
2677 // Mark all operand data as free to use.
2678 clearUsed();
2679 // We keep the original operand order for the FirstLane, so reorder the
2680 // rest of the lanes. We are visiting the nodes in a circular fashion,
2681 // using FirstLane as the center point and increasing the radius
2682 // distance.
2683 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2684 for (unsigned I = 0; I < NumOperands; ++I)
2685 MainAltOps[I].push_back(getData(I, FirstLane).V);
2686
2687 SmallBitVector UsedLanes(NumLanes);
2688 UsedLanes.set(FirstLane);
2689 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2690 // Visit the lane on the right and then the lane on the left.
2691 for (int Direction : {+1, -1}) {
2692 int Lane = FirstLane + Direction * Distance;
2693 if (Lane < 0 || Lane >= (int)NumLanes)
2694 continue;
2695 UsedLanes.set(Lane);
2696 int LastLane = Lane - Direction;
2697 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2698 "Out of bounds");
2699 // Look for a good match for each operand.
2700 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2701 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2702 std::optional<unsigned> BestIdx =
2703 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2704 MainAltOps[OpIdx], UsedLanes);
2705 // By not selecting a value, we allow the operands that follow to
2706 // select a better matching value. We will get a non-null value in
2707 // the next run of getBestOperand().
2708 if (BestIdx) {
2709 // Swap the current operand with the one returned by
2710 // getBestOperand().
2711 swap(OpIdx, *BestIdx, Lane);
2712 } else {
2713 // Enable the second pass.
2714 StrategyFailed = true;
2715 }
2716 // Try to get the alternate opcode and follow it during analysis.
2717 if (MainAltOps[OpIdx].size() != 2) {
2718 OperandData &AltOp = getData(OpIdx, Lane);
2719 InstructionsState OpS =
2720 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2721 if (OpS && OpS.isAltShuffle())
2722 MainAltOps[OpIdx].push_back(AltOp.V);
2723 }
2724 }
2725 }
2726 }
2727 // Skip second pass if the strategy did not fail.
2728 if (!StrategyFailed)
2729 break;
2730 }
2731 }
2732
2733#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2734 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2735 switch (RMode) {
2736 case ReorderingMode::Load:
2737 return "Load";
2738 case ReorderingMode::Opcode:
2739 return "Opcode";
2740 case ReorderingMode::Constant:
2741 return "Constant";
2742 case ReorderingMode::Splat:
2743 return "Splat";
2744 case ReorderingMode::Failed:
2745 return "Failed";
2746 }
2747 llvm_unreachable("Unimplemented Reordering Type");
2748 }
2749
2750 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2751 raw_ostream &OS) {
2752 return OS << getModeStr(RMode);
2753 }
2754
2755 /// Debug print.
2756 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2757 printMode(RMode, dbgs());
2758 }
2759
2760 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2761 return printMode(RMode, OS);
2762 }
2763
2764    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2765      const unsigned Indent = 2;
2766 unsigned Cnt = 0;
2767 for (const OperandDataVec &OpDataVec : OpsVec) {
2768 OS << "Operand " << Cnt++ << "\n";
2769 for (const OperandData &OpData : OpDataVec) {
2770 OS.indent(Indent) << "{";
2771 if (Value *V = OpData.V)
2772 OS << *V;
2773 else
2774 OS << "null";
2775 OS << ", APO:" << OpData.APO << "}\n";
2776 }
2777 OS << "\n";
2778 }
2779 return OS;
2780 }
2781
2782 /// Debug print.
2783 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2784#endif
2785 };
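  // Usage sketch (hypothetical, for illustration only): for a scalar bundle
  // VL with InstructionsState S and a BoUpSLP instance R,
  //   VLOperands Ops(VL, S, R);                      // collect per-lane operands
  //   Ops.reorder();                                 // greedily permute each lane
  //   ValueList Left = Ops.getVL(0), Right = Ops.getVL(1);
  // yields operand vectors whose lanes have been permuted to maximize the
  // look-ahead matching score.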
2786
2787  /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2788  /// of the pair with the highest score, deemed to have the best chance to form
2789  /// the root of a profitable tree to vectorize. Return std::nullopt if no
2790  /// candidate scored above LookAheadHeuristics::ScoreFail.
2791  /// \param Limit Lower limit of the cost, considered to be a good enough score.
2792 std::optional<int>
2793 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2794 int Limit = LookAheadHeuristics::ScoreFail) const {
2795 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2796                                  RootLookAheadMaxDepth);
2797    int BestScore = Limit;
2798 std::optional<int> Index;
2799 for (int I : seq<int>(0, Candidates.size())) {
2800 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2801 Candidates[I].second,
2802 /*U1=*/nullptr, /*U2=*/nullptr,
2803 /*CurrLevel=*/1, {});
2804 if (Score > BestScore) {
2805 BestScore = Score;
2806 Index = I;
2807 }
2808 }
2809 return Index;
2810 }
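  // Usage sketch (hypothetical, for illustration only): given candidate pairs
  // of scalars that could seed a tree,
  //   SmallVector<std::pair<Value *, Value *>> Candidates = /* ... */;
  //   if (std::optional<int> BestIdx = R.findBestRootPair(Candidates))
  //     /* start building the tree from Candidates[*BestIdx] */;
  // the pair whose operand sub-trees match best (e.g. both fed by consecutive
  // loads) gets the highest look-ahead score and its index is returned.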
2811
2812 /// Checks if the instruction is marked for deletion.
2813 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2814
2815 /// Removes an instruction from its block and eventually deletes it.
2816 /// It's like Instruction::eraseFromParent() except that the actual deletion
2817 /// is delayed until BoUpSLP is destructed.
2818  void eraseInstruction(Instruction *I) {
2819    DeletedInstructions.insert(I);
2820 }
2821
2822 /// Remove instructions from the parent function and clear the operands of \p
2823 /// DeadVals instructions, marking for deletion trivially dead operands.
2824 template <typename T>
2825  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2826    SmallVector<WeakTrackingVH> DeadInsts;
2827    for (T *V : DeadVals) {
2828 auto *I = cast<Instruction>(V);
2829 DeletedInstructions.insert(I);
2830 }
2831 DenseSet<Value *> Processed;
2832 for (T *V : DeadVals) {
2833 if (!V || !Processed.insert(V).second)
2834 continue;
2835 auto *I = cast<Instruction>(V);
2837      SmallVector<const TreeEntry *> Entries;
2838      if (const TreeEntry *Entry = getTreeEntry(I)) {
2839 Entries.push_back(Entry);
2840 auto It = MultiNodeScalars.find(I);
2841 if (It != MultiNodeScalars.end())
2842 Entries.append(It->second.begin(), It->second.end());
2843 }
2844 for (Use &U : I->operands()) {
2845 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2846 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2847            isInstructionTriviallyDead(OpI, TLI) &&
2848            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2849 return Entry->VectorizedValue == OpI;
2850 })))
2851 DeadInsts.push_back(OpI);
2852 }
2853 I->dropAllReferences();
2854 }
2855 for (T *V : DeadVals) {
2856 auto *I = cast<Instruction>(V);
2857 if (!I->getParent())
2858 continue;
2859 assert((I->use_empty() || all_of(I->uses(),
2860 [&](Use &U) {
2861 return isDeleted(
2862 cast<Instruction>(U.getUser()));
2863 })) &&
2864 "trying to erase instruction with users.");
2865 I->removeFromParent();
2866 SE->forgetValue(I);
2867 }
2868 // Process the dead instruction list until empty.
2869 while (!DeadInsts.empty()) {
2870 Value *V = DeadInsts.pop_back_val();
2871 Instruction *VI = cast_or_null<Instruction>(V);
2872 if (!VI || !VI->getParent())
2873 continue;
2875 "Live instruction found in dead worklist!");
2876 assert(VI->use_empty() && "Instructions with uses are not dead.");
2877
2878 // Don't lose the debug info while deleting the instructions.
2879 salvageDebugInfo(*VI);
2880
2881 // Null out all of the instruction's operands to see if any operand
2882 // becomes dead as we go.
2883 for (Use &OpU : VI->operands()) {
2884 Value *OpV = OpU.get();
2885 if (!OpV)
2886 continue;
2887 OpU.set(nullptr);
2888
2889 if (!OpV->use_empty())
2890 continue;
2891
2892 // If the operand is an instruction that became dead as we nulled out
2893 // the operand, and if it is 'trivially' dead, delete it in a future
2894 // loop iteration.
2895 if (auto *OpI = dyn_cast<Instruction>(OpV))
2896 if (!DeletedInstructions.contains(OpI) &&
2897              isInstructionTriviallyDead(OpI, TLI))
2898            DeadInsts.push_back(OpI);
2899 }
2900
2901 VI->removeFromParent();
2902 DeletedInstructions.insert(VI);
2903 SE->forgetValue(VI);
2904 }
2905 }
2906
2907  /// Checks if the instruction was already analyzed for being a possible
2908  /// reduction root.
2909  bool isAnalyzedReductionRoot(Instruction *I) const {
2910    return AnalyzedReductionsRoots.count(I);
2911 }
2912  /// Register the given instruction as already analyzed for being a possible
2913  /// reduction root.
2914  void analyzedReductionRoot(Instruction *I) {
2915    AnalyzedReductionsRoots.insert(I);
2916 }
2917 /// Checks if the provided list of reduced values was checked already for
2918 /// vectorization.
2919  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2920    return AnalyzedReductionVals.contains(hash_value(VL));
2921 }
2922  /// Adds the list of reduced values to the list of values already checked for
2923  /// vectorization.
2924  void analyzedReductionVals(ArrayRef<Value *> VL) {
2925    AnalyzedReductionVals.insert(hash_value(VL));
2926 }
2927 /// Clear the list of the analyzed reduction root instructions.
2928  void clearReductionData() {
2929    AnalyzedReductionsRoots.clear();
2930 AnalyzedReductionVals.clear();
2931 AnalyzedMinBWVals.clear();
2932 }
2933 /// Checks if the given value is gathered in one of the nodes.
2934 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2935 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2936 }
2937 /// Checks if the given value is gathered in one of the nodes.
2938 bool isGathered(const Value *V) const {
2939 return MustGather.contains(V);
2940 }
2941  /// Checks if the specified value was not scheduled.
2942 bool isNotScheduled(const Value *V) const {
2943 return NonScheduledFirst.contains(V);
2944 }
2945
2946 /// Check if the value is vectorized in the tree.
2947 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2948
2949 ~BoUpSLP();
2950
2951private:
2952  /// Determine if a node \p E can be demoted to a smaller type with a
2953 /// truncation. We collect the entries that will be demoted in ToDemote.
2954 /// \param E Node for analysis
2955 /// \param ToDemote indices of the nodes to be demoted.
2956 bool collectValuesToDemote(
2957 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2958      SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
2959      const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2960 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2961
2962  /// Check if the operands on the edges \p Edges of the \p UserTE allow
2963  /// reordering (i.e. the operands can be reordered because they have only one
2964  /// user and are reorderable).
2965  /// \param ReorderableGathers List of all gather nodes that require reordering
2966  /// (e.g., gathers of extractelements or partially vectorizable loads).
2967 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2968 /// reordering, subset of \p NonVectorized.
2969 bool
2970 canReorderOperands(TreeEntry *UserTE,
2971 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2972 ArrayRef<TreeEntry *> ReorderableGathers,
2973 SmallVectorImpl<TreeEntry *> &GatherOps);
2974
2975 /// Checks if the given \p TE is a gather node with clustered reused scalars
2976 /// and reorders it per given \p Mask.
2977 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2978
2979 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2980 /// if any. If it is not vectorized (gather node), returns nullptr.
2981 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2982 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2983 TreeEntry *TE = nullptr;
2984 const auto *It = find_if(VL, [&](Value *V) {
2985 TE = getTreeEntry(V);
2986 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2987 return true;
2988 auto It = MultiNodeScalars.find(V);
2989 if (It != MultiNodeScalars.end()) {
2990 for (TreeEntry *E : It->second) {
2991 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2992 TE = E;
2993 return true;
2994 }
2995 }
2996 }
2997 return false;
2998 });
2999 if (It != VL.end()) {
3000 assert(TE->isSame(VL) && "Expected same scalars.");
3001 return TE;
3002 }
3003 return nullptr;
3004 }
3005
3006 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3007 /// if any. If it is not vectorized (gather node), returns nullptr.
3008 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3009 unsigned OpIdx) const {
3010 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
3011 const_cast<TreeEntry *>(UserTE), OpIdx);
3012 }
3013
3014 /// Checks if all users of \p I are the part of the vectorization tree.
3015 bool areAllUsersVectorized(
3016 Instruction *I,
3017 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3018
3019 /// Return information about the vector formed for the specified index
3020 /// of a vector of (the same) instruction.
3021 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3022
3023 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3024 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3025
3026 /// Gets the root instruction for the given node. If the node is a strided
3027 /// load/store node with the reverse order, the root instruction is the last
3028 /// one.
3029 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3030
3031 /// \returns Cast context for the given graph node.
3032 TargetTransformInfo::CastContextHint
3033 getCastContextHint(const TreeEntry &TE) const;
3034
3035 /// \returns the cost of the vectorizable entry.
3036 InstructionCost getEntryCost(const TreeEntry *E,
3037 ArrayRef<Value *> VectorizedVals,
3038 SmallPtrSetImpl<Value *> &CheckedExtracts);
3039
3040 /// This is the recursive part of buildTree.
3041 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3042 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3043
3044 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3045 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3046 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3047 /// returns false, setting \p CurrentOrder to either an empty vector or a
3048 /// non-identity permutation that allows the extract instructions to be reused.
3049 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3050 /// extract order.
3051 bool canReuseExtract(ArrayRef<Value *> VL,
3052 SmallVectorImpl<unsigned> &CurrentOrder,
3053 bool ResizeAllowed = false) const;
3054
3055 /// Vectorize a single entry in the tree.
3056 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed to
3057 /// avoid issues with def-use order.
3058 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3059
3060 /// Returns vectorized operand node, that matches the order of the scalars
3061 /// operand number \p NodeIdx in entry \p E.
3062 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3063 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3064 unsigned NodeIdx) const {
3065 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3066 }
3067
3068 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3069 /// \p E.
3070 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed to
3071 /// avoid issues with def-use order.
3072 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3073
3074 /// Create a new vector from a list of scalar values. Produces a sequence
3075 /// which exploits values reused across lanes, and arranges the inserts
3076 /// for ease of later optimization.
3077 template <typename BVTy, typename ResTy, typename... Args>
3078 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3079
3080 /// Create a new vector from a list of scalar values. Produces a sequence
3081 /// which exploits values reused across lanes, and arranges the inserts
3082 /// for ease of later optimization.
3083 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3084 bool PostponedPHIs);
3085
3086 /// Returns the instruction in the bundle, which can be used as a base point
3087 /// for scheduling. Usually it is the last instruction in the bundle, except
3088 /// for the case when all operands are external (in this case, it is the first
3089 /// instruction in the list).
3090 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3091
3092 /// Tries to find extractelement instructions with constant indices from a fixed
3093 /// vector type and gathers such instructions into a group, which can most
3094 /// likely be matched as a shuffle of 1 or 2 input vectors. If this attempt
3095 /// was successful, the matched scalars are replaced by poison values in \p VL
3096 /// for future analysis.
3097 std::optional<TargetTransformInfo::ShuffleKind>
3098 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3099 SmallVectorImpl<int> &Mask) const;
3100
3101 /// Tries to find extractelement instructions with constant indices from a fixed
3102 /// vector type and gathers such instructions into a group, which can most
3103 /// likely be matched as a shuffle of 1 or 2 input vectors. If this attempt
3104 /// was successful, the matched scalars are replaced by poison values in \p VL
3105 /// for future analysis.
3106 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3107 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3109 unsigned NumParts) const;
3110
3111 /// Checks if the gathered \p VL can be represented as a single register
3112 /// shuffle(s) of previous tree entries.
3113 /// \param TE Tree entry checked for permutation.
3114 /// \param VL List of scalars (a subset of the TE scalars), checked for
3115 /// permutations. Must form a single-register vector.
3116 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3117 /// requests building the mask from the original vector values, without
3118 /// relying on the potential reordering.
3119 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3120 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3121 std::optional<TargetTransformInfo::ShuffleKind>
3122 isGatherShuffledSingleRegisterEntry(
3123 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3124 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3125 bool ForOrder);
3126
3127 /// Checks if the gathered \p VL can be represented as multi-register
3128 /// shuffle(s) of previous tree entries.
3129 /// \param TE Tree entry checked for permutation.
3130 /// \param VL List of scalars (a subset of the TE scalars), checked for
3131 /// permutations.
3132 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3133 /// requests building the mask from the original vector values, without
3134 /// relying on the potential reordering.
3135 /// \returns per-register series of ShuffleKind, if gathered values can be
3136 /// represented as shuffles of previous tree entries. \p Mask is filled with
3137 /// the shuffle mask (also on per-register base).
3138 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3139 isGatherShuffledEntry(
3140 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3141 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3142 unsigned NumParts, bool ForOrder = false);
3143
3144 /// \returns the cost of gathering (inserting) the values in \p VL into a
3145 /// vector.
3146 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3147 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3148 Type *ScalarTy) const;
3149
3150 /// Set the Builder insert point to one after the last instruction in
3151 /// the bundle
3152 void setInsertPointAfterBundle(const TreeEntry *E);
3153
3154 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3155 /// specified, the starting vector value is poison.
3156 Value *
3157 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3158 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3159
3160 /// \returns whether the VectorizableTree is fully vectorizable and will
3161 /// be beneficial even if the tree height is tiny.
3162 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3163
3164 /// Run through the list of all gathered loads in the graph and try to find
3165 /// vector loads/masked gathers instead of regular gathers. Later these loads
3166 /// are reshuffled to build the final gathered nodes.
3167 void tryToVectorizeGatheredLoads(
3168 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3169 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3170 8> &GatheredLoads);
3171
3172 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3173 /// users of \p TE and collects the stores. It returns the map from the store
3174 /// pointers to the collected stores.
3175 DenseMap<Value *, SmallVector<StoreInst *>>
3176 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3177
3178 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3179 /// stores in \p StoresVec can form a vector instruction. If so it returns
3180 /// true and populates \p ReorderIndices with the shuffle indices of the
3181 /// stores when compared to the sorted vector.
3182 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3183 OrdersType &ReorderIndices) const;
3184
3185 /// Iterates through the users of \p TE, looking for scalar stores that can be
3186 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3187 /// their order and builds an order index vector for each store bundle. It
3188 /// returns all these order vectors found.
3189 /// We run this after the tree has formed, otherwise we may come across user
3190 /// instructions that are not yet in the tree.
3191 SmallVector<OrdersType, 1>
3192 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3193
3194 /// Tries to reorder the gathering node for better vectorization
3195 /// opportunities.
3196 void reorderGatherNode(TreeEntry &TE);
3197
3198 struct TreeEntry {
3199 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3200 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3201
3202 /// \returns Common mask for reorder indices and reused scalars.
3203 SmallVector<int> getCommonMask() const {
3204 SmallVector<int> Mask;
3205 inversePermutation(ReorderIndices, Mask);
3206 ::addMask(Mask, ReuseShuffleIndices);
3207 return Mask;
3208 }
3209
3210 /// \returns true if the scalars in VL are equal to this entry.
3211 bool isSame(ArrayRef<Value *> VL) const {
3212 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3213 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3214 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3215 return VL.size() == Mask.size() &&
3216 std::equal(VL.begin(), VL.end(), Mask.begin(),
3217 [Scalars](Value *V, int Idx) {
3218 return (isa<UndefValue>(V) &&
3219 Idx == PoisonMaskElem) ||
3220 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3221 });
3222 };
3223 if (!ReorderIndices.empty()) {
3224 // TODO: implement matching if the nodes are just reordered, still can
3225 // treat the vector as the same if the list of scalars matches VL
3226 // directly, without reordering.
3227 SmallVector<int> Mask;
3228 inversePermutation(ReorderIndices, Mask);
3229 if (VL.size() == Scalars.size())
3230 return IsSame(Scalars, Mask);
3231 if (VL.size() == ReuseShuffleIndices.size()) {
3232 ::addMask(Mask, ReuseShuffleIndices);
3233 return IsSame(Scalars, Mask);
3234 }
3235 return false;
3236 }
3237 return IsSame(Scalars, ReuseShuffleIndices);
3238 }
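// Editorial illustration (not part of the original source): a worked example
// of the ReuseShuffleIndices path above, using hypothetical scalars %a and %b.
// With Scalars = {%a, %b} and ReuseShuffleIndices = {0, 1, 0, 1}, the entry
// stands for the 4-wide vector {%a, %b, %a, %b}; isSame({%a, %b, %a, %b})
// compares each VL element against Scalars[Idx] for the corresponding mask
// index and returns true, while isSame({%b, %a, %b, %a}) returns false.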
3239
3240 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3241 return isGather() && !UserTreeIndices.empty() &&
3242 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3243 UserTreeIndices.front().UserTE == UserEI.UserTE;
3244 }
3245
3246 /// \returns true if current entry has same operands as \p TE.
3247 bool hasEqualOperands(const TreeEntry &TE) const {
3248 if (TE.getNumOperands() != getNumOperands())
3249 return false;
3250 SmallBitVector Used(getNumOperands());
3251 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3252 unsigned PrevCount = Used.count();
3253 for (unsigned K = 0; K < E; ++K) {
3254 if (Used.test(K))
3255 continue;
3256 if (getOperand(K) == TE.getOperand(I)) {
3257 Used.set(K);
3258 break;
3259 }
3260 }
3261 // Check if we actually found the matching operand.
3262 if (PrevCount == Used.count())
3263 return false;
3264 }
3265 return true;
3266 }
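// Editorial illustration (not part of the original source): hasEqualOperands()
// matches operands as an unordered collection. If this entry's operands are
// {OpA, OpB} and \p TE's operands are {OpB, OpA}, each operand of \p TE is
// matched against a distinct, not-yet-used operand of this entry, so the
// function returns true; a duplicated operand on one side without a matching
// duplicate on the other makes it return false.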
3267
3268 /// \return Final vectorization factor for the node. Defined by the total
3269 /// number of vectorized scalars, including those used several times in the
3270 /// entry and counted in the \a ReuseShuffleIndices, if any.
3271 unsigned getVectorFactor() const {
3272 if (!ReuseShuffleIndices.empty())
3273 return ReuseShuffleIndices.size();
3274 return Scalars.size();
3275 };
3276
3277 /// Checks if the current node is a gather node.
3278 bool isGather() const { return State == NeedToGather; }
3279
3280 /// A vector of scalars.
3281 ValueList Scalars;
3282
3283 /// The Scalars are vectorized into this value. It is initialized to Null.
3284 WeakTrackingVH VectorizedValue = nullptr;
3285
3286 /// New vector phi instructions emitted for the vectorized phi nodes.
3287 PHINode *PHI = nullptr;
3288
3289 /// Do we need to gather this sequence or vectorize it
3290 /// (either with vector instruction or with scatter/gather
3291 /// intrinsics for store/load)?
3292 enum EntryState {
3293 Vectorize, ///< The node is regularly vectorized.
3294 ScatterVectorize, ///< Masked scatter/gather node.
3295 StridedVectorize, ///< Strided loads (and stores)
3296 NeedToGather, ///< Gather/buildvector node.
3297 CombinedVectorize, ///< Vectorized node, combined with its user into more
3298 ///< complex node like select/cmp to minmax, mul/add to
3299 ///< fma, etc. Must be used for the following nodes in
3300 ///< the pattern, not the very first one.
3301 };
3302 EntryState State;
3303
3304 /// List of combined opcodes supported by the vectorizer.
3305 enum CombinedOpcode {
3306 NotCombinedOp = -1,
3307 MinMax = Instruction::OtherOpsEnd + 1,
3308 };
3309 CombinedOpcode CombinedOp = NotCombinedOp;
3310
3311 /// Does this sequence require some shuffling?
3312 SmallVector<int, 4> ReuseShuffleIndices;
3313
3314 /// Does this entry require reordering?
3315 SmallVector<unsigned, 4> ReorderIndices;
3316
3317 /// Points back to the VectorizableTree.
3318 ///
3319 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3320 /// to be a pointer and needs to be able to initialize the child iterator.
3321 /// Thus we need a reference back to the container to translate the indices
3322 /// to entries.
3323 VecTreeTy &Container;
3324
3325 /// The TreeEntry index containing the user of this entry. We can actually
3326 /// have multiple users so the data structure is not truly a tree.
3327 SmallVector<EdgeInfo, 1> UserTreeIndices;
3328
3329 /// The index of this treeEntry in VectorizableTree.
3330 unsigned Idx = 0;
3331
3332 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3333 /// other nodes as a series of insertvector instructions.
3334 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3335
3336 private:
3337 /// The operands of each instruction in each lane Operands[op_index][lane].
3338 /// Note: This helps avoid the replication of the code that performs the
3339 /// reordering of operands during buildTree_rec() and vectorizeTree().
3340 SmallVector<ValueList, 2> Operands;
3341
3342 /// MainOp and AltOp are recorded inside. S should be obtained from
3343 /// newTreeEntry.
3344 InstructionsState S = InstructionsState::invalid();
3345
3346 /// Interleaving factor for interleaved-load Vectorize nodes.
3347 unsigned InterleaveFactor = 0;
3348
3349 public:
3350 /// Returns interleave factor for interleave nodes.
3351 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3352 /// Sets interleaving factor for the interleaving nodes.
3353 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3354
3355 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3356 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3357 if (Operands.size() < OpIdx + 1)
3358 Operands.resize(OpIdx + 1);
3359 assert(Operands[OpIdx].empty() && "Already resized?");
3360 assert(OpVL.size() <= Scalars.size() &&
3361 "Number of operands is greater than the number of scalars.");
3362 Operands[OpIdx].resize(OpVL.size());
3363 copy(OpVL, Operands[OpIdx].begin());
3364 }
3365
3366 /// Set this bundle's operand from Scalars.
3367 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3368 VLOperands Ops(Scalars, S, R);
3369 if (RequireReorder)
3370 Ops.reorder();
3371 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
3372 setOperand(I, Ops.getVL(I));
3373 }
3374
3375 /// Reorders operands of the node to the given mask \p Mask.
3376 void reorderOperands(ArrayRef<int> Mask) {
3377 for (ValueList &Operand : Operands)
3378 reorderScalars(Operand, Mask);
3379 }
3380
3381 /// \returns the \p OpIdx operand of this TreeEntry.
3382 ValueList &getOperand(unsigned OpIdx) {
3383 assert(OpIdx < Operands.size() && "Off bounds");
3384 return Operands[OpIdx];
3385 }
3386
3387 /// \returns the \p OpIdx operand of this TreeEntry.
3388 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3389 assert(OpIdx < Operands.size() && "Off bounds");
3390 return Operands[OpIdx];
3391 }
3392
3393 /// \returns the number of operands.
3394 unsigned getNumOperands() const { return Operands.size(); }
3395
3396 /// \return the single \p OpIdx operand.
3397 Value *getSingleOperand(unsigned OpIdx) const {
3398 assert(OpIdx < Operands.size() && "Off bounds");
3399 assert(!Operands[OpIdx].empty() && "No operand available");
3400 return Operands[OpIdx][0];
3401 }
3402
3403 /// Some of the instructions in the list have alternate opcodes.
3404 bool isAltShuffle() const { return S.isAltShuffle(); }
3405
3406 bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
3407
3408 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3409 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
3410 /// key is the main operation.
3411 Value *isOneOf(Value *Op) const {
3412 auto *I = dyn_cast<Instruction>(Op);
3413 if (I && isOpcodeOrAlt(I))
3414 return Op;
3415 return S.getMainOp();
3416 }
3417
3418 void setOperations(const InstructionsState &S) {
3419 assert(S && "InstructionsState is invalid.");
3420 this->S = S;
3421 }
3422
3423 Instruction *getMainOp() const { return S.getMainOp(); }
3424
3425 Instruction *getAltOp() const { return S.getAltOp(); }
3426
3427 /// The main/alternate opcodes for the list of instructions.
3428 unsigned getOpcode() const { return S.getOpcode(); }
3429
3430 unsigned getAltOpcode() const { return S.getAltOpcode(); }
3431
3432 bool hasState() const { return S.valid(); }
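// Editorial illustration (not part of the original source): for an
// alternate-opcode bundle such as the hypothetical scalars
//   %x0 = add i32 %a0, %b0
//   %x1 = sub i32 %a1, %b1
//   %x2 = add i32 %a2, %b2
//   %x3 = sub i32 %a3, %b3
// getOpcode() is Instruction::Add, getAltOpcode() is Instruction::Sub and
// isAltShuffle() is true. Such a node is typically lowered as one vector add,
// one vector sub and a shufflevector selecting the matching lanes (see
// buildAltOpShuffleMask() below).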
3433
3434 /// When ReuseShuffleIndices is empty it just returns the position of \p V within
3435 /// the vector of Scalars. Otherwise, tries to remap it through its reuse index.
3436 int findLaneForValue(Value *V) const {
3437 unsigned FoundLane = getVectorFactor();
3438 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3439 std::advance(It, 1)) {
3440 if (*It != V)
3441 continue;
3442 FoundLane = std::distance(Scalars.begin(), It);
3443 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3444 if (!ReorderIndices.empty())
3445 FoundLane = ReorderIndices[FoundLane];
3446 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3447 if (ReuseShuffleIndices.empty())
3448 break;
3449 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3450 RIt != ReuseShuffleIndices.end()) {
3451 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3452 break;
3453 }
3454 }
3455 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3456 return FoundLane;
3457 }
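// Editorial illustration (not part of the original source): a worked example
// of the lane lookup above with hypothetical scalars. With
// Scalars = {%a, %b, %c, %d}, ReorderIndices = {2, 0, 3, 1} and empty
// ReuseShuffleIndices, findLaneForValue(%a) finds %a at position 0 in Scalars
// and remaps it through ReorderIndices, returning lane 2.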
3458
3459 /// Build a shuffle mask for graph entry which represents a merge of main
3460 /// and alternate operations.
3461 void
3462 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3463 SmallVectorImpl<int> &Mask,
3464 SmallVectorImpl<Value *> *OpScalars = nullptr,
3465 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3466
3467 /// Return true if this is a non-power-of-2 node.
3468 bool isNonPowOf2Vec() const {
3469 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3470 return IsNonPowerOf2;
3471 }
3472
3473 /// Return true if the number of elements in this node is not a power of 2 and
3474 /// does not form whole vector registers.
3475 bool
3476 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3477 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3478 TTI, getValueType(Scalars.front()), Scalars.size());
3479 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3480 "Reshuffling not supported with non-power-of-2 vectors yet.");
3481 return IsNonPowerOf2;
3482 }
3483
3484 Value *getOrdered(unsigned Idx) const {
3485 assert(isGather() && "Must be used only for buildvectors/gathers.");
3486 if (ReorderIndices.empty())
3487 return Scalars[Idx];
3488 SmallVector<int> Mask;
3489 inversePermutation(ReorderIndices, Mask);
3490 return Scalars[Mask[Idx]];
3491 }
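// Editorial illustration (not part of the original source), assuming that
// inversePermutation() builds Mask such that Mask[ReorderIndices[I]] == I, as
// it is used throughout this file: with Scalars = {%a, %b, %c, %d} and
// ReorderIndices = {2, 0, 3, 1}, Mask becomes {1, 3, 0, 2}, so getOrdered(0)
// returns Scalars[1], i.e. %b.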
3492
3493#ifndef NDEBUG
3494 /// Debug printer.
3495 LLVM_DUMP_METHOD void dump() const {
3496 dbgs() << Idx << ".\n";
3497 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3498 dbgs() << "Operand " << OpI << ":\n";
3499 for (const Value *V : Operands[OpI])
3500 dbgs().indent(2) << *V << "\n";
3501 }
3502 dbgs() << "Scalars: \n";
3503 for (Value *V : Scalars)
3504 dbgs().indent(2) << *V << "\n";
3505 dbgs() << "State: ";
3506 switch (State) {
3507 case Vectorize:
3508 if (InterleaveFactor > 0) {
3509 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3510 << "\n";
3511 } else {
3512 dbgs() << "Vectorize\n";
3513 }
3514 break;
3515 case ScatterVectorize:
3516 dbgs() << "ScatterVectorize\n";
3517 break;
3518 case StridedVectorize:
3519 dbgs() << "StridedVectorize\n";
3520 break;
3521 case NeedToGather:
3522 dbgs() << "NeedToGather\n";
3523 break;
3524 case CombinedVectorize:
3525 dbgs() << "CombinedVectorize\n";
3526 break;
3527 }
3528 if (S) {
3529 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
3530 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
3531 } else {
3532 dbgs() << "MainOp: NULL\n";
3533 dbgs() << "AltOp: NULL\n";
3534 }
3535 dbgs() << "VectorizedValue: ";
3536 if (VectorizedValue)
3537 dbgs() << *VectorizedValue << "\n";
3538 else
3539 dbgs() << "NULL\n";
3540 dbgs() << "ReuseShuffleIndices: ";
3541 if (ReuseShuffleIndices.empty())
3542 dbgs() << "Empty";
3543 else
3544 for (int ReuseIdx : ReuseShuffleIndices)
3545 dbgs() << ReuseIdx << ", ";
3546 dbgs() << "\n";
3547 dbgs() << "ReorderIndices: ";
3548 for (unsigned ReorderIdx : ReorderIndices)
3549 dbgs() << ReorderIdx << ", ";
3550 dbgs() << "\n";
3551 dbgs() << "UserTreeIndices: ";
3552 for (const auto &EInfo : UserTreeIndices)
3553 dbgs() << EInfo << ", ";
3554 dbgs() << "\n";
3555 if (!CombinedEntriesWithIndices.empty()) {
3556 dbgs() << "Combined entries: ";
3557 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3558 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3559 });
3560 dbgs() << "\n";
3561 }
3562 }
3563#endif
3564 };
3565
3566#ifndef NDEBUG
3567 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3568 InstructionCost VecCost, InstructionCost ScalarCost,
3569 StringRef Banner) const {
3570 dbgs() << "SLP: " << Banner << ":\n";
3571 E->dump();
3572 dbgs() << "SLP: Costs:\n";
3573 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3574 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3575 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3576 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3577 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3578 }
3579#endif
3580
3581 /// Create a new VectorizableTree entry.
3582 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3583 std::optional<ScheduleData *> Bundle,
3584 const InstructionsState &S,
3585 const EdgeInfo &UserTreeIdx,
3586 ArrayRef<int> ReuseShuffleIndices = {},
3587 ArrayRef<unsigned> ReorderIndices = {},
3588 unsigned InterleaveFactor = 0) {
3589 TreeEntry::EntryState EntryState =
3590 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3591 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3592 ReuseShuffleIndices, ReorderIndices);
3593 if (E && InterleaveFactor > 0)
3594 E->setInterleave(InterleaveFactor);
3595 return E;
3596 }
3597
3598 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3599 TreeEntry::EntryState EntryState,
3600 std::optional<ScheduleData *> Bundle,
3601 const InstructionsState &S,
3602 const EdgeInfo &UserTreeIdx,
3603 ArrayRef<int> ReuseShuffleIndices = {},
3604 ArrayRef<unsigned> ReorderIndices = {}) {
3605 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3606 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3607 "Need to vectorize gather entry?");
3608 // Gathered loads still gathered? Do not create entry, use the original one.
3609 if (GatheredLoadsEntriesFirst.has_value() &&
3610 EntryState == TreeEntry::NeedToGather && S &&
3611 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3612 !UserTreeIdx.UserTE)
3613 return nullptr;
3614 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3615 TreeEntry *Last = VectorizableTree.back().get();
3616 Last->Idx = VectorizableTree.size() - 1;
3617 Last->State = EntryState;
3618 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3619 // for non-power-of-two vectors.
3620 assert(
3621 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3622 ReuseShuffleIndices.empty()) &&
3623 "Reshuffling scalars not yet supported for nodes with padding");
3624 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3625 ReuseShuffleIndices.end());
3626 if (ReorderIndices.empty()) {
3627 Last->Scalars.assign(VL.begin(), VL.end());
3628 if (S)
3629 Last->setOperations(S);
3630 } else {
3631 // Reorder scalars and build final mask.
3632 Last->Scalars.assign(VL.size(), nullptr);
3633 transform(ReorderIndices, Last->Scalars.begin(),
3634 [VL](unsigned Idx) -> Value * {
3635 if (Idx >= VL.size())
3636 return UndefValue::get(VL.front()->getType());
3637 return VL[Idx];
3638 });
3639 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3640 if (S)
3641 Last->setOperations(S);
3642 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3643 }
3644 if (!Last->isGather()) {
3645 for (Value *V : VL) {
3646 const TreeEntry *TE = getTreeEntry(V);
3647 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3648 "Scalar already in tree!");
3649 if (TE) {
3650 if (TE != Last)
3651 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3652 continue;
3653 }
3654 ScalarToTreeEntry[V] = Last;
3655 }
3656 // Update the scheduler bundle to point to this TreeEntry.
3657 ScheduleData *BundleMember = *Bundle;
3658 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3659 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3660 doesNotNeedToSchedule(VL)) &&
3661 "Bundle and VL out of sync");
3662 if (BundleMember) {
3663 for (Value *V : VL) {
3664 if (doesNotNeedToBeScheduled(V))
3665 continue;
3666 if (!BundleMember)
3667 continue;
3668 BundleMember->TE = Last;
3669 BundleMember = BundleMember->NextInBundle;
3670 }
3671 }
3672 assert(!BundleMember && "Bundle and VL out of sync");
3673 } else {
3674 // Build a map for gathered scalars to the nodes where they are used.
3675 bool AllConstsOrCasts = true;
3676 for (Value *V : VL)
3677 if (!isConstant(V)) {
3678 auto *I = dyn_cast<CastInst>(V);
3679 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3680 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3681 !UserTreeIdx.UserTE->isGather())
3682 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3683 }
3684 if (AllConstsOrCasts)
3685 CastMaxMinBWSizes =
3686 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3687 MustGather.insert(VL.begin(), VL.end());
3688 }
3689
3690 if (UserTreeIdx.UserTE)
3691 Last->UserTreeIndices.push_back(UserTreeIdx);
3692 return Last;
3693 }
3694
3695 /// -- Vectorization State --
3696 /// Holds all of the tree entries.
3697 TreeEntry::VecTreeTy VectorizableTree;
3698
3699#ifndef NDEBUG
3700 /// Debug printer.
3701 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3702 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3703 VectorizableTree[Id]->dump();
3704 dbgs() << "\n";
3705 }
3706 }
3707#endif
3708
3709 TreeEntry *getTreeEntry(Value *V) {
3710 assert(V && "V cannot be nullptr.");
3711 return ScalarToTreeEntry.lookup(V);
3712 }
3713
3714 const TreeEntry *getTreeEntry(Value *V) const {
3715 assert(V && "V cannot be nullptr.");
3716 return ScalarToTreeEntry.lookup(V);
3717 }
3718
3719 /// Check that the operand nodes of an alternate node do not generate
3720 /// buildvector sequences. If they do, it is probably not worth building the
3721 /// alternate shuffle, i.e. when the number of buildvector operands plus the
3722 /// alternate instruction exceeds the number of buildvector instructions.
3723 /// \param S the instructions state of the analyzed values.
3724 /// \param VL list of the instructions with alternate opcodes.
3725 bool areAltOperandsProfitable(const InstructionsState &S,
3726 ArrayRef<Value *> VL) const;
3727
3728 /// Checks if the specified list of the instructions/values can be vectorized
3729 /// and fills required data before actual scheduling of the instructions.
3730 TreeEntry::EntryState
3731 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3732 bool IsScatterVectorizeUserTE,
3733 OrdersType &CurrentOrder,
3734 SmallVectorImpl<Value *> &PointerOps);
3735
3736 /// Maps a specific scalar to its tree entry.
3737 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3738
3739 /// List of scalars, used in several vectorize nodes, and the list of the
3740 /// nodes.
3741 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3742
3743 /// Maps a value to the proposed vectorizable size.
3744 SmallDenseMap<Value *, unsigned> InstrElementSize;
3745
3746 /// A list of scalars that we found that we need to keep as scalars.
3747 ValueSet MustGather;
3748
3749 /// A set of first non-schedulable values.
3750 ValueSet NonScheduledFirst;
3751
3752 /// A map between the vectorized entries and the last instructions in the
3753 /// bundles. The bundles are built in use order, not in the def order of the
3754 /// instructions, so we cannot rely directly on the last instruction in the
3755 /// bundle also being the last instruction in program order during the
3756 /// vectorization process, since the basic blocks are modified; these
3757 /// instructions need to be pre-gathered beforehand.
3758 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3759
3760 /// List of gather nodes, depending on other gather/vector nodes, which should
3761 /// be emitted after the vector instruction emission process to correctly
3762 /// handle order of the vector instructions and shuffles.
3763 SetVector<const TreeEntry *> PostponedGathers;
3764
3765 using ValueToGatherNodesMap =
3766 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3767 ValueToGatherNodesMap ValueToGatherNodes;
3768
3769 /// A list of the load entries (node indices) that can be vectorized using a
3770 /// strided or masked-gather approach, but which we first attempt to represent
3771 /// as contiguous loads.
3772 SetVector<unsigned> LoadEntriesToVectorize;
3773
3774 /// true if graph nodes transforming mode is on.
3775 bool IsGraphTransformMode = false;
3776
3777 /// The index of the first gathered load entry in the VectorizableTree.
3778 std::optional<unsigned> GatheredLoadsEntriesFirst;
3779
3780 /// This POD struct describes one external user in the vectorized tree.
3781 struct ExternalUser {
3782 ExternalUser(Value *S, llvm::User *U, int L)
3783 : Scalar(S), User(U), Lane(L) {}
3784
3785 // Which scalar in our function.
3786 Value *Scalar;
3787
3788 // Which user that uses the scalar.
3789 llvm::User *User;
3790
3791 // Which lane does the scalar belong to.
3792 int Lane;
3793 };
3794 using UserList = SmallVector<ExternalUser, 16>;
3795
3796 /// Checks if two instructions may access the same memory.
3797 ///
3798 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3799 /// is invariant in the calling loop.
3800 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3801 Instruction *Inst2) {
3802 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3803 return true;
3804 // First check if the result is already in the cache.
3805 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3806 auto It = AliasCache.find(Key);
3807 if (It != AliasCache.end())
3808 return It->second;
3809 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3810 // Store the result in the cache.
3811 AliasCache.try_emplace(Key, Aliased);
3812 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3813 return Aliased;
3814 }
3815
3816 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3817
3818 /// Cache for alias results.
3819 /// TODO: consider moving this to the AliasAnalysis itself.
3820 DenseMap<AliasCacheKey, bool> AliasCache;
3821
3822 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3823 // globally through SLP because we don't perform any action which
3824 // invalidates capture results.
3825 BatchAAResults BatchAA;
3826
3827 /// Temporary store for deleted instructions. Instructions will be deleted
3828 /// eventually when the BoUpSLP is destructed. The deferral is required to
3829 /// ensure that there are no incorrect collisions in the AliasCache, which
3830 /// can happen if a new instruction is allocated at the same address as a
3831 /// previously deleted instruction.
3832 DenseSet<Instruction *> DeletedInstructions;
3833
3834 /// Set of the instructions already being analyzed for reductions.
3835 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3836
3837 /// Set of hashes for the list of reduction values already being analyzed.
3838 DenseSet<size_t> AnalyzedReductionVals;
3839
3840 /// Values that have already been analyzed for minimal bitwidth and found to
3841 /// be non-profitable.
3842 DenseSet<Value *> AnalyzedMinBWVals;
3843
3844 /// A list of values that need to extracted out of the tree.
3845 /// This list holds pairs of (Internal Scalar : External User). External User
3846 /// can be nullptr, it means that this Internal Scalar will be used later,
3847 /// after vectorization.
3848 UserList ExternalUses;
3849
3850 /// A list of GEPs which can be replaced by scalar GEPs instead of
3851 /// extractelement instructions.
3852 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3853
3854 /// Values used only by @llvm.assume calls.
3856
3857 /// Holds all of the instructions that we gathered, shuffle instructions and
3858 /// extractelements.
3859 SetVector<Instruction *> GatherShuffleExtractSeq;
3860
3861 /// A list of blocks that we are going to CSE.
3862 DenseSet<BasicBlock *> CSEBlocks;
3863
3864 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3865 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3866
3867 /// Contains all scheduling relevant data for an instruction.
3868 /// A ScheduleData either represents a single instruction or a member of an
3869 /// instruction bundle (= a group of instructions which is combined into a
3870 /// vector instruction).
3871 struct ScheduleData {
3872 // The initial value for the dependency counters. It means that the
3873 // dependencies are not calculated yet.
3874 enum { InvalidDeps = -1 };
3875
3876 ScheduleData() = default;
3877
3878 void init(int BlockSchedulingRegionID, Instruction *I) {
3879 FirstInBundle = this;
3880 NextInBundle = nullptr;
3881 NextLoadStore = nullptr;
3882 IsScheduled = false;
3883 SchedulingRegionID = BlockSchedulingRegionID;
3884 clearDependencies();
3885 Inst = I;
3886 TE = nullptr;
3887 }
3888
3889 /// Verify basic self consistency properties
3890 void verify() {
3891 if (hasValidDependencies()) {
3892 assert(UnscheduledDeps <= Dependencies && "invariant");
3893 } else {
3894 assert(UnscheduledDeps == Dependencies && "invariant");
3895 }
3896
3897 if (IsScheduled) {
3898 assert(isSchedulingEntity() &&
3899 "unexpected scheduled state");
3900 for (const ScheduleData *BundleMember = this; BundleMember;
3901 BundleMember = BundleMember->NextInBundle) {
3902 assert(BundleMember->hasValidDependencies() &&
3903 BundleMember->UnscheduledDeps == 0 &&
3904 "unexpected scheduled state");
3905 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3906 "only bundle is marked scheduled");
3907 }
3908 }
3909
3910 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3911 "all bundle members must be in same basic block");
3912 }
3913
3914 /// Returns true if the dependency information has been calculated.
3915 /// Note that dependency validity can vary between instructions within
3916 /// a single bundle.
3917 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3918
3919 /// Returns true for single instructions and for bundle representatives
3920 /// (= the head of a bundle).
3921 bool isSchedulingEntity() const { return FirstInBundle == this; }
3922
3923 /// Returns true if it represents an instruction bundle and not only a
3924 /// single instruction.
3925 bool isPartOfBundle() const {
3926 return NextInBundle != nullptr || FirstInBundle != this || TE;
3927 }
3928
3929 /// Returns true if it is ready for scheduling, i.e. it has no more
3930 /// unscheduled depending instructions/bundles.
3931 bool isReady() const {
3932 assert(isSchedulingEntity() &&
3933 "can't consider non-scheduling entity for ready list");
3934 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3935 }
3936
3937 /// Modifies the number of unscheduled dependencies for this instruction,
3938 /// and returns the number of remaining dependencies for the containing
3939 /// bundle.
3940 int incrementUnscheduledDeps(int Incr) {
3941 assert(hasValidDependencies() &&
3942 "increment of unscheduled deps would be meaningless");
3943 UnscheduledDeps += Incr;
3944 return FirstInBundle->unscheduledDepsInBundle();
3945 }
3946
3947 /// Sets the number of unscheduled dependencies to the number of
3948 /// dependencies.
3949 void resetUnscheduledDeps() {
3950 UnscheduledDeps = Dependencies;
3951 }
3952
3953 /// Clears all dependency information.
3954 void clearDependencies() {
3955 Dependencies = InvalidDeps;
3956 resetUnscheduledDeps();
3957 MemoryDependencies.clear();
3958 ControlDependencies.clear();
3959 }
3960
3961 int unscheduledDepsInBundle() const {
3962 assert(isSchedulingEntity() && "only meaningful on the bundle");
3963 int Sum = 0;
3964 for (const ScheduleData *BundleMember = this; BundleMember;
3965 BundleMember = BundleMember->NextInBundle) {
3966 if (BundleMember->UnscheduledDeps == InvalidDeps)
3967 return InvalidDeps;
3968 Sum += BundleMember->UnscheduledDeps;
3969 }
3970 return Sum;
3971 }
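// Editorial illustration (not part of the original source): a small worked
// example of the dependency counters. For a two-instruction bundle whose
// members have Dependencies = 2 and 1 and nothing scheduled yet,
// unscheduledDepsInBundle() on the bundle head returns 3 and isReady() is
// false. Whenever one of the dependencies is scheduled, schedule() calls
// incrementUnscheduledDeps(-1) on the dependent member; once the bundle's sum
// reaches 0, the bundle becomes ready and is put on the ready list.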
3972
3973 void dump(raw_ostream &os) const {
3974 if (!isSchedulingEntity()) {
3975 os << "/ " << *Inst;
3976 } else if (NextInBundle) {
3977 os << '[' << *Inst;
3978 ScheduleData *SD = NextInBundle;
3979 while (SD) {
3980 os << ';' << *SD->Inst;
3981 SD = SD->NextInBundle;
3982 }
3983 os << ']';
3984 } else {
3985 os << *Inst;
3986 }
3987 }
3988
3989 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
3990
3991 Instruction *Inst = nullptr;
3992
3993 /// The TreeEntry that this instruction corresponds to.
3994 TreeEntry *TE = nullptr;
3995
3996 /// Points to the head in an instruction bundle (and always to this for
3997 /// single instructions).
3998 ScheduleData *FirstInBundle = nullptr;
3999
4000 /// Singly linked list of all instructions in a bundle. Null if it is a
4001 /// single instruction.
4002 ScheduleData *NextInBundle = nullptr;
4003
4004 /// Singly linked list of all memory instructions (e.g. load, store, call)
4005 /// in the block - until the end of the scheduling region.
4006 ScheduleData *NextLoadStore = nullptr;
4007
4008 /// The dependent memory instructions.
4009 /// This list is derived on demand in calculateDependencies().
4010 SmallVector<ScheduleData *, 4> MemoryDependencies;
4011
4012 /// List of instructions which this instruction could be control dependent
4013 /// on. Allowing such nodes to be scheduled below this one could introduce
4014 /// a runtime fault which didn't exist in the original program.
4015 /// e.g. this is a load or udiv following a readonly call which loops infinitely.
4016 SmallVector<ScheduleData *, 4> ControlDependencies;
4017
4018 /// This ScheduleData is in the current scheduling region if this matches
4019 /// the current SchedulingRegionID of BlockScheduling.
4020 int SchedulingRegionID = 0;
4021
4022 /// Used for getting a "good" final ordering of instructions.
4023 int SchedulingPriority = 0;
4024
4025 /// The number of dependencies. Consists of the number of users of the
4026 /// instruction plus the number of dependent memory instructions (if any).
4027 /// This value is calculated on demand.
4028 /// If InvalidDeps, the number of dependencies is not calculated yet.
4029 int Dependencies = InvalidDeps;
4030
4031 /// The number of dependencies minus the number of dependencies of scheduled
4032 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4033 /// for scheduling.
4034 /// Note that this is negative as long as Dependencies is not calculated.
4035 int UnscheduledDeps = InvalidDeps;
4036
4037 /// True if this instruction is scheduled (or considered as scheduled in the
4038 /// dry-run).
4039 bool IsScheduled = false;
4040 };
4041
4042#ifndef NDEBUG
4043 friend inline raw_ostream &operator<<(raw_ostream &os,
4044 const BoUpSLP::ScheduleData &SD) {
4045 SD.dump(os);
4046 return os;
4047 }
4048#endif
4049
4050 friend struct GraphTraits<BoUpSLP *>;
4051 friend struct DOTGraphTraits<BoUpSLP *>;
4052
4053 /// Contains all scheduling data for a basic block.
4054 /// It does not schedule instructions which are not memory read/write
4055 /// instructions and whose operands are either constants, or arguments, or
4056 /// phis, or instructions from other blocks, or whose users are phis or from
4057 /// other blocks. The resulting vector instructions can be placed at the
4058 /// beginning of the basic block without scheduling (if the operands do not
4059 /// need to be scheduled) or at the end of the block (if the users are outside
4060 /// of the block). This saves some compile time and memory used by the
4061 /// compiler.
4062 /// ScheduleData is assigned for each instruction in between the boundaries of
4063 /// the tree entry, even for those which are not part of the graph. It is
4064 /// required to correctly follow the dependencies between the instructions and
4065 /// to schedule them correctly. ScheduleData is not allocated for instructions
4066 /// which do not require scheduling, like phis, nodes with only
4067 /// extractelements/insertelements, or nodes whose instructions have
4068 /// uses/operands outside of the block.
4069 struct BlockScheduling {
4070 BlockScheduling(BasicBlock *BB)
4071 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4072
4073 void clear() {
4074 ReadyInsts.clear();
4075 ScheduleStart = nullptr;
4076 ScheduleEnd = nullptr;
4077 FirstLoadStoreInRegion = nullptr;
4078 LastLoadStoreInRegion = nullptr;
4079 RegionHasStackSave = false;
4080
4081 // Reduce the maximum schedule region size by the size of the
4082 // previous scheduling run.
4083 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4084 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4085 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4086 ScheduleRegionSize = 0;
4087
4088 // Make a new scheduling region, i.e. all existing ScheduleData is not
4089 // in the new region yet.
4090 ++SchedulingRegionID;
4091 }
4092
4093 ScheduleData *getScheduleData(Instruction *I) {
4094 if (BB != I->getParent())
4095 // Avoid lookup if can't possibly be in map.
4096 return nullptr;
4097 ScheduleData *SD = ScheduleDataMap.lookup(I);
4098 if (SD && isInSchedulingRegion(SD))
4099 return SD;
4100 return nullptr;
4101 }
4102
4103 ScheduleData *getScheduleData(Value *V) {
4104 if (auto *I = dyn_cast<Instruction>(V))
4105 return getScheduleData(I);
4106 return nullptr;
4107 }
4108
4109 bool isInSchedulingRegion(ScheduleData *SD) const {
4110 return SD->SchedulingRegionID == SchedulingRegionID;
4111 }
4112
4113 /// Marks an instruction as scheduled and puts all dependent ready
4114 /// instructions into the ready-list.
4115 template <typename ReadyListType>
4116 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4117 SD->IsScheduled = true;
4118 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4119
4120 for (ScheduleData *BundleMember = SD; BundleMember;
4121 BundleMember = BundleMember->NextInBundle) {
4122
4123 // Handle the def-use chain dependencies.
4124
4125 // Decrement the unscheduled counter and insert to ready list if ready.
4126 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4127 ScheduleData *OpDef = getScheduleData(I);
4128 if (OpDef && OpDef->hasValidDependencies() &&
4129 OpDef->incrementUnscheduledDeps(-1) == 0) {
4130 // There are no more unscheduled dependencies after
4131 // decrementing, so we can put the dependent instruction
4132 // into the ready list.
4133 ScheduleData *DepBundle = OpDef->FirstInBundle;
4134 assert(!DepBundle->IsScheduled &&
4135 "already scheduled bundle gets ready");
4136 ReadyList.insert(DepBundle);
4137 LLVM_DEBUG(dbgs()
4138 << "SLP: gets ready (def): " << *DepBundle << "\n");
4139 }
4140 };
4141
4142 // If BundleMember is a vector bundle, its operands may have been
4143 // reordered during buildTree(). We therefore need to get its operands
4144 // through the TreeEntry.
4145 if (TreeEntry *TE = BundleMember->TE) {
4146 // Need to search for the lane since the tree entry can be reordered.
4147 auto *In = BundleMember->Inst;
4148 int Lane = std::distance(TE->Scalars.begin(),
4149 find(TE->Scalars, In));
4150 assert(Lane >= 0 && "Lane not set");
4151
4152 // Since vectorization tree is being built recursively this assertion
4153 // ensures that the tree entry has all operands set before reaching
4154 // this code. Couple of exceptions known at the moment are extracts
4155 // where their second (immediate) operand is not added. Since
4156 // immediates do not affect scheduler behavior this is considered
4157 // okay.
4158 assert(
4159 In &&
4160 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4161 In->getNumOperands() == TE->getNumOperands()) &&
4162 "Missed TreeEntry operands?");
4163
4164 for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
4165 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4166 DecrUnsched(I);
4167 } else {
4168 // If BundleMember is a stand-alone instruction, no operand reordering
4169 // has taken place, so we directly access its operands.
4170 for (Use &U : BundleMember->Inst->operands())
4171 if (auto *I = dyn_cast<Instruction>(U.get()))
4172 DecrUnsched(I);
4173 }
4174 // Handle the memory dependencies.
4175 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4176 if (MemoryDepSD->hasValidDependencies() &&
4177 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4178 // There are no more unscheduled dependencies after decrementing,
4179 // so we can put the dependent instruction into the ready list.
4180 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4181 assert(!DepBundle->IsScheduled &&
4182 "already scheduled bundle gets ready");
4183 ReadyList.insert(DepBundle);
4184 LLVM_DEBUG(dbgs()
4185 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4186 }
4187 }
4188 // Handle the control dependencies.
4189 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4190 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4191 // There are no more unscheduled dependencies after decrementing,
4192 // so we can put the dependent instruction into the ready list.
4193 ScheduleData *DepBundle = DepSD->FirstInBundle;
4194 assert(!DepBundle->IsScheduled &&
4195 "already scheduled bundle gets ready");
4196 ReadyList.insert(DepBundle);
4197 LLVM_DEBUG(dbgs()
4198 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4199 }
4200 }
4201 }
4202 }
4203
4204 /// Verify basic self consistency properties of the data structure.
4205 void verify() {
4206 if (!ScheduleStart)
4207 return;
4208
4209 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4210 ScheduleStart->comesBefore(ScheduleEnd) &&
4211 "Not a valid scheduling region?");
4212
4213 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4214 auto *SD = getScheduleData(I);
4215 if (!SD)
4216 continue;
4217 assert(isInSchedulingRegion(SD) &&
4218 "primary schedule data not in window?");
4219 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4220 "entire bundle in window!");
4221 SD->verify();
4222 }
4223
4224 for (auto *SD : ReadyInsts) {
4225 assert(SD->isSchedulingEntity() && SD->isReady() &&
4226 "item in ready list not ready?");
4227 (void)SD;
4228 }
4229 }
4230
4231 /// Put all instructions into the ReadyList which are ready for scheduling.
4232 template <typename ReadyListType>
4233 void initialFillReadyList(ReadyListType &ReadyList) {
4234 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4235 ScheduleData *SD = getScheduleData(I);
4236 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4237 SD->isReady()) {
4238 ReadyList.insert(SD);
4239 LLVM_DEBUG(dbgs()
4240 << "SLP: initially in ready list: " << *SD << "\n");
4241 }
4242 }
4243 }
4244
4245 /// Build a bundle from the ScheduleData nodes corresponding to the
4246 /// scalar instruction for each lane.
4247 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4248
4249 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4250 /// cyclic dependencies. This is only a dry-run, no instructions are
4251 /// actually moved at this stage.
4252 /// \returns the scheduling bundle. The returned Optional value is not
4253 /// std::nullopt if \p VL is allowed to be scheduled.
4254 std::optional<ScheduleData *>
4255 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4256 const InstructionsState &S);
4257
4258 /// Un-bundles a group of instructions.
4259 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4260
4261 /// Allocates schedule data chunk.
4262 ScheduleData *allocateScheduleDataChunks();
4263
4264 /// Extends the scheduling region so that V is inside the region.
4265 /// \returns true if the region size is within the limit.
4266 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4267
4268 /// Initialize the ScheduleData structures for new instructions in the
4269 /// scheduling region.
4270 void initScheduleData(Instruction *FromI, Instruction *ToI,
4271 ScheduleData *PrevLoadStore,
4272 ScheduleData *NextLoadStore);
4273
4274 /// Updates the dependency information of a bundle and of all instructions/
4275 /// bundles which depend on the original bundle.
4276 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4277 BoUpSLP *SLP);
4278
4279 /// Sets all instructions in the scheduling region to un-scheduled.
4280 void resetSchedule();
4281
4282 BasicBlock *BB;
4283
4284 /// Simple memory allocation for ScheduleData.
4285 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4286
4287 /// The size of a ScheduleData array in ScheduleDataChunks.
4288 int ChunkSize;
4289
4290 /// The allocator position in the current chunk, which is the last entry
4291 /// of ScheduleDataChunks.
4292 int ChunkPos;
4293
4294 /// Attaches ScheduleData to Instruction.
4295 /// Note that the mapping survives during all vectorization iterations, i.e.
4296 /// ScheduleData structures are recycled.
4297 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4298
4299 /// The ready-list for scheduling (only used for the dry-run).
4300 SetVector<ScheduleData *> ReadyInsts;
4301
4302 /// The first instruction of the scheduling region.
4303 Instruction *ScheduleStart = nullptr;
4304
4305 /// The first instruction _after_ the scheduling region.
4306 Instruction *ScheduleEnd = nullptr;
4307
4308 /// The first memory accessing instruction in the scheduling region
4309 /// (can be null).
4310 ScheduleData *FirstLoadStoreInRegion = nullptr;
4311
4312 /// The last memory accessing instruction in the scheduling region
4313 /// (can be null).
4314 ScheduleData *LastLoadStoreInRegion = nullptr;
4315
4316 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4317 /// region? Used to optimize the dependence calculation for the
4318 /// common case where there isn't.
4319 bool RegionHasStackSave = false;
4320
4321 /// The current size of the scheduling region.
4322 int ScheduleRegionSize = 0;
4323
4324 /// The maximum size allowed for the scheduling region.
4325 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4326
4327 /// The ID of the scheduling region. For a new vectorization iteration this
4328 /// is incremented which "removes" all ScheduleData from the region.
4329 /// Make sure that the initial SchedulingRegionID is greater than the
4330 /// initial SchedulingRegionID in ScheduleData (which is 0).
4331 int SchedulingRegionID = 1;
4332 };
4333
4334 /// Attaches the BlockScheduling structures to basic blocks.
4335 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4336
4337 /// Performs the "real" scheduling. Done before vectorization is actually
4338 /// performed in a basic block.
4339 void scheduleBlock(BlockScheduling *BS);
4340
4341 /// List of users to ignore during scheduling and that don't need extracting.
4342 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4343
4344 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4345 /// sorted SmallVectors of unsigned.
4346 struct OrdersTypeDenseMapInfo {
4347 static OrdersType getEmptyKey() {
4348 OrdersType V;
4349 V.push_back(~1U);
4350 return V;
4351 }
4352
4353 static OrdersType getTombstoneKey() {
4354 OrdersType V;
4355 V.push_back(~2U);
4356 return V;
4357 }
4358
4359 static unsigned getHashValue(const OrdersType &V) {
4360 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4361 }
4362
4363 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4364 return LHS == RHS;
4365 }
4366 };
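// Editorial illustration (not part of the original source): a sketch of how a
// custom DenseMapInfo such as the one above is consumed; the map name is
// hypothetical and only shows the template-parameter position.
//   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
//   ++OrdersUses.try_emplace(SomeOrder, 0).first->second;
// The empty/tombstone keys (~1U and ~2U) are single-element vectors that
// cannot collide with real orders, which only contain small lane indices.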
4367
4368 // Analysis and block reference.
4369 Function *F;
4370 ScalarEvolution *SE;
4371 TargetTransformInfo *TTI;
4372 TargetLibraryInfo *TLI;
4373 LoopInfo *LI;
4374 DominatorTree *DT;
4375 AssumptionCache *AC;
4376 DemandedBits *DB;
4377 const DataLayout *DL;
4379
4380 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4381 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4382
4383 /// Instruction builder to construct the vectorized tree.
4385
4386 /// A map of scalar integer values to the smallest bit width with which they
4387 /// can legally be represented. The values map to (width, signed) pairs,
4388 /// where "width" indicates the minimum bit width and "signed" is True if the
4389 /// value must be signed-extended, rather than zero-extended, back to its
4390 /// original width.
4392
4393 /// Final size of the reduced vector, if the current graph represents the
4394 /// input for the reduction and it was possible to narrow the size of the
4395 /// reduction.
4396 unsigned ReductionBitWidth = 0;
4397
4398 /// Canonical graph size before the transformations.
4399 unsigned BaseGraphSize = 1;
4400
4401 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4402 /// type sizes, used in the tree.
4403 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4404
4405 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
4406 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4407 DenseSet<unsigned> ExtraBitWidthNodes;
4408};
4409
4410} // end namespace slpvectorizer
4411
4412template <> struct GraphTraits<BoUpSLP *> {
4413 using TreeEntry = BoUpSLP::TreeEntry;
4414
4415 /// NodeRef has to be a pointer per the GraphWriter.
4416 using NodeRef = TreeEntry *;
4417
4418 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4419
4420 /// Add the VectorizableTree to the index iterator to be able to return
4421 /// TreeEntry pointers.
4422 struct ChildIteratorType
4423 : public iterator_adaptor_base<
4424 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4425 ContainerTy &VectorizableTree;
4426
4427 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4428 ContainerTy &VT)
4429 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4430
4431 NodeRef operator*() { return I->UserTE; }
4432 };
4433
4434 static NodeRef getEntryNode(BoUpSLP &R) {
4435 return R.VectorizableTree[0].get();
4436 }
4437
4438 static ChildIteratorType child_begin(NodeRef N) {
4439 return {N->UserTreeIndices.begin(), N->Container};
4440 }
4441
4442 static ChildIteratorType child_end(NodeRef N) {
4443 return {N->UserTreeIndices.end(), N->Container};
4444 }
4445
4446 /// For the node iterator we just need to turn the TreeEntry iterator into a
4447 /// TreeEntry* iterator so that it dereferences to NodeRef.
4448 class nodes_iterator {
4449 using ItTy = ContainerTy::iterator;
4450 ItTy It;
4451
4452 public:
4453 nodes_iterator(const ItTy &It2) : It(It2) {}
4454 NodeRef operator*() { return It->get(); }
4455 nodes_iterator operator++() {
4456 ++It;
4457 return *this;
4458 }
4459 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4460 };
4461
4462 static nodes_iterator nodes_begin(BoUpSLP *R) {
4463 return nodes_iterator(R->VectorizableTree.begin());
4464 }
4465
4466 static nodes_iterator nodes_end(BoUpSLP *R) {
4467 return nodes_iterator(R->VectorizableTree.end());
4468 }
4469
4470 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4471};
4472
4473template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4474 using TreeEntry = BoUpSLP::TreeEntry;
4475
4476 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4477
4478 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4479 std::string Str;
4480 raw_string_ostream OS(Str);
4481 OS << Entry->Idx << ".\n";
4482 if (isSplat(Entry->Scalars))
4483 OS << "<splat> ";
4484 for (auto *V : Entry->Scalars) {
4485 OS << *V;
4486 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4487 return EU.Scalar == V;
4488 }))
4489 OS << " <extract>";
4490 OS << "\n";
4491 }
4492 return Str;
4493 }
4494
4495 static std::string getNodeAttributes(const TreeEntry *Entry,
4496 const BoUpSLP *) {
4497 if (Entry->isGather())
4498 return "color=red";
4499 if (Entry->State == TreeEntry::ScatterVectorize ||
4500 Entry->State == TreeEntry::StridedVectorize)
4501 return "color=blue";
4502 return "";
4503 }
4504};
4505
4506} // end namespace llvm
4507
4510 for (auto *I : DeletedInstructions) {
4511 if (!I->getParent()) {
4512 // Temporarily insert the instruction back so that it can be erased from its
4513 // parent and from memory later.
4514 if (isa<PHINode>(I))
4515 // Phi nodes must be the very first instructions in the block.
4516 I->insertBefore(F->getEntryBlock(),
4517 F->getEntryBlock().getFirstNonPHIIt());
4518 else
4519 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
4520 continue;
4521 }
4522 for (Use &U : I->operands()) {
4523 auto *Op = dyn_cast<Instruction>(U.get());
4524 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4526 DeadInsts.emplace_back(Op);
4527 }
4528 I->dropAllReferences();
4529 }
4530 for (auto *I : DeletedInstructions) {
4531 assert(I->use_empty() &&
4532 "trying to erase instruction with users.");
4533 I->eraseFromParent();
4534 }
4535
4536 // Cleanup any dead scalar code feeding the vectorized instructions
4538
4539#ifdef EXPENSIVE_CHECKS
4540 // If we could guarantee that this call is not extremely slow, we could
4541 // remove the ifdef limitation (see PR47712).
4542 assert(!verifyFunction(*F, &dbgs()));
4543#endif
4544}
4545
4546/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4547/// contains the original mask for the scalars reused in the node. The procedure
4548/// transforms this mask in accordance with the given \p Mask.
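/// For example, with \p Reuses == {3, 2, 1, 0} and \p Mask == {2, 3, 0, 1},
/// the resulting \p Reuses is {1, 0, 3, 2}.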
4550 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4551 "Expected non-empty mask.");
4552 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4553 Prev.swap(Reuses);
4554 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4555 if (Mask[I] != PoisonMaskElem)
4556 Reuses[Mask[I]] = Prev[I];
4557}
4558
4559/// Reorders the given \p Order according to the given \p Mask. \p Order is
4560/// the original order of the scalars. The procedure transforms the provided order
4561/// in accordance with the given \p Mask. If the resulting \p Order is just an
4562/// identity order, \p Order is cleared.
4564 bool BottomOrder = false) {
4565 assert(!Mask.empty() && "Expected non-empty mask.");
4566 unsigned Sz = Mask.size();
4567 if (BottomOrder) {
4568 SmallVector<unsigned> PrevOrder;
4569 if (Order.empty()) {
4570 PrevOrder.resize(Sz);
4571 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4572 } else {
4573 PrevOrder.swap(Order);
4574 }
4575 Order.assign(Sz, Sz);
4576 for (unsigned I = 0; I < Sz; ++I)
4577 if (Mask[I] != PoisonMaskElem)
4578 Order[I] = PrevOrder[Mask[I]];
4579 if (all_of(enumerate(Order), [&](const auto &Data) {
4580 return Data.value() == Sz || Data.index() == Data.value();
4581 })) {
4582 Order.clear();
4583 return;
4584 }
4585 fixupOrderingIndices(Order);
4586 return;
4587 }
4588 SmallVector<int> MaskOrder;
4589 if (Order.empty()) {
4590 MaskOrder.resize(Sz);
4591 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4592 } else {
4593 inversePermutation(Order, MaskOrder);
4594 }
4595 reorderReuses(MaskOrder, Mask);
4596 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4597 Order.clear();
4598 return;
4599 }
4600 Order.assign(Sz, Sz);
4601 for (unsigned I = 0; I < Sz; ++I)
4602 if (MaskOrder[I] != PoisonMaskElem)
4603 Order[MaskOrder[I]] = I;
4604 fixupOrderingIndices(Order);
4605}
4606
4607std::optional<BoUpSLP::OrdersType>
4608BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4609 assert(TE.isGather() && "Expected gather node only.");
4610 // Try to find subvector extract/insert patterns and reorder only such
4611 // patterns.
4612 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4613 Type *ScalarTy = GatheredScalars.front()->getType();
4614 int NumScalars = GatheredScalars.size();
4615 if (!isValidElementType(ScalarTy))
4616 return std::nullopt;
4617 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4618 int NumParts = TTI->getNumberOfParts(VecTy);
4619 if (NumParts == 0 || NumParts >= NumScalars ||
4620 VecTy->getNumElements() % NumParts != 0 ||
4621 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4622 VecTy->getNumElements() / NumParts))
4623 NumParts = 1;
4624 SmallVector<int> ExtractMask;
4625 SmallVector<int> Mask;
4628 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4630 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4631 /*ForOrder=*/true);
4632 // No shuffled operands - ignore.
4633 if (GatherShuffles.empty() && ExtractShuffles.empty())
4634 return std::nullopt;
4635 OrdersType CurrentOrder(NumScalars, NumScalars);
4636 if (GatherShuffles.size() == 1 &&
4637 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4638 Entries.front().front()->isSame(TE.Scalars)) {
4639 // Perfect match in the graph, will reuse the previously vectorized
4640 // node. Cost is 0.
4641 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4642 return CurrentOrder;
4643 }
4644 auto IsSplatMask = [](ArrayRef<int> Mask) {
4645 int SingleElt = PoisonMaskElem;
4646 return all_of(Mask, [&](int I) {
4647 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4648 SingleElt = I;
4649 return I == PoisonMaskElem || I == SingleElt;
4650 });
4651 };
4652 // Exclusive broadcast mask - ignore.
4653 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4654 (Entries.size() != 1 ||
4655 Entries.front().front()->ReorderIndices.empty())) ||
4656 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4657 return std::nullopt;
4658 SmallBitVector ShuffledSubMasks(NumParts);
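 // For each part of the final mask, convert the shuffle mask into an ordering
 // of the scalars; parts that would require shuffling two or more vectors are
 // filled with NumScalars and marked in ShuffledSubMasks.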
4659 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4660 ArrayRef<int> Mask, int PartSz, int NumParts,
4661 function_ref<unsigned(unsigned)> GetVF) {
4662 for (int I : seq<int>(0, NumParts)) {
4663 if (ShuffledSubMasks.test(I))
4664 continue;
4665 const int VF = GetVF(I);
4666 if (VF == 0)
4667 continue;
4668 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4669 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4670 // Shuffle of at least 2 vectors - ignore.
4671 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4672 std::fill(Slice.begin(), Slice.end(), NumScalars);
4673 ShuffledSubMasks.set(I);
4674 continue;
4675 }
4676 // Try to include as many elements from the mask as possible.
4677 int FirstMin = INT_MAX;
4678 bool SecondVecFound = false;
4679 for (int K : seq<int>(Limit)) {
4680 int Idx = Mask[I * PartSz + K];
4681 if (Idx == PoisonMaskElem) {
4682 Value *V = GatheredScalars[I * PartSz + K];
4683 if (isConstant(V) && !isa<PoisonValue>(V)) {
4684 SecondVecFound = true;
4685 break;
4686 }
4687 continue;
4688 }
4689 if (Idx < VF) {
4690 if (FirstMin > Idx)
4691 FirstMin = Idx;
4692 } else {
4693 SecondVecFound = true;
4694 break;
4695 }
4696 }
4697 FirstMin = (FirstMin / PartSz) * PartSz;
4698 // Shuffle of at least 2 vectors - ignore.
4699 if (SecondVecFound) {
4700 std::fill(Slice.begin(), Slice.end(), NumScalars);
4701 ShuffledSubMasks.set(I);
4702 continue;
4703 }
4704 for (int K : seq<int>(Limit)) {
4705 int Idx = Mask[I * PartSz + K];
4706 if (Idx == PoisonMaskElem)
4707 continue;
4708 Idx -= FirstMin;
4709 if (Idx >= PartSz) {
4710 SecondVecFound = true;
4711 break;
4712 }
4713 if (CurrentOrder[I * PartSz + Idx] >
4714 static_cast<unsigned>(I * PartSz + K) &&
4715 CurrentOrder[I * PartSz + Idx] !=
4716 static_cast<unsigned>(I * PartSz + Idx))
4717 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4718 }
4719 // Shuffle of at least 2 vectors - ignore.
4720 if (SecondVecFound) {
4721 std::fill(Slice.begin(), Slice.end(), NumScalars);
4722 ShuffledSubMasks.set(I);
4723 continue;
4724 }
4725 }
4726 };
4727 int PartSz = getPartNumElems(NumScalars, NumParts);
4728 if (!ExtractShuffles.empty())
4729 TransformMaskToOrder(
4730 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4731 if (!ExtractShuffles[I])
4732 return 0U;
4733 unsigned VF = 0;
4734 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4735 for (unsigned Idx : seq<unsigned>(Sz)) {
4736 int K = I * PartSz + Idx;
4737 if (ExtractMask[K] == PoisonMaskElem)
4738 continue;
4739 if (!TE.ReuseShuffleIndices.empty())
4740 K = TE.ReuseShuffleIndices[K];
4741 if (K == PoisonMaskElem)
4742 continue;
4743 if (!TE.ReorderIndices.empty())
4744 K = std::distance(TE.ReorderIndices.begin(),
4745 find(TE.ReorderIndices, K));
4746 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4747 if (!EI)
4748 continue;
4749 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4750 ->getElementCount()
4751 .getKnownMinValue());
4752 }
4753 return VF;
4754 });
4755 // Check special corner case - single shuffle of the same entry.
4756 if (GatherShuffles.size() == 1 && NumParts != 1) {
4757 if (ShuffledSubMasks.any())
4758 return std::nullopt;
4759 PartSz = NumScalars;
4760 NumParts = 1;
4761 }
4762 if (!Entries.empty())
4763 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4764 if (!GatherShuffles[I])
4765 return 0U;
4766 return std::max(Entries[I].front()->getVectorFactor(),
4767 Entries[I].back()->getVectorFactor());
4768 });
4769 int NumUndefs =
4770 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4771 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4772 return std::nullopt;
4773 return std::move(CurrentOrder);
4774}
4775
4776static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4777 const TargetLibraryInfo &TLI,
4778 bool CompareOpcodes = true) {
4781 return false;
4782 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4783 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4784 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4785 (!GEP2 || GEP2->getNumOperands() == 2) &&
4786 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4787 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4788 !CompareOpcodes ||
4789 (GEP1 && GEP2 &&
4790 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4791}
4792
4793/// Calculates minimal alignment as a common alignment.
4794template <typename T>
4796 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4797 for (Value *V : VL.drop_front())
4798 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4799 return CommonAlignment;
4800}
4801
4802/// Check if \p Order represents reverse order.
4804 assert(!Order.empty() &&
4805 "Order is empty. Please check it before using isReverseOrder.");
4806 unsigned Sz = Order.size();
4807 return all_of(enumerate(Order), [&](const auto &Pair) {
4808 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4809 });
4810}
4811
4812/// Checks if the provided list of pointers \p PointerOps represents strided
4813/// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4814/// Otherwise, if \p Inst is not specified, an engaged optional value is
4815/// returned just to show that the pointers represent strided pointers. If \p Inst
4816/// is specified, the runtime stride is materialized before the given \p Inst.
4817/// \returns std::nullopt if the pointers are not strided pointers with a runtime
4818/// stride; otherwise, nullptr or the actual stride value.
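/// For example, pointers {P, P + S, P + 2 * S, P + 3 * S}, where S is a
/// non-constant byte offset that is a multiple of the element size, are
/// recognized as strided; the returned stride is the stride in elements
/// (S divided by the element size).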
4819static std::optional<Value *>
4821 const DataLayout &DL, ScalarEvolution &SE,
4822 SmallVectorImpl<unsigned> &SortedIndices,
4823 Instruction *Inst = nullptr) {
4825 const SCEV *PtrSCEVLowest = nullptr;
4826 const SCEV *PtrSCEVHighest = nullptr;
4827 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4828 // addresses).
4829 for (Value *Ptr : PointerOps) {
4830 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4831 if (!PtrSCEV)
4832 return std::nullopt;
4833 SCEVs.push_back(PtrSCEV);
4834 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4835 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4836 continue;
4837 }
4838 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4839 if (isa<SCEVCouldNotCompute>(Diff))
4840 return std::nullopt;
4841 if (Diff->isNonConstantNegative()) {
4842 PtrSCEVLowest = PtrSCEV;
4843 continue;
4844 }
4845 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4846 if (isa<SCEVCouldNotCompute>(Diff1))
4847 return std::nullopt;
4848 if (Diff1->isNonConstantNegative()) {
4849 PtrSCEVHighest = PtrSCEV;
4850 continue;
4851 }
4852 }
4853 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4854 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4855 if (isa<SCEVCouldNotCompute>(Dist))
4856 return std::nullopt;
4857 int Size = DL.getTypeStoreSize(ElemTy);
4858 auto TryGetStride = [&](const SCEV *Dist,
4859 const SCEV *Multiplier) -> const SCEV * {
4860 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4861 if (M->getOperand(0) == Multiplier)
4862 return M->getOperand(1);
4863 if (M->getOperand(1) == Multiplier)
4864 return M->getOperand(0);
4865 return nullptr;
4866 }
4867 if (Multiplier == Dist)
4868 return SE.getConstant(Dist->getType(), 1);
4869 return SE.getUDivExactExpr(Dist, Multiplier);
4870 };
4871 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4872 const SCEV *Stride = nullptr;
4873 if (Size != 1 || SCEVs.size() > 2) {
4874 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4875 Stride = TryGetStride(Dist, Sz);
4876 if (!Stride)
4877 return std::nullopt;
4878 }
4879 if (!Stride || isa<SCEVConstant>(Stride))
4880 return std::nullopt;
4881 // Iterate through all pointers and check if all distances are
4882 // unique multiples of Stride.
4883 using DistOrdPair = std::pair<int64_t, int>;
4884 auto Compare = llvm::less_first();
4885 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4886 int Cnt = 0;
4887 bool IsConsecutive = true;
4888 for (const SCEV *PtrSCEV : SCEVs) {
4889 unsigned Dist = 0;
4890 if (PtrSCEV != PtrSCEVLowest) {
4891 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4892 const SCEV *Coeff = TryGetStride(Diff, Stride);
4893 if (!Coeff)
4894 return std::nullopt;
4895 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4896 if (!SC || isa<SCEVCouldNotCompute>(SC))
4897 return std::nullopt;
4898 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4899 SE.getMulExpr(Stride, SC)))
4900 ->isZero())
4901 return std::nullopt;
4902 Dist = SC->getAPInt().getZExtValue();
4903 }
4904 // If the strides are not the same or repeated, we can't vectorize.
4905 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4906 return std::nullopt;
4907 auto Res = Offsets.emplace(Dist, Cnt);
4908 if (!Res.second)
4909 return std::nullopt;
4910 // Consecutive order if the inserted element is the last one.
4911 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4912 ++Cnt;
4913 }
4914 if (Offsets.size() != SCEVs.size())
4915 return std::nullopt;
4916 SortedIndices.clear();
4917 if (!IsConsecutive) {
4918 // Fill SortedIndices array only if it is non-consecutive.
4919 SortedIndices.resize(PointerOps.size());
4920 Cnt = 0;
4921 for (const std::pair<int64_t, int> &Pair : Offsets) {
4922 SortedIndices[Cnt] = Pair.second;
4923 ++Cnt;
4924 }
4925 }
4926 if (!Inst)
4927 return nullptr;
4928 SCEVExpander Expander(SE, DL, "strided-load-vec");
4929 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4930}
4931
4932static std::pair<InstructionCost, InstructionCost>
4934 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4935 Type *ScalarTy, VectorType *VecTy);
4936
4937/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4938/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4939/// insert-subvector pattern.
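/// (A two-source permute whose mask is really an insert of a subvector past the
/// end of the first source is costed as an insert-subvector shuffle of the type
/// widened to the mask size.)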
4940static InstructionCost
4942 VectorType *Tp, ArrayRef<int> Mask = {},
4944 int Index = 0, VectorType *SubTp = nullptr,
4946 if (Kind != TTI::SK_PermuteTwoSrc)
4947 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4948 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4949 int NumSubElts;
4951 Mask, NumSrcElts, NumSubElts, Index)) {
4952 if (Index + NumSubElts > NumSrcElts &&
4953 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4954 return TTI.getShuffleCost(
4956 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4958 }
4959 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4960}
4961
4962/// Correctly creates insert_subvector, checking that the index is a multiple of
4963/// the subvector's length. Otherwise, generates a shuffle using \p Generator or
4964/// the default shuffle.
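/// E.g., inserting a <4 x i32> subvector at index 4 of a <8 x i32> vector maps
/// directly to llvm.vector.insert, while index 2 is lowered to shufflevectors
/// instead.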
4966 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
4967 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
4968 const unsigned SubVecVF = getNumElements(V->getType());
4969 if (Index % SubVecVF == 0) {
4970 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4971 Builder.getInt64(Index));
4972 } else {
4973 // Create a shuffle; insert_subvector requires that the index is a multiple
4974 // of the subvector length.
4975 const unsigned VecVF = getNumElements(Vec->getType());
4977 std::iota(Mask.begin(), Mask.end(), 0);
4978 for (unsigned I : seq<unsigned>(SubVecVF))
4979 Mask[I + Index] = I + VecVF;
4980 if (Generator) {
4981 Vec = Generator(Vec, V, Mask);
4982 } else {
4983 // 1. Resize V to the size of Vec.
4984 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
4985 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
4986 V = Builder.CreateShuffleVector(V, ResizeMask);
4987 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
4988 }
4989 }
4990 return Vec;
4991}
4992
4993/// Correctly creates extract_subvector, checking that the index is a multiple of
4994/// the subvector's length. Otherwise, generates a shufflevector that extracts the
4995/// requested elements.
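/// E.g., extracting a <2 x float> subvector at index 2 of a <8 x float> vector
/// maps to llvm.vector.extract, while index 3 uses a shufflevector with
/// mask <3, 4>.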
4997 unsigned SubVecVF, unsigned Index) {
4998 if (Index % SubVecVF == 0) {
4999 VectorType *SubVecTy =
5000 getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
5001 return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
5002 }
5003 // Create a shuffle; extract_subvector requires that the index is a multiple
5004 // of the subvector length.
5005 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
5006 std::iota(Mask.begin(), Mask.end(), Index);
5007 return Builder.CreateShuffleVector(Vec, Mask);
5008}
5009
5013 SmallVectorImpl<Value *> &PointerOps,
5014 unsigned *BestVF, bool TryRecursiveCheck) const {
5015 // Check that a vectorized load would load the same memory as a scalar
5016 // load. For example, we don't want to vectorize loads that are smaller
5017 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5018 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5019 // from such a struct, we read/write packed bits disagreeing with the
5020 // unvectorized version.
5021 if (BestVF)
5022 *BestVF = 0;
5024 return LoadsState::Gather;
5025 Type *ScalarTy = VL0->getType();
5026
5027 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
5028 return LoadsState::Gather;
5029
5030 // Make sure all loads in the bundle are simple - we can't vectorize
5031 // atomic or volatile loads.
5032 PointerOps.clear();
5033 const unsigned Sz = VL.size();
5034 PointerOps.resize(Sz);
5035 auto *POIter = PointerOps.begin();
5036 for (Value *V : VL) {
5037 auto *L = dyn_cast<LoadInst>(V);
5038 if (!L || !L->isSimple())
5039 return LoadsState::Gather;
5040 *POIter = L->getPointerOperand();
5041 ++POIter;
5042 }
5043
5044 Order.clear();
5045 // Check the order of pointer operands or that all pointers are the same.
5046 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5047
5048 auto *VecTy = getWidenedType(ScalarTy, Sz);
5049 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5050 if (!IsSorted) {
5051 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5052 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5053 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5055 }
5056
5057 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5058 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5059 return LoadsState::Gather;
5060
5061 if (!all_of(PointerOps, [&](Value *P) {
5062 return arePointersCompatible(P, PointerOps.front(), *TLI);
5063 }))
5064 return LoadsState::Gather;
5065
5066 } else {
5067 Value *Ptr0;
5068 Value *PtrN;
5069 if (Order.empty()) {
5070 Ptr0 = PointerOps.front();
5071 PtrN = PointerOps.back();
5072 } else {
5073 Ptr0 = PointerOps[Order.front()];
5074 PtrN = PointerOps[Order.back()];
5075 }
5076 std::optional<int> Diff =
5077 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5078 // Check that the sorted loads are consecutive.
5079 if (static_cast<unsigned>(*Diff) == Sz - 1)
5080 return LoadsState::Vectorize;
5081 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5082 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5083 return LoadsState::Gather;
5084 // Simple check if not a strided access - clear order.
5085 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5086 // Try to generate strided load node if:
5087 // 1. Target with strided load support is detected.
5088 // 2. The number of loads is greater than MinProfitableStridedLoads,
5089 // or the potential stride <= MaxProfitableLoadStride and the
5090 // potential stride is power-of-2 (to avoid perf regressions for the very
5091 // small number of loads) and max distance > number of loads, or potential
5092 // stride is -1.
5093 // 3. The loads are ordered, or number of unordered loads <=
5094 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5095 // (this check is to avoid extra costs for very expensive shuffles).
5096 // 4. Any pointer operand is an instruction with the users outside of the
5097 // current graph (for masked gathers extra extractelement instructions
5098 // might be required).
5099 auto IsAnyPointerUsedOutGraph =
5100 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5101 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5102 return !getTreeEntry(U) && !MustGather.contains(U);
5103 });
5104 });
5105 const unsigned AbsoluteDiff = std::abs(*Diff);
5106 if (IsPossibleStrided &&
5107 (IsAnyPointerUsedOutGraph ||
5108 (AbsoluteDiff > Sz &&
5110 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5111 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5112 *Diff == -(static_cast<int>(Sz) - 1))) {
5113 int Stride = *Diff / static_cast<int>(Sz - 1);
5114 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5115 Align Alignment =
5116 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5117 ->getAlign();
5118 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5119 // Iterate through all pointers and check if all distances are
5120 // unique multiples of Dist.
5121 SmallSet<int, 4> Dists;
5122 for (Value *Ptr : PointerOps) {
5123 int Dist = 0;
5124 if (Ptr == PtrN)
5125 Dist = *Diff;
5126 else if (Ptr != Ptr0)
5127 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5128 // If the strides are not the same or repeated, we can't
5129 // vectorize.
5130 if (((Dist / Stride) * Stride) != Dist ||
5131 !Dists.insert(Dist).second)
5132 break;
5133 }
5134 if (Dists.size() == Sz)
5136 }
5137 }
5138 }
5139 }
5140 // Compare the cost of loads + shuffles against strided/masked gather loads.
5141 // Returns true if the vectorized + shuffles representation is better than
5142 // just a gather.
5143 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5144 unsigned *BestVF,
5145 bool ProfitableGatherPointers) {
5146 if (BestVF)
5147 *BestVF = 0;
5148 // Compare masked gather cost and loads + insert subvector costs.
5150 auto [ScalarGEPCost, VectorGEPCost] =
5151 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5152 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5153 // Estimate the cost of masked gather GEP. If not a splat, roughly
5154 // estimate as a buildvector, otherwise estimate as splat.
5155 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5156 VectorType *PtrVecTy =
5157 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5158 VecTy->getNumElements());
5159 if (static_cast<unsigned>(count_if(
5160 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5161 any_of(PointerOps, [&](Value *V) {
5162 return getUnderlyingObject(V) !=
5163 getUnderlyingObject(PointerOps.front());
5164 }))
5165 VectorGEPCost += TTI.getScalarizationOverhead(
5166 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5167 else
5168 VectorGEPCost +=
5170 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5171 /*Insert=*/true, /*Extract=*/false, CostKind) +
5173 // The cost of scalar loads.
5174 InstructionCost ScalarLoadsCost =
5175 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5176 [&](InstructionCost C, Value *V) {
5177 return C + TTI.getInstructionCost(
5178 cast<Instruction>(V), CostKind);
5179 }) +
5180 ScalarGEPCost;
5181 // The cost of masked gather.
5182 InstructionCost MaskedGatherCost =
5184 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5185 /*VariableMask=*/false, CommonAlignment, CostKind) +
5186 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5187 InstructionCost GatherCost =
5188 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5189 /*Extract=*/false, CostKind) +
5190 ScalarLoadsCost;
5191 // The list of loads is small, or we already performed a partial check -
5192 // directly compare the masked gather cost and the gather cost.
5193 constexpr unsigned ListLimit = 4;
5194 if (!TryRecursiveCheck || VL.size() < ListLimit)
5195 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5196
5197 // FIXME: The following code has not been updated for non-power-of-2
5198 // vectors (and not whole registers). The splitting logic here does not
5199 // cover the original vector if the vector factor is not a power of two.
5200 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5201 return false;
5202
5203 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5204 unsigned MinVF = getMinVF(2 * Sz);
5205 DemandedElts.clearAllBits();
5206 // Iterate through possible vectorization factors and check if vectorized +
5207 // shuffles is better than just gather.
5208 for (unsigned VF =
5209 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5210 VF >= MinVF;
5211 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5213 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5214 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5216 SmallVector<Value *> PointerOps;
5217 LoadsState LS =
5218 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5219 /*TryRecursiveCheck=*/false);
5220 // Check that the sorted loads are consecutive.
5221 if (LS == LoadsState::Gather) {
5222 if (BestVF) {
5223 DemandedElts.setAllBits();
5224 break;
5225 }
5226 DemandedElts.setBits(Cnt, Cnt + VF);
5227 continue;
5228 }
5229 // If a reorder is needed - consider it a high-cost masked gather for now.
5230 if ((LS == LoadsState::Vectorize ||
5232 !Order.empty() && !isReverseOrder(Order))
5234 States.push_back(LS);
5235 }
5236 if (DemandedElts.isAllOnes())
5237 // All loads gathered - try smaller VF.
5238 continue;
5239 // Can be vectorized later as a series of loads/insertelements.
5240 InstructionCost VecLdCost = 0;
5241 if (!DemandedElts.isZero()) {
5242 VecLdCost =
5243 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5244 /*Extract=*/false, CostKind) +
5245 ScalarGEPCost;
5246 for (unsigned Idx : seq<unsigned>(VL.size()))
5247 if (DemandedElts[Idx])
5248 VecLdCost +=
5249 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5250 }
5251 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5252 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5253 for (auto [I, LS] : enumerate(States)) {
5254 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5255 InstructionCost VectorGEPCost =
5256 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5257 ? 0
5258 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5259 LI0->getPointerOperand(),
5260 Instruction::GetElementPtr, CostKind, ScalarTy,
5261 SubVecTy)
5262 .second;
5263 if (LS == LoadsState::ScatterVectorize) {
5264 if (static_cast<unsigned>(
5265 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5266 PointerOps.size() - 1 ||
5267 any_of(PointerOps, [&](Value *V) {
5268 return getUnderlyingObject(V) !=
5269 getUnderlyingObject(PointerOps.front());
5270 }))
5271 VectorGEPCost += TTI.getScalarizationOverhead(
5272 SubVecTy, APInt::getAllOnes(VF),
5273 /*Insert=*/true, /*Extract=*/false, CostKind);
5274 else
5275 VectorGEPCost +=
5277 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5278 /*Insert=*/true, /*Extract=*/false, CostKind) +
5279 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5280 CostKind);
5281 }
5282 switch (LS) {
5284 VecLdCost +=
5285 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5286 LI0->getPointerAddressSpace(), CostKind,
5288 VectorGEPCost;
5289 break;
5291 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5292 LI0->getPointerOperand(),
5293 /*VariableMask=*/false,
5294 CommonAlignment, CostKind) +
5295 VectorGEPCost;
5296 break;
5298 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5299 LI0->getPointerOperand(),
5300 /*VariableMask=*/false,
5301 CommonAlignment, CostKind) +
5302 VectorGEPCost;
5303 break;
5304 case LoadsState::Gather:
5305 // Gathers are already calculated - ignore.
5306 continue;
5307 }
5308 SmallVector<int> ShuffleMask(VL.size());
5309 for (int Idx : seq<int>(0, VL.size()))
5310 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5311 if (I > 0)
5312 VecLdCost +=
5313 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5314 CostKind, I * VF, SubVecTy);
5315 }
5316 // If the masked gather cost is higher, it is better to vectorize, so
5317 // consider it as a gather node. It will be estimated more precisely
5318 // later.
5319 if (MaskedGatherCost >= VecLdCost &&
5320 VecLdCost - GatherCost < -SLPCostThreshold) {
5321 if (BestVF)
5322 *BestVF = VF;
5323 return true;
5324 }
5325 }
5326 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5327 };
5328 // TODO: need to improve analysis of the pointers, if not all of them are
5329 // GEPs or have > 2 operands, we end up with a gather node, which just
5330 // increases the cost.
5331 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5332 bool ProfitableGatherPointers =
5333 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5334 return L->isLoopInvariant(V);
5335 })) <= Sz / 2;
5336 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5337 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5338 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5339 (GEP && GEP->getNumOperands() == 2 &&
5340 isa<Constant, Instruction>(GEP->getOperand(1)));
5341 })) {
5342 // Check if potential masked gather can be represented as series
5343 // of loads + insertsubvectors.
5344 // If the masked gather cost is higher, it is better to vectorize, so
5345 // consider it as a gather node. It will be estimated more precisely
5346 // later.
5347 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5348 ProfitableGatherPointers))
5350 }
5351
5352 return LoadsState::Gather;
5353}
5354
5356 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5357 const DataLayout &DL, ScalarEvolution &SE,
5358 SmallVectorImpl<unsigned> &SortedIndices) {
5359 assert(
5360 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5361 "Expected list of pointer operands.");
5362 // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples. We insert each
5363 // Ptr into this map, sort each vector by offset, and return the sorted indices
5364 // so that related values end up next to one another.
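 // E.g., pointers {A, A+1, A+2, A+3, B, B+1, B+2, B+3} with two different
 // underlying objects A and B are grouped into two bases, each sorted by
 // offset, so the returned order keeps each base's pointers adjacent.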
5367 Bases;
5368 Bases
5369 .try_emplace(std::make_pair(
5371 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5372
5373 SortedIndices.clear();
5374 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5375 auto Key = std::make_pair(BBs[Cnt + 1],
5377 bool Found = any_of(Bases.try_emplace(Key).first->second,
5378 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5379 std::optional<int> Diff = getPointersDiff(
5380 ElemTy, std::get<0>(Base.front()), ElemTy,
5381 Ptr, DL, SE,
5382 /*StrictCheck=*/true);
5383 if (!Diff)
5384 return false;
5385
5386 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5387 return true;
5388 });
5389
5390 if (!Found) {
5391 // If we haven't found enough to usefully cluster, return early.
5392 if (Bases.size() > VL.size() / 2 - 1)
5393 return false;
5394
5395 // Not found already - add a new Base
5396 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5397 }
5398 }
5399
5400 if (Bases.size() == VL.size())
5401 return false;
5402
5403 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5404 Bases.front().second.size() == VL.size()))
5405 return false;
5406
5407 // For each of the bases sort the pointers by Offset and check if any of the
5408 // bases become consecutively allocated.
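 // ComparePointers orders two base pointers deterministically by walking up
 // their chains of underlying objects and checking which chain reaches the
 // other pointer first.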
5409 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5410 SmallPtrSet<Value *, 13> FirstPointers;
5411 SmallPtrSet<Value *, 13> SecondPointers;
5412 Value *P1 = Ptr1;
5413 Value *P2 = Ptr2;
5414 unsigned Depth = 0;
5415 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5416 if (P1 == P2 || Depth > RecursionMaxDepth)
5417 return false;
5418 FirstPointers.insert(P1);
5419 SecondPointers.insert(P2);
5420 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5421 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5422 ++Depth;
5423 }
5424 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5425 "Unable to find matching root.");
5426 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5427 };
5428 for (auto &Base : Bases) {
5429 for (auto &Vec : Base.second) {
5430 if (Vec.size() > 1) {
5431 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5432 const std::tuple<Value *, int, unsigned> &Y) {
5433 return std::get<1>(X) < std::get<1>(Y);
5434 });
5435 int InitialOffset = std::get<1>(Vec[0]);
5436 bool AnyConsecutive =
5437 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5438 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5439 });
5440 // Fill the SortedIndices array only if it looks worthwhile to sort the
5441 // pointers.
5442 if (!AnyConsecutive)
5443 return false;
5444 }
5445 }
5446 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5447 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5448 });
5449 }
5450
5451 for (auto &T : Bases)
5452 for (const auto &Vec : T.second)
5453 for (const auto &P : Vec)
5454 SortedIndices.push_back(std::get<2>(P));
5455
5456 assert(SortedIndices.size() == VL.size() &&
5457 "Expected SortedIndices to be the size of VL");
5458 return true;
5459}
5460
5461std::optional<BoUpSLP::OrdersType>
5462BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5463 assert(TE.isGather() && "Expected gather node only.");
5464 Type *ScalarTy = TE.Scalars[0]->getType();
5465
5467 Ptrs.reserve(TE.Scalars.size());
5469 BBs.reserve(TE.Scalars.size());
5470 for (Value *V : TE.Scalars) {
5471 auto *L = dyn_cast<LoadInst>(V);
5472 if (!L || !L->isSimple())
5473 return std::nullopt;
5474 Ptrs.push_back(L->getPointerOperand());
5475 BBs.push_back(L->getParent());
5476 }
5477
5478 BoUpSLP::OrdersType Order;
5479 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5480 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5481 return std::move(Order);
5482 return std::nullopt;
5483}
5484
5485/// Check if two insertelement instructions are from the same buildvector.
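/// (i.e. whether they are links of one insertelement chain that fills distinct
/// lanes of the same vector value).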
5488 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5489 // Instructions must be from the same basic block.
5490 if (VU->getParent() != V->getParent())
5491 return false;
5492 // Checks if 2 insertelements are from the same buildvector.
5493 if (VU->getType() != V->getType())
5494 return false;
5495 // Multiple used inserts are separate nodes.
5496 if (!VU->hasOneUse() && !V->hasOneUse())
5497 return false;
5498 auto *IE1 = VU;
5499 auto *IE2 = V;
5500 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5501 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5502 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5503 return false;
5504 // Go through the vector operand of insertelement instructions trying to find
5505 // either VU as the original vector for IE2 or V as the original vector for
5506 // IE1.
5507 SmallBitVector ReusedIdx(
5508 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5509 bool IsReusedIdx = false;
5510 do {
5511 if (IE2 == VU && !IE1)
5512 return VU->hasOneUse();
5513 if (IE1 == V && !IE2)
5514 return V->hasOneUse();
5515 if (IE1 && IE1 != V) {
5516 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5517 IsReusedIdx |= ReusedIdx.test(Idx1);
5518 ReusedIdx.set(Idx1);
5519 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5520 IE1 = nullptr;
5521 else
5522 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5523 }
5524 if (IE2 && IE2 != VU) {
5525 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5526 IsReusedIdx |= ReusedIdx.test(Idx2);
5527 ReusedIdx.set(Idx2);
5528 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5529 IE2 = nullptr;
5530 else
5531 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5532 }
5533 } while (!IsReusedIdx && (IE1 || IE2));
5534 return false;
5535}
5536
5537std::optional<BoUpSLP::OrdersType>
5538BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5539 // No need to reorder if we need to shuffle reuses; we still need to shuffle
5540 // the node.
5541 if (!TE.ReuseShuffleIndices.empty()) {
5542 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5543 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5544 "Reshuffling scalars not yet supported for nodes with padding");
5545
5546 if (isSplat(TE.Scalars))
5547 return std::nullopt;
5548 // Check if reuse shuffle indices can be improved by reordering.
5549 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5550 // is used once in each submask of size <number_of_scalars>.
5551 // Example: 4 scalar values.
5552 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5553 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5554 // element 3 is used twice in the second submask.
5555 unsigned Sz = TE.Scalars.size();
5556 if (TE.isGather()) {
5557 if (std::optional<OrdersType> CurrentOrder =
5559 SmallVector<int> Mask;
5560 fixupOrderingIndices(*CurrentOrder);
5561 inversePermutation(*CurrentOrder, Mask);
5562 ::addMask(Mask, TE.ReuseShuffleIndices);
5563 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5564 unsigned Sz = TE.Scalars.size();
5565 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5566 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5567 if (Idx != PoisonMaskElem)
5568 Res[Idx + K * Sz] = I + K * Sz;
5569 }
5570 return std::move(Res);
5571 }
5572 }
5573 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5574 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5575 2 * TE.getVectorFactor())) == 1)
5576 return std::nullopt;
5577 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5578 Sz)) {
5579 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5580 if (TE.ReorderIndices.empty())
5581 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5582 else
5583 inversePermutation(TE.ReorderIndices, ReorderMask);
5584 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5585 unsigned VF = ReorderMask.size();
5586 OrdersType ResOrder(VF, VF);
5587 unsigned NumParts = divideCeil(VF, Sz);
5588 SmallBitVector UsedVals(NumParts);
5589 for (unsigned I = 0; I < VF; I += Sz) {
5590 int Val = PoisonMaskElem;
5591 unsigned UndefCnt = 0;
5592 unsigned Limit = std::min(Sz, VF - I);
5593 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5594 [&](int Idx) {
5595 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5596 Val = Idx;
5597 if (Idx == PoisonMaskElem)
5598 ++UndefCnt;
5599 return Idx != PoisonMaskElem && Idx != Val;
5600 }) ||
5601 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5602 UndefCnt > Sz / 2)
5603 return std::nullopt;
5604 UsedVals.set(Val);
5605 for (unsigned K = 0; K < NumParts; ++K) {
5606 unsigned Idx = Val + Sz * K;
5607 if (Idx < VF)
5608 ResOrder[Idx] = I + K;
5609 }
5610 }
5611 return std::move(ResOrder);
5612 }
5613 unsigned VF = TE.getVectorFactor();
5614 // Try to build the correct order for extractelement instructions.
5615 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5616 TE.ReuseShuffleIndices.end());
5617 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5618 all_of(TE.Scalars, [Sz](Value *V) {
5619 if (isa<PoisonValue>(V))
5620 return true;
5621 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5622 return Idx && *Idx < Sz;
5623 })) {
5624 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5625 "by BinaryOperator and CastInst.");
5626 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5627 if (TE.ReorderIndices.empty())
5628 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5629 else
5630 inversePermutation(TE.ReorderIndices, ReorderMask);
5631 for (unsigned I = 0; I < VF; ++I) {
5632 int &Idx = ReusedMask[I];
5633 if (Idx == PoisonMaskElem)
5634 continue;
5635 Value *V = TE.Scalars[ReorderMask[Idx]];
5636 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5637 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5638 }
5639 }
5640 // Build the order of VF size; the reuse shuffles need to be reordered, and
5641 // they are always of VF size.
5642 OrdersType ResOrder(VF);
5643 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5644 auto *It = ResOrder.begin();
5645 for (unsigned K = 0; K < VF; K += Sz) {
5646 OrdersType CurrentOrder(TE.ReorderIndices);
5647 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5648 if (SubMask.front() == PoisonMaskElem)
5649 std::iota(SubMask.begin(), SubMask.end(), 0);
5650 reorderOrder(CurrentOrder, SubMask);
5651 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5652 std::advance(It, Sz);
5653 }
5654 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5655 return Data.index() == Data.value();
5656 }))
5657 return std::nullopt; // No need to reorder.
5658 return std::move(ResOrder);
5659 }
5660 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5661 any_of(TE.UserTreeIndices,
5662 [](const EdgeInfo &EI) {
5663 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5664 }) &&
5665 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5666 return std::nullopt;
5667 if ((TE.State == TreeEntry::Vectorize ||
5668 TE.State == TreeEntry::StridedVectorize) &&
5669 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5670 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5671 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5672 "BinaryOperator and CastInst.");
5673 return TE.ReorderIndices;
5674 }
5675 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5676 if (!TE.ReorderIndices.empty())
5677 return TE.ReorderIndices;
5678
5679 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5680 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5681 if (!V->hasNUsesOrMore(1))
5682 continue;
5683 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5684 if (!II)
5685 continue;
5686 Instruction *BVHead = nullptr;
5687 BasicBlock *BB = II->getParent();
5688 while (II && II->hasOneUse() && II->getParent() == BB) {
5689 BVHead = II;
5690 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5691 }
5692 I = BVHead;
5693 }
5694
5695 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5696 assert(BB1 != BB2 && "Expected different basic blocks.");
5697 auto *NodeA = DT->getNode(BB1);
5698 auto *NodeB = DT->getNode(BB2);
5699 assert(NodeA && "Should only process reachable instructions");
5700 assert(NodeB && "Should only process reachable instructions");
5701 assert((NodeA == NodeB) ==
5702 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5703 "Different nodes should have different DFS numbers");
5704 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5705 };
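 // Compare two phi results by their first users: poison values and less-used
 // values come first; values whose users build the same vector are ordered by
 // insertion lane or program order, and values extracted from vectors are
 // ordered by source and extraction lane.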
5706 auto PHICompare = [&](unsigned I1, unsigned I2) {
5707 Value *V1 = TE.Scalars[I1];
5708 Value *V2 = TE.Scalars[I2];
5709 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5710 return false;
5711 if (isa<PoisonValue>(V1))
5712 return true;
5713 if (isa<PoisonValue>(V2))
5714 return false;
5715 if (V1->getNumUses() < V2->getNumUses())
5716 return true;
5717 if (V1->getNumUses() > V2->getNumUses())
5718 return false;
5719 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5720 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5721 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5722 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5723 FirstUserOfPhi2->getParent());
5724 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5725 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5726 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5727 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5728 if (IE1 && !IE2)
5729 return true;
5730 if (!IE1 && IE2)
5731 return false;
5732 if (IE1 && IE2) {
5733 if (UserBVHead[I1] && !UserBVHead[I2])
5734 return true;
5735 if (!UserBVHead[I1])
5736 return false;
5737 if (UserBVHead[I1] == UserBVHead[I2])
5738 return getElementIndex(IE1) < getElementIndex(IE2);
5739 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5740 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5741 UserBVHead[I2]->getParent());
5742 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5743 }
5744 if (EE1 && !EE2)
5745 return true;
5746 if (!EE1 && EE2)
5747 return false;
5748 if (EE1 && EE2) {
5749 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5750 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5751 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5752 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5753 if (!Inst2 && !P2)
5754 return Inst1 || P1;
5755 if (EE1->getOperand(0) == EE2->getOperand(0))
5756 return getElementIndex(EE1) < getElementIndex(EE2);
5757 if (!Inst1 && Inst2)
5758 return false;
5759 if (Inst1 && Inst2) {
5760 if (Inst1->getParent() != Inst2->getParent())
5761 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5762 return Inst1->comesBefore(Inst2);
5763 }
5764 if (!P1 && P2)
5765 return false;
5766 assert(P1 && P2 &&
5767 "Expected either instructions or arguments vector operands.");
5768 return P1->getArgNo() < P2->getArgNo();
5769 }
5770 return false;
5771 };
5772 OrdersType Phis(TE.Scalars.size());
5773 std::iota(Phis.begin(), Phis.end(), 0);
5774 stable_sort(Phis, PHICompare);
5775 if (isIdentityOrder(Phis))
5776 return std::nullopt; // No need to reorder.
5777 return std::move(Phis);
5778 }
5779 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5780 allSameType(TE.Scalars)) {
5781 // TODO: add analysis of other gather nodes with extractelement
5782 // instructions and other values/instructions, not only undefs.
5783 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5784 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5785 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5786 all_of(TE.Scalars, [](Value *V) {
5787 auto *EE = dyn_cast<ExtractElementInst>(V);
5788 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5789 })) {
5790 // Check that gather of extractelements can be represented as
5791 // just a shuffle of a single vector.
5792 OrdersType CurrentOrder;
5793 bool Reuse =
5794 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
5795 if (Reuse || !CurrentOrder.empty())
5796 return std::move(CurrentOrder);
5797 }
5798 // If the gather node is <undef, v, .., poison> and
5799 // insertelement poison, v, 0 [+ permute]
5800 // is cheaper than
5801 // insertelement poison, v, n - try to reorder.
5802 // If rotating the whole graph, exclude the permute cost, the whole graph
5803 // might be transformed.
5804 int Sz = TE.Scalars.size();
5805 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5806 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5807 const auto *It =
5808 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5809 if (It == TE.Scalars.begin())
5810 return OrdersType();
5811 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5812 if (It != TE.Scalars.end()) {
5813 OrdersType Order(Sz, Sz);
5814 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5815 Order[Idx] = 0;
5816 fixupOrderingIndices(Order);
5817 SmallVector<int> Mask;
5818 inversePermutation(Order, Mask);
5819 InstructionCost PermuteCost =
5820 TopToBottom
5821 ? 0
5823 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5824 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5825 PoisonValue::get(Ty), *It);
5826 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5827 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5828 PoisonValue::get(Ty), *It);
5829 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5830 OrdersType Order(Sz, Sz);
5831 Order[Idx] = 0;
5832 return std::move(Order);
5833 }
5834 }
5835 }
5836 if (isSplat(TE.Scalars))
5837 return std::nullopt;
5838 if (TE.Scalars.size() >= 3)
5839 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5840 return Order;
5841 // Check if we can include the order of vectorized loads. For masked gathers
5842 // we do extra analysis later, so include such nodes into a special list.
5843 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5844 SmallVector<Value *> PointerOps;
5845 OrdersType CurrentOrder;
5846 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5847 CurrentOrder, PointerOps);
5849 return std::move(CurrentOrder);
5850 }
5851 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5852 // has been audited for correctness with non-power-of-two vectors.
5853 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5854 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5855 return CurrentOrder;
5856 }
5857 return std::nullopt;
5858}
5859
5860/// Checks if the given mask is a "clustered" mask with the same clusters of
5861/// size \p Sz, which are not identity submasks.
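/// E.g., with \p Sz == 4 the mask <1,0,3,2, 1,0,3,2> is such a mask, while
/// <0,1,2,3, 0,1,2,3> is not, because its cluster is an identity submask.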
5863 unsigned Sz) {
5864 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5865 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5866 return false;
5867 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5868 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5869 if (Cluster != FirstCluster)
5870 return false;
5871 }
5872 return true;
5873}
5874
5875void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5876 // Reorder reuses mask.
5877 reorderReuses(TE.ReuseShuffleIndices, Mask);
5878 const unsigned Sz = TE.Scalars.size();
5879 // For vectorized nodes and non-clustered reuses no need to do anything else.
5880 if (!TE.isGather() ||
5882 Sz) ||
5883 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5884 return;
5885 SmallVector<int> NewMask;
5886 inversePermutation(TE.ReorderIndices, NewMask);
5887 addMask(NewMask, TE.ReuseShuffleIndices);
5888 // Clear reorder since it is going to be applied to the new mask.
5889 TE.ReorderIndices.clear();
5890 // Try to improve gathered nodes with clustered reuses, if possible.
5891 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5892 SmallVector<unsigned> NewOrder(Slice);
5893 inversePermutation(NewOrder, NewMask);
5894 reorderScalars(TE.Scalars, NewMask);
5895 // Fill the reuses mask with the identity submasks.
5896 for (auto *It = TE.ReuseShuffleIndices.begin(),
5897 *End = TE.ReuseShuffleIndices.end();
5898 It != End; std::advance(It, Sz))
5899 std::iota(It, std::next(It, Sz), 0);
5900}
5901
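/// Fills the unassigned slots of \p Order (entries equal to its size): from
/// \p SecondaryOrder when it is provided and the candidate index is not already
/// used, otherwise with the slot's own index when that index is unused.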
5903 ArrayRef<unsigned> SecondaryOrder) {
5904 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5905 "Expected same size of orders");
5906 unsigned Sz = Order.size();
5907 SmallBitVector UsedIndices(Sz);
5908 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5909 if (Order[Idx] != Sz)
5910 UsedIndices.set(Order[Idx]);
5911 }
5912 if (SecondaryOrder.empty()) {
5913 for (unsigned Idx : seq<unsigned>(0, Sz))
5914 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5915 Order[Idx] = Idx;
5916 } else {
5917 for (unsigned Idx : seq<unsigned>(0, Sz))
5918 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5919 !UsedIndices.test(SecondaryOrder[Idx]))
5920 Order[Idx] = SecondaryOrder[Idx];
5921 }
5922}
5923
5925 // Maps VF to the graph nodes.
5927 // ExtractElement gather nodes which can be vectorized and need to handle
5928 // their ordering.
5930
5931 // Phi nodes can have preferred ordering based on their result users
5933
5934 // AltShuffles can also have a preferred ordering that leads to fewer
5935 // instructions, e.g., the addsub instruction in x86.
5936 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5937
5938 // Maps a TreeEntry to the reorder indices of external users.
5940 ExternalUserReorderMap;
5941 // Find all reorderable nodes with the given VF.
5942 // Currently these are vectorized stores, loads, extracts + some gathering of
5943 // extracts.
5944 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5945 const std::unique_ptr<TreeEntry> &TE) {
5946 // Look for external users that will probably be vectorized.
5947 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5948 findExternalStoreUsersReorderIndices(TE.get());
5949 if (!ExternalUserReorderIndices.empty()) {
5950 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5951 ExternalUserReorderMap.try_emplace(TE.get(),
5952 std::move(ExternalUserReorderIndices));
5953 }
5954
5955 // Patterns like [fadd,fsub] can be combined into a single instruction in
5956 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5957 // to take into account their order when looking for the most used order.
5958 if (TE->hasState() && TE->isAltShuffle()) {
5959 VectorType *VecTy =
5960 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5961 unsigned Opcode0 = TE->getOpcode();
5962 unsigned Opcode1 = TE->getAltOpcode();
5963 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5964 // If this pattern is supported by the target then we consider the order.
5965 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5966 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5967 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5968 }
5969 // TODO: Check the reverse order too.
5970 }
5971
5972 if (std::optional<OrdersType> CurrentOrder =
5973 getReorderingData(*TE, /*TopToBottom=*/true)) {
5974 // Do not include ordering for nodes used in the alt opcode vectorization; it
5975 // is better to reorder them during the bottom-to-top stage. If we follow the
5976 // order here, it causes reordering of the whole graph, though actually it is
5977 // profitable just to reorder the subgraph that starts from the alternate
5978 // opcode vectorization node. Such nodes already end up with a shuffle
5979 // instruction, and it is enough to change this shuffle rather than
5980 // rotate the scalars for the whole graph.
5981 unsigned Cnt = 0;
5982 const TreeEntry *UserTE = TE.get();
5983 while (UserTE && Cnt < RecursionMaxDepth) {
5984 if (UserTE->UserTreeIndices.size() != 1)
5985 break;
5986 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5987 return EI.UserTE->State == TreeEntry::Vectorize &&
5988 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5989 }))
5990 return;
5991 UserTE = UserTE->UserTreeIndices.back().UserTE;
5992 ++Cnt;
5993 }
5994 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5995 if (!(TE->State == TreeEntry::Vectorize ||
5996 TE->State == TreeEntry::StridedVectorize) ||
5997 !TE->ReuseShuffleIndices.empty())
5998 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5999 if (TE->State == TreeEntry::Vectorize &&
6000 TE->getOpcode() == Instruction::PHI)
6001 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
6002 }
6003 });
6004
6005 // Reorder the graph nodes according to their vectorization factor.
6006 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
6007 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6008 auto It = VFToOrderedEntries.find(VF);
6009 if (It == VFToOrderedEntries.end())
6010 continue;
6011 // Try to find the most profitable order. We are just looking for the most
6012 // used order and reorder the scalar elements in the nodes according to this
6013 // most used order.
6014 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
6015 // Delete VF entry upon exit.
6016 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6017
6018 // All operands are reordered and used only in this node - propagate the
6019 // most used order to the user node.
6020 MapVector<OrdersType, unsigned,
6021 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6022 OrdersUses;
6024 for (const TreeEntry *OpTE : OrderedEntries) {
6025 // No need to reorder these nodes; we still need to extend and use a shuffle,
6026 // just merge the reordering shuffle and the reuse shuffle.
6027 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6028 continue;
6029 // Count number of orders uses.
6030 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6031 &PhisToOrders]() -> const OrdersType & {
6032 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6033 auto It = GathersToOrders.find(OpTE);
6034 if (It != GathersToOrders.end())
6035 return It->second;
6036 }
6037 if (OpTE->hasState() && OpTE->isAltShuffle()) {
6038 auto It = AltShufflesToOrders.find(OpTE);
6039 if (It != AltShufflesToOrders.end())
6040 return It->second;
6041 }
6042 if (OpTE->State == TreeEntry::Vectorize &&
6043 OpTE->getOpcode() == Instruction::PHI) {
6044 auto It = PhisToOrders.find(OpTE);
6045 if (It != PhisToOrders.end())
6046 return It->second;
6047 }
6048 return OpTE->ReorderIndices;
6049 }();
6050 // First consider the order of the external scalar users.
6051 auto It = ExternalUserReorderMap.find(OpTE);
6052 if (It != ExternalUserReorderMap.end()) {
6053 const auto &ExternalUserReorderIndices = It->second;
6054 // If the OpTE vector factor != number of scalars - use natural order,
6055 // it is an attempt to reorder node with reused scalars but with
6056 // external uses.
6057 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6058 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6059 ExternalUserReorderIndices.size();
6060 } else {
6061 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6062 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6063 }
6064 // No other useful reorder data in this entry.
6065 if (Order.empty())
6066 continue;
6067 }
6068 // Stores actually store the mask, not the order, need to invert.
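// E.g., if such a store node holds the mask {1, 2, 0}, the inverted value
// counted below as its order is {2, 0, 1}.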
6069 if (OpTE->State == TreeEntry::Vectorize &&
6070 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6071 assert(!OpTE->isAltShuffle() &&
6072 "Alternate instructions are only supported by BinaryOperator "
6073 "and CastInst.");
6074 SmallVector<int> Mask;
6075 inversePermutation(Order, Mask);
6076 unsigned E = Order.size();
6077 OrdersType CurrentOrder(E, E);
6078 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6079 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6080 });
6081 fixupOrderingIndices(CurrentOrder);
6082 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6083 } else {
6084 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6085 }
6086 }
6087 if (OrdersUses.empty())
6088 continue;
6089 // Choose the most used order.
6090 unsigned IdentityCnt = 0;
6091 unsigned FilledIdentityCnt = 0;
6092 OrdersType IdentityOrder(VF, VF);
6093 for (auto &Pair : OrdersUses) {
6094 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6095 if (!Pair.first.empty())
6096 FilledIdentityCnt += Pair.second;
6097 IdentityCnt += Pair.second;
6098 combineOrders(IdentityOrder, Pair.first);
6099 }
6100 }
6101 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6102 unsigned Cnt = IdentityCnt;
6103 for (auto &Pair : OrdersUses) {
6104 // Prefer the identity order. But if a filled (non-empty) identity order was
6105 // found with the same number of uses as the new candidate order, we can
6106 // choose this candidate order.
6107 if (Cnt < Pair.second ||
6108 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6109 Cnt == Pair.second && !BestOrder.empty() &&
6110 isIdentityOrder(BestOrder))) {
6111 combineOrders(Pair.first, BestOrder);
6112 BestOrder = Pair.first;
6113 Cnt = Pair.second;
6114 } else {
6115 combineOrders(BestOrder, Pair.first);
6116 }
6117 }
6118 // Set order of the user node.
6119 if (isIdentityOrder(BestOrder))
6120 continue;
6121 fixupOrderingIndices(BestOrder);
6122 SmallVector<int> Mask;
6123 inversePermutation(BestOrder, Mask);
6124 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6125 unsigned E = BestOrder.size();
6126 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6127 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6128 });
6129 // Do an actual reordering, if profitable.
6130 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6131 // Just do the reordering for the nodes with the given VF.
6132 if (TE->Scalars.size() != VF) {
6133 if (TE->ReuseShuffleIndices.size() == VF) {
6134 // Need to reorder the reuses masks of the operands with smaller VF to
6135 // be able to find the match between the graph nodes and scalar
6136 // operands of the given node during vectorization/cost estimation.
6137 assert(all_of(TE->UserTreeIndices,
6138 [VF, &TE](const EdgeInfo &EI) {
6139 return EI.UserTE->Scalars.size() == VF ||
6140 EI.UserTE->Scalars.size() ==
6141 TE->Scalars.size();
6142 }) &&
6143 "All users must be of VF size.");
6144 if (SLPReVec) {
6145 assert(SLPReVec && "Only supported by REVEC.");
6146 // ShuffleVectorInst does not do reorderOperands (and it should not
6147 // because ShuffleVectorInst supports only a limited set of
6148 // patterns). Only do reorderNodeWithReuses if none of the users is a
6149 // ShuffleVectorInst.
6150 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6151 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6152 }))
6153 continue;
6154 assert(none_of(TE->UserTreeIndices,
6155 [&](const EdgeInfo &EI) {
6156 return isa<ShuffleVectorInst>(
6157 EI.UserTE->getMainOp());
6158 }) &&
6159 "Does not know how to reorder.");
6160 }
6161 // Update ordering of the operands with the smaller VF than the given
6162 // one.
6163 reorderNodeWithReuses(*TE, Mask);
6164 }
6165 continue;
6166 }
6167 if ((TE->State == TreeEntry::Vectorize ||
6168 TE->State == TreeEntry::StridedVectorize) &&
6169 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
6170 InsertElementInst>(TE->getMainOp()) ||
6171 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6172 assert(!TE->isAltShuffle() &&
6173 "Alternate instructions are only supported by BinaryOperator "
6174 "and CastInst.");
6175 // Build correct orders for extract{element,value}, loads and
6176 // stores.
6177 reorderOrder(TE->ReorderIndices, Mask);
6178 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6179 TE->reorderOperands(Mask);
6180 } else {
6181 // Reorder the node and its operands.
6182 TE->reorderOperands(Mask);
6183 assert(TE->ReorderIndices.empty() &&
6184 "Expected empty reorder sequence.");
6185 reorderScalars(TE->Scalars, Mask);
6186 }
6187 if (!TE->ReuseShuffleIndices.empty()) {
6188 // Apply reversed order to keep the original ordering of the reused
6189 // elements to avoid extra reorder indices shuffling.
6190 OrdersType CurrentOrder;
6191 reorderOrder(CurrentOrder, MaskOrder);
6192 SmallVector<int> NewReuses;
6193 inversePermutation(CurrentOrder, NewReuses);
6194 addMask(NewReuses, TE->ReuseShuffleIndices);
6195 TE->ReuseShuffleIndices.swap(NewReuses);
6196 }
6197 }
6198 }
6199}
6200
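// Checks whether the operands of \p UserTE can be reordered: vectorized
// operand nodes used only by \p UserTE are collected into Edges, and
// reorderable gather-like operands into GatherOps. Returns false if an
// operand node is shared with another user, or if more than one
// non-vectorized node matches the same operand (unless that operand is
// all-constant).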
6201bool BoUpSLP::canReorderOperands(
6202 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6203 ArrayRef<TreeEntry *> ReorderableGathers,
6204 SmallVectorImpl<TreeEntry *> &GatherOps) {
6205 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6206 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6207 return OpData.first == I &&
6208 (OpData.second->State == TreeEntry::Vectorize ||
6209 OpData.second->State == TreeEntry::StridedVectorize);
6210 }))
6211 continue;
6212 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6213 // Do not reorder if operand node is used by many user nodes.
6214 if (any_of(TE->UserTreeIndices,
6215 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6216 return false;
6217 // Add the node to the list of the ordered nodes with the identity
6218 // order.
6219 Edges.emplace_back(I, TE);
6220 // Add ScatterVectorize nodes to the list of operands, where just
6221 // reordering of the scalars is required. Similar to the gathers, so
6222 // simply add to the list of gathered ops.
6223 // If there are reused scalars, process this node as a regular vectorize
6224 // node, just reorder reuses mask.
6225 if (TE->State != TreeEntry::Vectorize &&
6226 TE->State != TreeEntry::StridedVectorize &&
6227 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6228 GatherOps.push_back(TE);
6229 continue;
6230 }
6231 TreeEntry *Gather = nullptr;
6232 if (count_if(ReorderableGathers,
6233 [&Gather, UserTE, I](TreeEntry *TE) {
6234 assert(TE->State != TreeEntry::Vectorize &&
6235 TE->State != TreeEntry::StridedVectorize &&
6236 "Only non-vectorized nodes are expected.");
6237 if (any_of(TE->UserTreeIndices,
6238 [UserTE, I](const EdgeInfo &EI) {
6239 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6240 })) {
6241 assert(TE->isSame(UserTE->getOperand(I)) &&
6242 "Operand entry does not match operands.");
6243 Gather = TE;
6244 return true;
6245 }
6246 return false;
6247 }) > 1 &&
6248 !allConstant(UserTE->getOperand(I)))
6249 return false;
6250 if (Gather)
6251 GatherOps.push_back(Gather);
6252 }
6253 return true;
6254}
6255
6256void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6257 SetVector<TreeEntry *> OrderedEntries;
6258 DenseSet<const TreeEntry *> GathersToOrders;
6259 // Find all reorderable leaf nodes with the given VF.
6260 // Currently these are vectorized loads, extracts without alternate operands +
6261 // some gathering of extracts.
6262 SmallVector<TreeEntry *> NonVectorized;
6263 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6264 if (TE->State != TreeEntry::Vectorize &&
6265 TE->State != TreeEntry::StridedVectorize)
6266 NonVectorized.push_back(TE.get());
6267 if (std::optional<OrdersType> CurrentOrder =
6268 getReorderingData(*TE, /*TopToBottom=*/false)) {
6269 OrderedEntries.insert(TE.get());
6270 if (!(TE->State == TreeEntry::Vectorize ||
6271 TE->State == TreeEntry::StridedVectorize) ||
6272 !TE->ReuseShuffleIndices.empty())
6273 GathersToOrders.insert(TE.get());
6274 }
6275 }
6276
6277 // 1. Propagate order to the graph nodes, which use only reordered nodes.
6278 // I.e., if the node has operands, that are reordered, try to make at least
6279 // one operand order in the natural order and reorder others + reorder the
6280 // user node itself.
6281 SmallPtrSet<const TreeEntry *, 4> Visited;
6282 while (!OrderedEntries.empty()) {
6283 // 1. Filter out only reordered nodes.
6284 // 2. If the entry has multiple uses - skip it and jump to the next node.
6285 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
6286 SmallVector<TreeEntry *> Filtered;
6287 for (TreeEntry *TE : OrderedEntries) {
6288 if (!(TE->State == TreeEntry::Vectorize ||
6289 TE->State == TreeEntry::StridedVectorize ||
6290 (TE->isGather() && GathersToOrders.contains(TE))) ||
6291 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6292 !all_of(drop_begin(TE->UserTreeIndices),
6293 [TE](const EdgeInfo &EI) {
6294 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6295 }) ||
6296 !Visited.insert(TE).second) {
6297 Filtered.push_back(TE);
6298 continue;
6299 }
6300 // Build a map between user nodes and their operands order to speedup
6301 // search. The graph currently does not provide this dependency directly.
6302 for (EdgeInfo &EI : TE->UserTreeIndices)
6303 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6304 }
6305 // Erase filtered entries.
6306 for (TreeEntry *TE : Filtered)
6307 OrderedEntries.remove(TE);
6308 SmallVector<
6309 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6310 UsersVec(Users.begin(), Users.end());
6311 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6312 return Data1.first->Idx > Data2.first->Idx;
6313 });
6314 for (auto &Data : UsersVec) {
6315 // Check that operands are used only in the User node.
6316 SmallVector<TreeEntry *> GatherOps;
6317 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6318 GatherOps)) {
6319 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6320 OrderedEntries.remove(Op.second);
6321 continue;
6322 }
6323 // All operands are reordered and used only in this node - propagate the
6324 // most used order to the user node.
6325 MapVector<OrdersType, unsigned,
6326 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6327 OrdersUses;
6328 // Do the analysis for each tree entry only once, otherwise the order of
6329 // the same node may be considered several times, though it might not be
6330 // profitable.
6331 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6332 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6333 for (const auto &Op : Data.second) {
6334 TreeEntry *OpTE = Op.second;
6335 if (!VisitedOps.insert(OpTE).second)
6336 continue;
6337 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6338 continue;
6339 const auto Order = [&]() -> const OrdersType {
6340 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6341 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6342 .value_or(OrdersType(1));
6343 return OpTE->ReorderIndices;
6344 }();
6345 // The order is partially ordered, skip it in favor of fully non-ordered
6346 // orders.
6347 if (Order.size() == 1)
6348 continue;
6349 unsigned NumOps = count_if(
6350 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6351 return P.second == OpTE;
6352 });
6353 // Stores actually store the mask, not the order, need to invert.
6354 if (OpTE->State == TreeEntry::Vectorize &&
6355 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6356 assert(!OpTE->isAltShuffle() &&
6357 "Alternate instructions are only supported by BinaryOperator "
6358 "and CastInst.");
6359 SmallVector<int> Mask;
6360 inversePermutation(Order, Mask);
6361 unsigned E = Order.size();
6362 OrdersType CurrentOrder(E, E);
6363 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6364 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6365 });
6366 fixupOrderingIndices(CurrentOrder);
6367 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6368 NumOps;
6369 } else {
6370 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6371 }
6372 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6373 const auto AllowsReordering = [&](const TreeEntry *TE) {
6374 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6375 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6376 (IgnoreReorder && TE->Idx == 0))
6377 return true;
6378 if (TE->isGather()) {
6379 if (GathersToOrders.contains(TE))
6380 return !getReorderingData(*TE, /*TopToBottom=*/false)
6381 .value_or(OrdersType(1))
6382 .empty();
6383 return true;
6384 }
6385 return false;
6386 };
6387 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6388 TreeEntry *UserTE = EI.UserTE;
6389 if (!VisitedUsers.insert(UserTE).second)
6390 continue;
6391 // May reorder user node if it requires reordering, has reused
6392 // scalars, is an alternate op vectorize node or its op nodes require
6393 // reordering.
6394 if (AllowsReordering(UserTE))
6395 continue;
6396 // Check if users allow reordering.
6397 // Currently look up just 1 level of operands to avoid increase of
6398 // the compile time.
6399 // Profitable to reorder if definitely more operands allow
6400 // reordering rather than those with natural order.
6401 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.at(UserTE);
6402 if (static_cast<unsigned>(count_if(
6403 Ops, [UserTE, &AllowsReordering](
6404 const std::pair<unsigned, TreeEntry *> &Op) {
6405 return AllowsReordering(Op.second) &&
6406 all_of(Op.second->UserTreeIndices,
6407 [UserTE](const EdgeInfo &EI) {
6408 return EI.UserTE == UserTE;
6409 });
6410 })) <= Ops.size() / 2)
6411 ++Res.first->second;
6412 }
6413 }
6414 if (OrdersUses.empty()) {
6415 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6416 OrderedEntries.remove(Op.second);
6417 continue;
6418 }
6419 // Choose the most used order.
6420 unsigned IdentityCnt = 0;
6421 unsigned VF = Data.second.front().second->getVectorFactor();
6422 OrdersType IdentityOrder(VF, VF);
6423 for (auto &Pair : OrdersUses) {
6424 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6425 IdentityCnt += Pair.second;
6426 combineOrders(IdentityOrder, Pair.first);
6427 }
6428 }
6429 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6430 unsigned Cnt = IdentityCnt;
6431 for (auto &Pair : OrdersUses) {
6432 // Prefer the identity order. But if a filled (non-empty) identity order
6433 // was found with the same number of uses as the new candidate order, we
6434 // can choose this candidate order.
6435 if (Cnt < Pair.second) {
6436 combineOrders(Pair.first, BestOrder);
6437 BestOrder = Pair.first;
6438 Cnt = Pair.second;
6439 } else {
6440 combineOrders(BestOrder, Pair.first);
6441 }
6442 }
6443 // Set order of the user node.
6444 if (isIdentityOrder(BestOrder)) {
6445 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6446 OrderedEntries.remove(Op.second);
6447 continue;
6448 }
6449 fixupOrderingIndices(BestOrder);
6450 // Erase operands from OrderedEntries list and adjust their orders.
6451 VisitedOps.clear();
6452 SmallVector<int> Mask;
6453 inversePermutation(BestOrder, Mask);
6454 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6455 unsigned E = BestOrder.size();
6456 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6457 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6458 });
6459 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6460 TreeEntry *TE = Op.second;
6461 OrderedEntries.remove(TE);
6462 if (!VisitedOps.insert(TE).second)
6463 continue;
6464 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6465 reorderNodeWithReuses(*TE, Mask);
6466 continue;
6467 }
6468 // Gathers are processed separately.
6469 if (TE->State != TreeEntry::Vectorize &&
6470 TE->State != TreeEntry::StridedVectorize &&
6471 (TE->State != TreeEntry::ScatterVectorize ||
6472 TE->ReorderIndices.empty()))
6473 continue;
6474 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6475 TE->ReorderIndices.empty()) &&
6476 "Non-matching sizes of user/operand entries.");
6477 reorderOrder(TE->ReorderIndices, Mask);
6478 if (IgnoreReorder && TE == VectorizableTree.front().get())
6479 IgnoreReorder = false;
6480 }
6481 // For gathers just need to reorder its scalars.
6482 for (TreeEntry *Gather : GatherOps) {
6483 assert(Gather->ReorderIndices.empty() &&
6484 "Unexpected reordering of gathers.");
6485 if (!Gather->ReuseShuffleIndices.empty()) {
6486 // Just reorder reuses indices.
6487 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6488 continue;
6489 }
6490 reorderScalars(Gather->Scalars, Mask);
6491 OrderedEntries.remove(Gather);
6492 }
6493 // Reorder operands of the user node and set the ordering for the user
6494 // node itself.
6495 if (Data.first->State != TreeEntry::Vectorize ||
6496 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6497 Data.first->getMainOp()) ||
6498 Data.first->isAltShuffle())
6499 Data.first->reorderOperands(Mask);
6500 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6501 Data.first->isAltShuffle() ||
6502 Data.first->State == TreeEntry::StridedVectorize) {
6503 reorderScalars(Data.first->Scalars, Mask);
6504 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6505 /*BottomOrder=*/true);
6506 if (Data.first->ReuseShuffleIndices.empty() &&
6507 !Data.first->ReorderIndices.empty() &&
6508 !Data.first->isAltShuffle()) {
6509 // Insert user node to the list to try to sink reordering deeper in
6510 // the graph.
6511 OrderedEntries.insert(Data.first);
6512 }
6513 } else {
6514 reorderOrder(Data.first->ReorderIndices, Mask);
6515 }
6516 }
6517 }
6518 // If the reordering is unnecessary, just remove the reorder.
6519 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6520 VectorizableTree.front()->ReuseShuffleIndices.empty())
6521 VectorizableTree.front()->ReorderIndices.clear();
6522}
6523
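// Returns the root scalar instruction of \p Entry: for reversed strided
// loads/stores this is the scalar at position ReorderIndices.front(),
// otherwise simply the first scalar of the entry.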
6524Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6525 if ((Entry.getOpcode() == Instruction::Store ||
6526 Entry.getOpcode() == Instruction::Load) &&
6527 Entry.State == TreeEntry::StridedVectorize &&
6528 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6529 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6530 return dyn_cast<Instruction>(Entry.Scalars.front());
6531}
6532
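// Records, for every vectorized (non-gather) tree entry, the scalars that
// are used outside of the tree or listed in ExternallyUsedValues, so that
// extracts can be emitted for them after vectorization.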
6533 void BoUpSLP::buildExternalUses(
6534 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6535 DenseMap<Value *, unsigned> ScalarToExtUses;
6536 // Collect the values that we need to extract from the tree.
6537 for (auto &TEPtr : VectorizableTree) {
6538 TreeEntry *Entry = TEPtr.get();
6539
6540 // No need to handle users of gathered values.
6541 if (Entry->isGather())
6542 continue;
6543
6544 // For each lane:
6545 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6546 Value *Scalar = Entry->Scalars[Lane];
6547 if (!isa<Instruction>(Scalar))
6548 continue;
6549 // All uses must be replaced already? No need to do it again.
6550 auto It = ScalarToExtUses.find(Scalar);
6551 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6552 continue;
6553
6554 // Check if the scalar is externally used as an extra arg.
6555 const auto ExtI = ExternallyUsedValues.find(Scalar);
6556 if (ExtI != ExternallyUsedValues.end()) {
6557 int FoundLane = Entry->findLaneForValue(Scalar);
6558 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6559 << FoundLane << " from " << *Scalar << ".\n");
6560 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6561 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6562 continue;
6563 }
6564 for (User *U : Scalar->users()) {
6565 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6566
6567 Instruction *UserInst = dyn_cast<Instruction>(U);
6568 if (!UserInst || isDeleted(UserInst))
6569 continue;
6570
6571 // Ignore users in the user ignore list.
6572 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6573 continue;
6574
6575 // Skip in-tree scalars that become vectors
6576 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6577 // Some in-tree scalars will remain as scalar in vectorized
6578 // instructions. If that is the case, the one in FoundLane will
6579 // be used.
6580 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6581 !doesInTreeUserNeedToExtract(
6582 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6583 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6584 << ".\n");
6585 assert(!UseEntry->isGather() && "Bad state");
6586 continue;
6587 }
6588 U = nullptr;
6589 if (It != ScalarToExtUses.end()) {
6590 ExternalUses[It->second].User = nullptr;
6591 break;
6592 }
6593 }
6594
6595 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6596 U = nullptr;
6597 int FoundLane = Entry->findLaneForValue(Scalar);
6598 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6599 << " from lane " << FoundLane << " from " << *Scalar
6600 << ".\n");
6601 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6602 ExternalUses.emplace_back(Scalar, U, FoundLane);
6603 if (!U)
6604 break;
6605 }
6606 }
6607 }
6608}
6609
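// Collects, for every lane of \p TE, the simple stores that use the lane's
// scalar, grouped by (parent block, stored type, underlying pointer). At
// most one store per group is kept per lane, and stores whose pointer
// distance within the group cannot be computed are skipped.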
6610 SmallVector<SmallVector<StoreInst *>>
6611 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6612 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6613 SmallVector<StoreInst *>, 8>
6614 PtrToStoresMap;
6615 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6616 Value *V = TE->Scalars[Lane];
6617 // Don't iterate over the users of constant data.
6618 if (!isa<Instruction>(V))
6619 continue;
6620 // To save compilation time we don't visit if we have too many users.
6621 if (V->hasNUsesOrMore(UsesLimit))
6622 break;
6623
6624 // Collect stores per pointer object.
6625 for (User *U : V->users()) {
6626 auto *SI = dyn_cast<StoreInst>(U);
6627 // Test whether we can handle the store. V might be a global, which could
6628 // be used in a different function.
6629 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6630 !isValidElementType(SI->getValueOperand()->getType()))
6631 continue;
6632 // Skip the store if it already has a tree entry.
6633 if (getTreeEntry(U))
6634 continue;
6635
6636 Value *Ptr =
6637 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6638 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6639 SI->getValueOperand()->getType(), Ptr}];
6640 // For now just keep one store per pointer object per lane.
6641 // TODO: Extend this to support multiple stores per pointer per lane
6642 if (StoresVec.size() > Lane)
6643 continue;
6644 if (!StoresVec.empty()) {
6645 std::optional<int> Diff = getPointersDiff(
6646 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6647 SI->getValueOperand()->getType(),
6648 StoresVec.front()->getPointerOperand(), *DL, *SE,
6649 /*StrictCheck=*/true);
6650 // We failed to compare the pointers so just abandon this store.
6651 if (!Diff)
6652 continue;
6653 }
6654 StoresVec.push_back(SI);
6655 }
6656 }
6657 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6658 unsigned I = 0;
6659 for (auto &P : PtrToStoresMap) {
6660 Res[I].swap(P.second);
6661 ++I;
6662 }
6663 return Res;
6664}
6665
6666bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6667 OrdersType &ReorderIndices) const {
6668 // We check whether the stores in StoreVec can form a vector by sorting them
6669 // and checking whether they are consecutive.
6670
6671 // To avoid calling getPointersDiff() while sorting we create a vector of
6672 // pairs {store, offset from first} and sort this instead.
6673 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6674 StoreInst *S0 = StoresVec[0];
6675 StoreOffsetVec.emplace_back(0, 0);
6676 Type *S0Ty = S0->getValueOperand()->getType();
6677 Value *S0Ptr = S0->getPointerOperand();
6678 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6679 StoreInst *SI = StoresVec[Idx];
6680 std::optional<int> Diff =
6681 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6682 SI->getPointerOperand(), *DL, *SE,
6683 /*StrictCheck=*/true);
6684 StoreOffsetVec.emplace_back(*Diff, Idx);
6685 }
6686
6687 // Check if the stores are consecutive by checking if their difference is 1.
6688 if (StoreOffsetVec.size() != StoresVec.size())
6689 return false;
6690 sort(StoreOffsetVec,
6691 [](const std::pair<int, unsigned> &L,
6692 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6693 unsigned Idx = 0;
6694 int PrevDist = 0;
6695 for (const auto &P : StoreOffsetVec) {
6696 if (Idx > 0 && P.first != PrevDist + 1)
6697 return false;
6698 PrevDist = P.first;
6699 ++Idx;
6700 }
6701
6702 // Calculate the shuffle indices according to their offset against the sorted
6703 // StoreOffsetVec.
6704 ReorderIndices.assign(StoresVec.size(), 0);
6705 bool IsIdentity = true;
6706 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6707 ReorderIndices[P.second] = I;
6708 IsIdentity &= P.second == I;
6709 }
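// E.g., stores whose offsets relative to the first store are {0, -1, 1, 2}
// (in StoresVec order) sort to consecutive values and yield
// ReorderIndices = {1, 0, 2, 3}.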
6710 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6711 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6712 // same convention here.
6713 if (IsIdentity)
6714 ReorderIndices.clear();
6715
6716 return true;
6717}
6718
6719#ifndef NDEBUG
6720 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6721 for (unsigned Idx : Order)
6722 dbgs() << Idx << ", ";
6723 dbgs() << "\n";
6724}
6725#endif
6726
6727 SmallVector<BoUpSLP::OrdersType, 1>
6728 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6729 unsigned NumLanes = TE->Scalars.size();
6730
6731 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6732
6733 // Holds the reorder indices for each candidate store vector that is a user of
6734 // the current TreeEntry.
6735 SmallVector<OrdersType, 1> ExternalReorderIndices;
6736
6737 // Now inspect the stores collected per pointer and look for vectorization
6738 // candidates. For each candidate calculate the reorder index vector and push
6739 // it into `ExternalReorderIndices`
6740 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6741 // If we have fewer than NumLanes stores, then we can't form a vector.
6742 if (StoresVec.size() != NumLanes)
6743 continue;
6744
6745 // If the stores are not consecutive then abandon this StoresVec.
6746 OrdersType ReorderIndices;
6747 if (!canFormVector(StoresVec, ReorderIndices))
6748 continue;
6749
6750 // We now know that the scalars in StoresVec can form a vector instruction,
6751 // so set the reorder indices.
6752 ExternalReorderIndices.push_back(ReorderIndices);
6753 }
6754 return ExternalReorderIndices;
6755}
6756
6757 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6758 const SmallDenseSet<Value *> &UserIgnoreLst) {
6759 deleteTree();
6760 UserIgnoreList = &UserIgnoreLst;
6761 if (!allSameType(Roots))
6762 return;
6763 buildTree_rec(Roots, 0, EdgeInfo());
6764}
6765
6766 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6767 deleteTree();
6768 if (!allSameType(Roots))
6769 return;
6770 buildTree_rec(Roots, 0, EdgeInfo());
6771}
6772
6773 /// Tries to find a subvector of loads and builds a new vector of only loads if
6774 /// it can be profitable.
6775 static void gatherPossiblyVectorizableLoads(
6776 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6777 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6778 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6779 bool AddNew = true) {
6780 if (VL.empty())
6781 return;
6782 Type *ScalarTy = getValueType(VL.front());
6783 if (!isValidElementType(ScalarTy))
6784 return;
6785 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6786 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6787 for (Value *V : VL) {
6788 auto *LI = dyn_cast<LoadInst>(V);
6789 if (!LI)
6790 continue;
6791 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6792 continue;
6793 bool IsFound = false;
6794 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6795 assert(LI->getParent() == Data.front().first->getParent() &&
6796 LI->getType() == Data.front().first->getType() &&
6797 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6798 getUnderlyingObject(Data.front().first->getPointerOperand(),
6799 RecursionMaxDepth) &&
6800 "Expected loads with the same type, same parent and same "
6801 "underlying pointer.");
6802 std::optional<int> Dist = getPointersDiff(
6803 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6804 Data.front().first->getPointerOperand(), DL, SE,
6805 /*StrictCheck=*/true);
6806 if (!Dist)
6807 continue;
6808 auto It = Map.find(*Dist);
6809 if (It != Map.end() && It->second != LI)
6810 continue;
6811 if (It == Map.end()) {
6812 Data.emplace_back(LI, *Dist);
6813 Map.try_emplace(*Dist, LI);
6814 }
6815 IsFound = true;
6816 break;
6817 }
6818 if (!IsFound) {
6819 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6820 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6821 }
6822 }
6823 auto FindMatchingLoads =
6824 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6825 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6826 &GatheredLoads,
6827 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6828 int &Offset, unsigned &Start) {
6829 if (Loads.empty())
6830 return GatheredLoads.end();
6832 LoadInst *LI = Loads.front().first;
6833 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6834 if (Idx < Start)
6835 continue;
6836 ToAdd.clear();
6837 if (LI->getParent() != Data.front().first->getParent() ||
6838 LI->getType() != Data.front().first->getType())
6839 continue;
6840 std::optional<int> Dist =
6841 getPointersDiff(LI->getType(), LI->getPointerOperand(),
6842 Data.front().first->getType(),
6843 Data.front().first->getPointerOperand(), DL, SE,
6844 /*StrictCheck=*/true);
6845 if (!Dist)
6846 continue;
6847 SmallSet<int, 4> DataDists;
6848 SmallPtrSet<LoadInst *, 4> DataLoads;
6849 for (std::pair<LoadInst *, int> P : Data) {
6850 DataDists.insert(P.second);
6851 DataLoads.insert(P.first);
6852 }
6853 // Found matching gathered loads - check if all loads are unique or
6854 // can be effectively vectorized.
6855 unsigned NumUniques = 0;
6856 for (auto [Cnt, Pair] : enumerate(Loads)) {
6857 bool Used = DataLoads.contains(Pair.first);
6858 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6859 ++NumUniques;
6860 ToAdd.insert(Cnt);
6861 } else if (Used) {
6862 Repeated.insert(Cnt);
6863 }
6864 }
6865 if (NumUniques > 0 &&
6866 (Loads.size() == NumUniques ||
6867 (Loads.size() - NumUniques >= 2 &&
6868 Loads.size() - NumUniques >= Loads.size() / 2 &&
6869 (has_single_bit(Data.size() + NumUniques) ||
6870 bit_ceil(Data.size()) <
6871 bit_ceil(Data.size() + NumUniques))))) {
6872 Offset = *Dist;
6873 Start = Idx + 1;
6874 return std::next(GatheredLoads.begin(), Idx);
6875 }
6876 }
6877 ToAdd.clear();
6878 return GatheredLoads.end();
6879 };
6880 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6881 unsigned Start = 0;
6882 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6883 int Offset = 0;
6884 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6885 Offset, Start);
6886 while (It != GatheredLoads.end()) {
6887 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6888 for (unsigned Idx : LocalToAdd)
6889 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6890 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6891 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6892 Start);
6893 }
6894 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6895 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6896 })) {
6897 auto AddNewLoads =
6898 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6899 for (unsigned Idx : seq<unsigned>(Data.size())) {
6900 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6901 continue;
6902 Loads.push_back(Data[Idx]);
6903 }
6904 };
6905 if (!AddNew) {
6906 LoadInst *LI = Data.front().first;
6907 It = find_if(
6908 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6909 return PD.front().first->getParent() == LI->getParent() &&
6910 PD.front().first->getType() == LI->getType();
6911 });
6912 while (It != GatheredLoads.end()) {
6913 AddNewLoads(*It);
6914 It = std::find_if(
6915 std::next(It), GatheredLoads.end(),
6916 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6917 return PD.front().first->getParent() == LI->getParent() &&
6918 PD.front().first->getType() == LI->getType();
6919 });
6920 }
6921 }
6922 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6923 AddNewLoads(GatheredLoads.emplace_back());
6924 }
6925 }
6926}
6927
6928void BoUpSLP::tryToVectorizeGatheredLoads(
6929 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6930 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6931 8> &GatheredLoads) {
6932 GatheredLoadsEntriesFirst = VectorizableTree.size();
6933
6934 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6935 LoadEntriesToVectorize.size());
6936 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6937 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6938 VectorizableTree[Idx]->Scalars.end());
6939
6940 // Sort loads by distance.
6941 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6942 const std::pair<LoadInst *, int> &L2) {
6943 return L1.second > L2.second;
6944 };
6945
6946 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
6947 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6948 Loads.size());
6949 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6950 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6951 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6952 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6953 };
6954
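// Tries progressively smaller candidate vector factors over the given load
// sequence and collects the slices that can be vectorized (as consecutive,
// strided or masked-gather loads) together with their LoadsState; loads
// that end up in no vectorizable slice are reported through NonVectorized.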
6955 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6956 BoUpSLP::ValueSet &VectorizedLoads,
6957 SmallVectorImpl<LoadInst *> &NonVectorized,
6958 bool Final, unsigned MaxVF) {
6959 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
6960 unsigned StartIdx = 0;
6961 SmallVector<int> CandidateVFs;
6962 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6963 CandidateVFs.push_back(MaxVF);
6964 for (int NumElts = getFloorFullVectorNumberOfElements(
6965 *TTI, Loads.front()->getType(), MaxVF);
6966 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6967 *TTI, Loads.front()->getType(), NumElts - 1)) {
6968 CandidateVFs.push_back(NumElts);
6969 if (VectorizeNonPowerOf2 && NumElts > 2)
6970 CandidateVFs.push_back(NumElts - 1);
6971 }
6972
6973 if (Final && CandidateVFs.empty())
6974 return Results;
6975
6976 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6977 for (unsigned NumElts : CandidateVFs) {
6978 if (Final && NumElts > BestVF)
6979 continue;
6980 SmallVector<unsigned> MaskedGatherVectorized;
6981 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6982 ++Cnt) {
6983 ArrayRef<LoadInst *> Slice =
6984 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6985 if (VectorizedLoads.count(Slice.front()) ||
6986 VectorizedLoads.count(Slice.back()) ||
6987 areKnownNonVectorizableLoads(Slice))
6988 continue;
6989 // Check if it is profitable to try vectorizing gathered loads. It is
6990 // profitable if we have at least 3 consecutive loads or if we have
6991 // fewer but all users are vectorized or deleted.
6992 bool AllowToVectorize = false;
6993 // Check if it is profitable to vectorize 2-elements loads.
6994 if (NumElts == 2) {
6995 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6996 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6997 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6998 for (LoadInst *LI : Slice) {
6999 // If single use/user - allow to vectorize.
7000 if (LI->hasOneUse())
7001 continue;
7002 // 1. Check if number of uses equals number of users.
7003 // 2. All users are deleted.
7004 // 3. The load broadcasts are not allowed or the load is not
7005 // broadcasted.
7006 if (static_cast<unsigned int>(std::distance(
7007 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7008 return false;
7009 if (!IsLegalBroadcastLoad)
7010 continue;
7011 if (LI->hasNUsesOrMore(UsesLimit))
7012 return false;
7013 for (User *U : LI->users()) {
7014 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
7015 continue;
7016 if (const TreeEntry *UTE = getTreeEntry(U)) {
7017 for (int I : seq<int>(UTE->getNumOperands())) {
7018 if (all_of(UTE->getOperand(I),
7019 [LI](Value *V) { return V == LI; }))
7020 // Found legal broadcast - do not vectorize.
7021 return false;
7022 }
7023 }
7024 }
7025 }
7026 return true;
7027 };
7028 AllowToVectorize = CheckIfAllowed(Slice);
7029 } else {
7030 AllowToVectorize =
7031 (NumElts >= 3 ||
7032 any_of(ValueToGatherNodes.at(Slice.front()),
7033 [=](const TreeEntry *TE) {
7034 return TE->Scalars.size() == 2 &&
7035 ((TE->Scalars.front() == Slice.front() &&
7036 TE->Scalars.back() == Slice.back()) ||
7037 (TE->Scalars.front() == Slice.back() &&
7038 TE->Scalars.back() == Slice.front()));
7039 })) &&
7040 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7041 Slice.size());
7042 }
7043 if (AllowToVectorize) {
7044 SmallVector<Value *> PointerOps;
7045 OrdersType CurrentOrder;
7046 // Try to build vector load.
7047 ArrayRef<Value *> Values(
7048 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7049 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7050 PointerOps, &BestVF);
7051 if (LS != LoadsState::Gather ||
7052 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7053 if (LS == LoadsState::ScatterVectorize) {
7054 if (MaskedGatherVectorized.empty() ||
7055 Cnt >= MaskedGatherVectorized.back() + NumElts)
7056 MaskedGatherVectorized.push_back(Cnt);
7057 continue;
7058 }
7059 if (LS != LoadsState::Gather) {
7060 Results.emplace_back(Values, LS);
7061 VectorizedLoads.insert(Slice.begin(), Slice.end());
7062 // If we vectorized initial block, no need to try to vectorize it
7063 // again.
7064 if (Cnt == StartIdx)
7065 StartIdx += NumElts;
7066 }
7067 // Check if the whole array was vectorized already - exit.
7068 if (StartIdx >= Loads.size())
7069 break;
7070 // Erase last masked gather candidate, if another candidate within
7071 // the range is found to be better.
7072 if (!MaskedGatherVectorized.empty() &&
7073 Cnt < MaskedGatherVectorized.back() + NumElts)
7074 MaskedGatherVectorized.pop_back();
7075 Cnt += NumElts - 1;
7076 continue;
7077 }
7078 }
7079 if (!AllowToVectorize || BestVF == 0)
7080 registerNonVectorizableLoads(Slice);
7081 }
7082 // Mark masked gathers candidates as vectorized, if any.
7083 for (unsigned Cnt : MaskedGatherVectorized) {
7084 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7085 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7086 ArrayRef<Value *> Values(
7087 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7088 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7089 VectorizedLoads.insert(Slice.begin(), Slice.end());
7090 // If we vectorized initial block, no need to try to vectorize it again.
7091 if (Cnt == StartIdx)
7092 StartIdx += NumElts;
7093 }
7094 }
7095 for (LoadInst *LI : Loads) {
7096 if (!VectorizedLoads.contains(LI))
7097 NonVectorized.push_back(LI);
7098 }
7099 return Results;
7100 };
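// Splits each group of gathered loads into runs with consecutive distances
// (or whole groups when masked gathers are legal), asks GetVectorizedRanges
// for profitable slices and builds new tree entries for them; returns the
// loads that remain non-vectorized.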
7101 auto ProcessGatheredLoads =
7102 [&, &TTI = *TTI](
7103 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7104 bool Final = false) {
7105 SmallVector<LoadInst *> NonVectorized;
7106 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7107 if (LoadsDists.size() <= 1) {
7108 NonVectorized.push_back(LoadsDists.back().first);
7109 continue;
7110 }
7111 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7112 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7113 transform(LoadsDists, OriginalLoads.begin(),
7114 [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
7115 return L.first;
7116 });
7117 stable_sort(LocalLoadsDists, LoadSorter);
7118 SmallVector<LoadInst *> Loads;
7119 unsigned MaxConsecutiveDistance = 0;
7120 unsigned CurrentConsecutiveDist = 1;
7121 int LastDist = LocalLoadsDists.front().second;
7122 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7123 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7124 if (getTreeEntry(L.first))
7125 continue;
7126 assert(LastDist >= L.second &&
7127 "Expected first distance always not less than second");
7128 if (static_cast<unsigned>(LastDist - L.second) ==
7129 CurrentConsecutiveDist) {
7130 ++CurrentConsecutiveDist;
7131 MaxConsecutiveDistance =
7132 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7133 Loads.push_back(L.first);
7134 continue;
7135 }
7136 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7137 !Loads.empty())
7138 Loads.pop_back();
7139 CurrentConsecutiveDist = 1;
7140 LastDist = L.second;
7141 Loads.push_back(L.first);
7142 }
7143 if (Loads.size() <= 1)
7144 continue;
7145 if (AllowMaskedGather)
7146 MaxConsecutiveDistance = Loads.size();
7147 else if (MaxConsecutiveDistance < 2)
7148 continue;
7149 BoUpSLP::ValueSet VectorizedLoads;
7150 SmallVector<LoadInst *> SortedNonVectorized;
7151 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
7152 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7153 Final, MaxConsecutiveDistance);
7154 if (!Results.empty() && !SortedNonVectorized.empty() &&
7155 OriginalLoads.size() == Loads.size() &&
7156 MaxConsecutiveDistance == Loads.size() &&
7157 any_of(Results,
7158 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7159 return P.second == LoadsState::ScatterVectorize;
7160 })) {
7161 VectorizedLoads.clear();
7162 SmallVector<LoadInst *> UnsortedNonVectorized;
7163 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
7164 UnsortedResults =
7165 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7166 UnsortedNonVectorized, Final,
7167 OriginalLoads.size());
7168 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7169 SortedNonVectorized.swap(UnsortedNonVectorized);
7170 Results.swap(UnsortedResults);
7171 }
7172 }
7173 for (auto [Slice, _] : Results) {
7174 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7175 << Slice.size() << ")\n");
7176 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7177 for (Value *L : Slice)
7178 if (!getTreeEntry(L))
7179 SortedNonVectorized.push_back(cast<LoadInst>(L));
7180 continue;
7181 }
7182
7183 // Select maximum VF as a maximum of user gathered nodes and
7184 // distance between scalar loads in these nodes.
7185 unsigned MaxVF = Slice.size();
7186 unsigned UserMaxVF = 0;
7187 unsigned InterleaveFactor = 0;
7188 if (MaxVF == 2) {
7189 UserMaxVF = MaxVF;
7190 } else {
7191 // Found distance between segments of the interleaved loads.
7192 std::optional<unsigned> InterleavedLoadsDistance = 0;
7193 unsigned Order = 0;
7194 std::optional<unsigned> CommonVF = 0;
7195 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7196 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7197 for (auto [Idx, V] : enumerate(Slice)) {
7198 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7199 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7200 unsigned Pos =
7201 EntryToPosition.try_emplace(E, Idx).first->second;
7202 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7203 if (CommonVF) {
7204 if (*CommonVF == 0) {
7205 CommonVF = E->Scalars.size();
7206 continue;
7207 }
7208 if (*CommonVF != E->Scalars.size())
7209 CommonVF.reset();
7210 }
7211 // Check if the load is the part of the interleaved load.
7212 if (Pos != Idx && InterleavedLoadsDistance) {
7213 if (!DeinterleavedNodes.contains(E) &&
7214 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7215 if (isa<Constant>(V))
7216 return false;
7217 if (getTreeEntry(V))
7218 return true;
7219 const auto &Nodes = ValueToGatherNodes.at(V);
7220 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7221 !is_contained(Slice, V);
7222 })) {
7223 InterleavedLoadsDistance.reset();
7224 continue;
7225 }
7226 DeinterleavedNodes.insert(E);
7227 if (*InterleavedLoadsDistance == 0) {
7228 InterleavedLoadsDistance = Idx - Pos;
7229 continue;
7230 }
7231 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7232 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7233 InterleavedLoadsDistance.reset();
7234 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7235 }
7236 }
7237 }
7238 DeinterleavedNodes.clear();
7239 // Check if the large load represents interleaved load operation.
7240 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7241 CommonVF.value_or(0) != 0) {
7242 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7243 unsigned VF = *CommonVF;
7244 OrdersType Order;
7245 SmallVector<Value *> PointerOps;
7246 // Segmented load detected - vectorize at maximum vector factor.
7247 if (InterleaveFactor <= Slice.size() &&
7248 TTI.isLegalInterleavedAccessType(
7249 getWidenedType(Slice.front()->getType(), VF),
7250 InterleaveFactor,
7251 cast<LoadInst>(Slice.front())->getAlign(),
7252 cast<LoadInst>(Slice.front())
7253 ->getPointerAddressSpace()) &&
7254 canVectorizeLoads(Slice, Slice.front(), Order,
7255 PointerOps) == LoadsState::Vectorize) {
7256 UserMaxVF = InterleaveFactor * VF;
7257 } else {
7258 InterleaveFactor = 0;
7259 }
7260 }
7261 // Cannot represent the loads as consecutive vectorizable nodes -
7262 // just exit.
7263 unsigned ConsecutiveNodesSize = 0;
7264 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7265 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7266 [&, Slice = Slice](const auto &P) {
7267 const auto *It = find_if(Slice, [&](Value *V) {
7268 return std::get<1>(P).contains(V);
7269 });
7270 if (It == Slice.end())
7271 return false;
7272 ArrayRef<Value *> VL =
7273 VectorizableTree[std::get<0>(P)]->Scalars;
7274 ConsecutiveNodesSize += VL.size();
7275 unsigned Start = std::distance(Slice.begin(), It);
7276 unsigned Sz = Slice.size() - Start;
7277 return Sz < VL.size() ||
7278 Slice.slice(std::distance(Slice.begin(), It),
7279 VL.size()) != VL;
7280 }))
7281 continue;
7282 // Try to build long masked gather loads.
7283 UserMaxVF = bit_ceil(UserMaxVF);
7284 if (InterleaveFactor == 0 &&
7285 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7286 [&, Slice = Slice](unsigned Idx) {
7287 OrdersType Order;
7288 SmallVector<Value *> PointerOps;
7289 return canVectorizeLoads(
7290 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7291 Slice[Idx * UserMaxVF], Order,
7292 PointerOps) ==
7293 LoadsState::ScatterVectorize;
7294 }))
7295 UserMaxVF = MaxVF;
7296 if (Slice.size() != ConsecutiveNodesSize)
7297 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7298 }
7299 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7300 bool IsVectorized = true;
7301 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7302 ArrayRef<Value *> SubSlice =
7303 Slice.slice(I, std::min(VF, E - I));
7304 if (getTreeEntry(SubSlice.front()))
7305 continue;
7306 // Check if the subslice belongs to a to-be-vectorized entry that is
7307 // not equal to this entry.
7308 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7309 [&](const auto &P) {
7310 return !SubSlice.equals(
7311 VectorizableTree[std::get<0>(P)]
7312 ->Scalars) &&
7313 set_is_subset(SubSlice, std::get<1>(P));
7314 }))
7315 continue;
7316 unsigned Sz = VectorizableTree.size();
7317 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7318 if (Sz == VectorizableTree.size()) {
7319 IsVectorized = false;
7320 // Try non-interleaved vectorization with smaller vector
7321 // factor.
7322 if (InterleaveFactor > 0) {
7323 VF = 2 * (MaxVF / InterleaveFactor);
7324 InterleaveFactor = 0;
7325 }
7326 continue;
7327 }
7328 }
7329 if (IsVectorized)
7330 break;
7331 }
7332 }
7333 NonVectorized.append(SortedNonVectorized);
7334 }
7335 return NonVectorized;
7336 };
7337 for (const auto &GLs : GatheredLoads) {
7338 const auto &Ref = GLs.second;
7339 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7340 if (!Ref.empty() && !NonVectorized.empty() &&
7341 std::accumulate(
7342 Ref.begin(), Ref.end(), 0u,
7343 [](unsigned S,
7344 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
7345 return S + LoadsDists.size();
7346 }) != NonVectorized.size() &&
7347 IsMaskedGatherSupported(NonVectorized)) {
7348 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7349 for (LoadInst *LI : NonVectorized) {
7350 // Reinsert non-vectorized loads to other list of loads with the same
7351 // base pointers.
7352 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7353 FinalGatheredLoads,
7354 /*AddNew=*/false);
7355 }
7356 // Final attempt to vectorize non-vectorized loads.
7357 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7358 }
7359 }
7360 // Try to vectorize postponed load entries, previously marked as gathered.
7361 for (unsigned Idx : LoadEntriesToVectorize) {
7362 const TreeEntry &E = *VectorizableTree[Idx];
7363 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7364 // Avoid reordering, if possible.
7365 if (!E.ReorderIndices.empty()) {
7366 // Build a mask out of the reorder indices and reorder scalars per this
7367 // mask.
7368 SmallVector<int> ReorderMask;
7369 inversePermutation(E.ReorderIndices, ReorderMask);
7370 reorderScalars(GatheredScalars, ReorderMask);
7371 }
7372 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7373 }
7374 // If no new entries were created, consider it as if no gathered load entries
7375 // need to be handled.
7376 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7377 VectorizableTree.size())
7378 GatheredLoadsEntriesFirst.reset();
7379}
7380
7381/// \return true if the specified list of values has only one instruction that
7382/// requires scheduling, false otherwise.
7383#ifndef NDEBUG
7384 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7385 Value *NeedsScheduling = nullptr;
7386 for (Value *V : VL) {
7387 if (doesNotNeedToBeScheduled(V))
7388 continue;
7389 if (!NeedsScheduling) {
7390 NeedsScheduling = V;
7391 continue;
7392 }
7393 return false;
7394 }
7395 return NeedsScheduling;
7396}
7397#endif
7398
7399 /// Generates a key/subkey pair for the given value to provide effective sorting
7400 /// of the values and better detection of vectorizable value sequences. The
7401 /// keys/subkeys can be used for better sorting of the values themselves (keys)
7402 /// and within value subgroups (subkeys).
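// E.g., simple loads of the same type share a key, while their subkeys come
// from LoadsSubkeyGenerator, which lets the caller group loads by the
// distance between their pointers.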
7403static std::pair<size_t, size_t> generateKeySubkey(
7404 Value *V, const TargetLibraryInfo *TLI,
7405 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7406 bool AllowAlternate) {
7407 hash_code Key = hash_value(V->getValueID() + 2);
7408 hash_code SubKey = hash_value(0);
7409 // Sort the loads by the distance between the pointers.
7410 if (auto *LI = dyn_cast<LoadInst>(V)) {
7411 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7412 if (LI->isSimple())
7413 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7414 else
7415 Key = SubKey = hash_value(LI);
7416 } else if (isVectorLikeInstWithConstOps(V)) {
7417 // Sort extracts by the vector operands.
7418 if (isa<ExtractElementInst, UndefValue>(V))
7419 Key = hash_value(Value::UndefValueVal + 1);
7420 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7421 if (!isUndefVector(EI->getVectorOperand()).all() &&
7422 !isa<UndefValue>(EI->getIndexOperand()))
7423 SubKey = hash_value(EI->getVectorOperand());
7424 }
7425 } else if (auto *I = dyn_cast<Instruction>(V)) {
7426 // Sort other instructions just by the opcodes except for CMPInst.
7427 // For CMP also sort by the predicate kind.
7428 if ((isa<BinaryOperator, CastInst>(I)) &&
7429 isValidForAlternation(I->getOpcode())) {
7430 if (AllowAlternate)
7431 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7432 else
7433 Key = hash_combine(hash_value(I->getOpcode()), Key);
7434 SubKey = hash_combine(
7435 hash_value(I->getOpcode()), hash_value(I->getType()),
7436 hash_value(isa<BinaryOperator>(I)
7437 ? I->getType()
7438 : cast<CastInst>(I)->getOperand(0)->getType()));
7439 // For casts, look through the only operand to improve compile time.
7440 if (isa<CastInst>(I)) {
7441 std::pair<size_t, size_t> OpVals =
7442 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7443 /*AllowAlternate=*/true);
7444 Key = hash_combine(OpVals.first, Key);
7445 SubKey = hash_combine(OpVals.first, SubKey);
7446 }
7447 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7448 CmpInst::Predicate Pred = CI->getPredicate();
7449 if (CI->isCommutative())
7450 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7451 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
7452 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7453 hash_value(SwapPred),
7454 hash_value(CI->getOperand(0)->getType()));
7455 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7456 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
7457 if (isTriviallyVectorizable(ID)) {
7458 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7459 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7460 SubKey = hash_combine(hash_value(I->getOpcode()),
7461 hash_value(Call->getCalledFunction()));
7462 } else {
7463 Key = hash_combine(hash_value(Call), Key);
7464 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7465 }
7466 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7467 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7468 hash_value(Op.Tag), SubKey);
7469 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7470 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7471 SubKey = hash_value(Gep->getPointerOperand());
7472 else
7473 SubKey = hash_value(Gep);
7474 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7475 !isa<ConstantInt>(I->getOperand(1))) {
7476 // Do not try to vectorize instructions with potentially high cost.
7477 SubKey = hash_value(I);
7478 } else {
7479 SubKey = hash_value(I->getOpcode());
7480 }
7481 Key = hash_combine(hash_value(I->getParent()), Key);
7482 }
7483 return std::make_pair(Key, SubKey);
7484}
7485
7486/// Checks if the specified instruction \p I is an alternate operation for
7487/// the given \p MainOp and \p AltOp instructions.
7488static bool isAlternateInstruction(const Instruction *I,
7489 const Instruction *MainOp,
7490 const Instruction *AltOp,
7491 const TargetLibraryInfo &TLI);
7492
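// Heuristic for alternate-opcode nodes: the node is considered profitable
// if the target legalizes the shuffled opcode pair directly, or if its
// operands look cheap to build (constants, splats, loop invariants,
// already-vectorized values) and the estimated number of vector
// instructions does not exceed the estimated buildvector cost.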
7493bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7494 ArrayRef<Value *> VL) const {
7495 unsigned Opcode0 = S.getOpcode();
7496 unsigned Opcode1 = S.getAltOpcode();
7497 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7498 // If this pattern is supported by the target then consider it profitable.
7499 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7500 Opcode0, Opcode1, OpcodeMask))
7501 return true;
7502 SmallVector<SmallVector<Value *>> Operands;
7503 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7504 Operands.emplace_back();
7505 // Prepare the operand vector.
7506 for (Value *V : VL) {
7507 if (isa<PoisonValue>(V)) {
7508 Operands.back().push_back(
7509 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7510 continue;
7511 }
7512 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7513 }
7514 }
7515 if (Operands.size() == 2) {
7516 // Try to find the best operand candidates.
7517 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7518 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7519 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7520 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7521 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7522 std::optional<int> Res = findBestRootPair(Candidates);
7523 switch (Res.value_or(0)) {
7524 case 0:
7525 break;
7526 case 1:
7527 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7528 break;
7529 case 2:
7530 std::swap(Operands[0][I], Operands[1][I]);
7531 break;
7532 default:
7533 llvm_unreachable("Unexpected index.");
7534 }
7535 }
7536 }
7537 DenseSet<unsigned> UniqueOpcodes;
7538 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7539 unsigned NonInstCnt = 0;
7540 // Estimate number of instructions, required for the vectorized node and for
7541 // the buildvector node.
7542 unsigned UndefCnt = 0;
7543 // Count the number of extra shuffles, required for vector nodes.
7544 unsigned ExtraShuffleInsts = 0;
7545 // Check that operands do not contain same values and create either perfect
7546 // diamond match or shuffled match.
7547 if (Operands.size() == 2) {
7548 // Do not count same operands twice.
7549 if (Operands.front() == Operands.back()) {
7550 Operands.erase(Operands.begin());
7551 } else if (!allConstant(Operands.front()) &&
7552 all_of(Operands.front(), [&](Value *V) {
7553 return is_contained(Operands.back(), V);
7554 })) {
7555 Operands.erase(Operands.begin());
7556 ++ExtraShuffleInsts;
7557 }
7558 }
7559 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7560 // Vectorize node, if:
7561 // 1. At least one operand is constant or splat.
7562 // 2. Operands have many loop invariants (the instructions are not loop
7563 // invariants).
7564 // 3. At least one unique operand is supposed to be vectorized.
7565 return none_of(Operands,
7566 [&](ArrayRef<Value *> Op) {
7567 if (allConstant(Op) ||
7568 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7569 getSameOpcode(Op, *TLI)))
7570 return false;
7571 DenseMap<Value *, unsigned> Uniques;
7572 for (Value *V : Op) {
7573 if (isa<Constant, ExtractElementInst>(V) ||
7574 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7575 if (isa<UndefValue>(V))
7576 ++UndefCnt;
7577 continue;
7578 }
7579 auto Res = Uniques.try_emplace(V, 0);
7580 // Found first duplicate - need to add shuffle.
7581 if (!Res.second && Res.first->second == 1)
7582 ++ExtraShuffleInsts;
7583 ++Res.first->getSecond();
7584 if (auto *I = dyn_cast<Instruction>(V))
7585 UniqueOpcodes.insert(I->getOpcode());
7586 else if (Res.second)
7587 ++NonInstCnt;
7588 }
7589 return none_of(Uniques, [&](const auto &P) {
7590 return P.first->hasNUsesOrMore(P.second + 1) &&
7591 none_of(P.first->users(), [&](User *U) {
7592 return getTreeEntry(U) || Uniques.contains(U);
7593 });
7594 });
7595 }) ||
7596 // Do not vectorize node, if estimated number of vector instructions is
7597 // more than estimated number of buildvector instructions. Number of
7598 // vector operands is number of vector instructions + number of vector
7599 // instructions for operands (buildvectors). Number of buildvector
7600 // instructions is just number_of_operands * number_of_scalars.
7601 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7602 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7603 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7604}
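// For example, for VL = {a + b, c - d, e + f, g - h} with main opcode Add and
// alternate opcode Sub, the vectorized form needs roughly one vector add, one
// vector sub and one shuffle (NumAltInsts above), and this is weighed against
// the number of instructions needed to simply build the scalars into a vector.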
7605
7606BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7607 const InstructionsState &S, ArrayRef<Value *> VL,
7608 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7609 SmallVectorImpl<Value *> &PointerOps) {
7610 assert(S.getMainOp() &&
7611 "Expected instructions with same/alternate opcodes only.");
7612
7613 unsigned ShuffleOrOp =
7614 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7615 Instruction *VL0 = S.getMainOp();
7616 switch (ShuffleOrOp) {
7617 case Instruction::PHI: {
7618 // Too many operands - gather, most probably won't be vectorized.
7619 if (VL0->getNumOperands() > MaxPHINumOperands)
7620 return TreeEntry::NeedToGather;
7621 // Check for terminator values (e.g. invoke).
7622 for (Value *V : VL) {
7623 auto *PHI = dyn_cast<PHINode>(V);
7624 if (!PHI)
7625 continue;
7626 for (Value *Incoming : PHI->incoming_values()) {
7627 Instruction *Term = dyn_cast<Instruction>(Incoming);
7628 if (Term && Term->isTerminator()) {
7629 LLVM_DEBUG(dbgs()
7630 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7631 return TreeEntry::NeedToGather;
7632 }
7633 }
7634 }
7635
7636 return TreeEntry::Vectorize;
7637 }
7638 case Instruction::ExtractValue:
7639 case Instruction::ExtractElement: {
7640 bool Reuse = canReuseExtract(VL, CurrentOrder);
7641 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7642 // non-full registers).
7643 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7644 return TreeEntry::NeedToGather;
7645 if (Reuse || !CurrentOrder.empty())
7646 return TreeEntry::Vectorize;
7647 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7648 return TreeEntry::NeedToGather;
7649 }
7650 case Instruction::InsertElement: {
7651 // Check that we have a buildvector and not a shuffle of 2 or more
7652 // different vectors.
7653 ValueSet SourceVectors;
7654 for (Value *V : VL) {
7655 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7656 assert(getElementIndex(V) != std::nullopt &&
7657 "Non-constant or undef index?");
7658 }
7659
7660 if (count_if(VL, [&SourceVectors](Value *V) {
7661 return !SourceVectors.contains(V);
7662 }) >= 2) {
7663 // Found 2nd source vector - cancel.
7664 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7665 "different source vectors.\n");
7666 return TreeEntry::NeedToGather;
7667 }
7668
7669 if (any_of(VL, [&SourceVectors](Value *V) {
7670 // The last InsertElement can have multiple uses.
7671 return SourceVectors.contains(V) && !V->hasOneUse();
7672 })) {
7673 assert(SLPReVec && "Only supported by REVEC.");
7674 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7675 "multiple uses.\n");
7676 return TreeEntry::NeedToGather;
7677 }
7678
7679 return TreeEntry::Vectorize;
7680 }
7681 case Instruction::Load: {
7682 // Check that a vectorized load would load the same memory as a scalar
7683 // load. For example, we don't want to vectorize loads that are smaller
7684 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7685 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7686 // from such a struct, we read/write packed bits disagreeing with the
7687 // unvectorized version.
7688 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7689 case LoadsState::Vectorize:
7690 return TreeEntry::Vectorize;
7691 case LoadsState::ScatterVectorize:
7692 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7693 // Delay slow vectorized nodes for better vectorization attempts.
7694 LoadEntriesToVectorize.insert(VectorizableTree.size());
7695 return TreeEntry::NeedToGather;
7696 }
7697 return TreeEntry::ScatterVectorize;
7698 case LoadsState::StridedVectorize:
7699 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7700 // Delay slow vectorized nodes for better vectorization attempts.
7701 LoadEntriesToVectorize.insert(VectorizableTree.size());
7702 return TreeEntry::NeedToGather;
7703 }
7704 return TreeEntry::StridedVectorize;
7705 case LoadsState::Gather:
7706#ifndef NDEBUG
7707 Type *ScalarTy = VL0->getType();
7708 if (DL->getTypeSizeInBits(ScalarTy) !=
7709 DL->getTypeAllocSizeInBits(ScalarTy))
7710 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7711 else if (any_of(VL, [](Value *V) {
7712 auto *LI = dyn_cast<LoadInst>(V);
7713 return !LI || !LI->isSimple();
7714 }))
7715 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7716 else
7717 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7718#endif // NDEBUG
7719 registerNonVectorizableLoads(VL);
7720 return TreeEntry::NeedToGather;
7721 }
7722 llvm_unreachable("Unexpected state of loads");
7723 }
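// Roughly: consecutive loads (p[0], p[1], p[2], p[3]) come back as
// LoadsState::Vectorize, loads with a common constant stride as
// StridedVectorize, loads from arbitrary but vectorizable pointers as
// ScatterVectorize (masked gather), and everything else as Gather.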
7724 case Instruction::ZExt:
7725 case Instruction::SExt:
7726 case Instruction::FPToUI:
7727 case Instruction::FPToSI:
7728 case Instruction::FPExt:
7729 case Instruction::PtrToInt:
7730 case Instruction::IntToPtr:
7731 case Instruction::SIToFP:
7732 case Instruction::UIToFP:
7733 case Instruction::Trunc:
7734 case Instruction::FPTrunc:
7735 case Instruction::BitCast: {
7736 Type *SrcTy = VL0->getOperand(0)->getType();
7737 for (Value *V : VL) {
7738 if (isa<PoisonValue>(V))
7739 continue;
7740 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7741 if (Ty != SrcTy || !isValidElementType(Ty)) {
7742 LLVM_DEBUG(
7743 dbgs() << "SLP: Gathering casts with different src types.\n");
7744 return TreeEntry::NeedToGather;
7745 }
7746 }
7747 return TreeEntry::Vectorize;
7748 }
7749 case Instruction::ICmp:
7750 case Instruction::FCmp: {
7751 // Check that all of the compares have the same predicate.
7752 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7753 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7754 Type *ComparedTy = VL0->getOperand(0)->getType();
7755 for (Value *V : VL) {
7756 if (isa<PoisonValue>(V))
7757 continue;
7758 auto *Cmp = cast<CmpInst>(V);
7759 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7760 Cmp->getOperand(0)->getType() != ComparedTy) {
7761 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7762 return TreeEntry::NeedToGather;
7763 }
7764 }
7765 return TreeEntry::Vectorize;
7766 }
7767 case Instruction::Select:
7768 case Instruction::FNeg:
7769 case Instruction::Add:
7770 case Instruction::FAdd:
7771 case Instruction::Sub:
7772 case Instruction::FSub:
7773 case Instruction::Mul:
7774 case Instruction::FMul:
7775 case Instruction::UDiv:
7776 case Instruction::SDiv:
7777 case Instruction::FDiv:
7778 case Instruction::URem:
7779 case Instruction::SRem:
7780 case Instruction::FRem:
7781 case Instruction::Shl:
7782 case Instruction::LShr:
7783 case Instruction::AShr:
7784 case Instruction::And:
7785 case Instruction::Or:
7786 case Instruction::Xor:
7787 case Instruction::Freeze:
7788 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7789 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7790 auto *I = dyn_cast<Instruction>(V);
7791 return I && I->isBinaryOp() && !I->isFast();
7792 }))
7793 return TreeEntry::NeedToGather;
7794 return TreeEntry::Vectorize;
7795 case Instruction::GetElementPtr: {
7796 // We don't combine GEPs with complicated (nested) indexing.
7797 for (Value *V : VL) {
7798 auto *I = dyn_cast<GetElementPtrInst>(V);
7799 if (!I)
7800 continue;
7801 if (I->getNumOperands() != 2) {
7802 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7803 return TreeEntry::NeedToGather;
7804 }
7805 }
7806
7807 // We can't combine several GEPs into one vector if they operate on
7808 // different types.
7809 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7810 for (Value *V : VL) {
7811 auto *GEP = dyn_cast<GEPOperator>(V);
7812 if (!GEP)
7813 continue;
7814 Type *CurTy = GEP->getSourceElementType();
7815 if (Ty0 != CurTy) {
7816 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7817 return TreeEntry::NeedToGather;
7818 }
7819 }
7820
7821 // We don't combine GEPs with non-constant indexes.
7822 Type *Ty1 = VL0->getOperand(1)->getType();
7823 for (Value *V : VL) {
7824 auto *I = dyn_cast<GetElementPtrInst>(V);
7825 if (!I)
7826 continue;
7827 auto *Op = I->getOperand(1);
7828 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7829 (Op->getType() != Ty1 &&
7830 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7831 Op->getType()->getScalarSizeInBits() >
7832 DL->getIndexSizeInBits(
7833 V->getType()->getPointerAddressSpace())))) {
7834 LLVM_DEBUG(
7835 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7836 return TreeEntry::NeedToGather;
7837 }
7838 }
7839
7840 return TreeEntry::Vectorize;
7841 }
7842 case Instruction::Store: {
7843 // Check if the stores are consecutive or if we need to swizzle them.
7844 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7845 // Avoid types that are padded when being allocated as scalars, while
7846 // being packed together in a vector (such as i1).
7847 if (DL->getTypeSizeInBits(ScalarTy) !=
7848 DL->getTypeAllocSizeInBits(ScalarTy)) {
7849 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7850 return TreeEntry::NeedToGather;
7851 }
7852 // Make sure all stores in the bundle are simple - we can't vectorize
7853 // atomic or volatile stores.
7854 for (Value *V : VL) {
7855 auto *SI = cast<StoreInst>(V);
7856 if (!SI->isSimple()) {
7857 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7858 return TreeEntry::NeedToGather;
7859 }
7860 PointerOps.push_back(SI->getPointerOperand());
7861 }
7862
7863 // Check the order of pointer operands.
7864 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7865 Value *Ptr0;
7866 Value *PtrN;
7867 if (CurrentOrder.empty()) {
7868 Ptr0 = PointerOps.front();
7869 PtrN = PointerOps.back();
7870 } else {
7871 Ptr0 = PointerOps[CurrentOrder.front()];
7872 PtrN = PointerOps[CurrentOrder.back()];
7873 }
7874 std::optional<int> Dist =
7875 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7876 // Check that the sorted pointer operands are consecutive.
7877 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7878 return TreeEntry::Vectorize;
7879 }
7880
7881 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7882 return TreeEntry::NeedToGather;
7883 }
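// For example, stores to p[1], p[0], p[3], p[2] are still accepted:
// sortPtrAccesses() recovers the sorted order, and the distance between the
// first and last sorted pointers is VL.size() - 1, i.e. the accesses are
// consecutive once reordered.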
7884 case Instruction::Call: {
7885 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7886 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7887 auto *I = dyn_cast<Instruction>(V);
7888 return I && !I->isFast();
7889 }))
7890 return TreeEntry::NeedToGather;
7891 // Check if the calls are all to the same vectorizable intrinsic or
7892 // library function.
7893 CallInst *CI = cast<CallInst>(VL0);
7894 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7895
7896 VFShape Shape = VFShape::get(
7897 CI->getFunctionType(),
7898 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7899 false /*HasGlobalPred*/);
7900 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7901
7902 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7903 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7904 return TreeEntry::NeedToGather;
7905 }
7906 Function *F = CI->getCalledFunction();
7907 unsigned NumArgs = CI->arg_size();
7908 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7909 for (unsigned J = 0; J != NumArgs; ++J)
7910 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
7911 ScalarArgs[J] = CI->getArgOperand(J);
7912 for (Value *V : VL) {
7913 CallInst *CI2 = dyn_cast<CallInst>(V);
7914 if (!CI2 || CI2->getCalledFunction() != F ||
7915 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7916 (VecFunc &&
7917 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7918 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7919 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7920 << "\n");
7921 return TreeEntry::NeedToGather;
7922 }
7923 // Some intrinsics have scalar arguments, and these must be the same for
7924 // the calls to be vectorized.
7925 for (unsigned J = 0; J != NumArgs; ++J) {
7926 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
7927 Value *A1J = CI2->getArgOperand(J);
7928 if (ScalarArgs[J] != A1J) {
7929 LLVM_DEBUG(dbgs()
7930 << "SLP: mismatched arguments in call:" << *CI
7931 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7932 return TreeEntry::NeedToGather;
7933 }
7934 }
7935 }
7936 // Verify that the bundle operands are identical between the two calls.
7937 if (CI->hasOperandBundles() &&
7938 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7939 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7940 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7941 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7942 << "!=" << *V << '\n');
7943 return TreeEntry::NeedToGather;
7944 }
7945 }
7946
7947 return TreeEntry::Vectorize;
7948 }
7949 case Instruction::ShuffleVector: {
7950 if (!S.isAltShuffle()) {
7951 // REVEC can support non-alternate shuffles.
7952 if (SLPReVec && getShufflevectorNumGroups(VL))
7953 return TreeEntry::Vectorize;
7954 // If this is not an alternate sequence of opcodes like add-sub,
7955 // then do not vectorize this instruction.
7956 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7957 return TreeEntry::NeedToGather;
7958 }
7959 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7960 LLVM_DEBUG(
7961 dbgs()
7962 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7963 "the whole alt sequence is not profitable.\n");
7964 return TreeEntry::NeedToGather;
7965 }
7966
7967 return TreeEntry::Vectorize;
7968 }
7969 default:
7970 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7971 return TreeEntry::NeedToGather;
7972 }
7973}
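// In short, getScalarsVectorizationState() only classifies how a bundle could
// be vectorized (Vectorize, ScatterVectorize, StridedVectorize) or that it
// must be gathered; the tree entries themselves are created later in
// buildTree_rec().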
7974
7975namespace {
7976/// Allows to correctly handle operands of the phi nodes based on the \p Main
7977/// PHINode order of incoming basic blocks/values.
7978class PHIHandler {
7979 DominatorTree &DT;
7980 PHINode *Main = nullptr;
7981 SmallVector<Value *> Phis;
7982 SmallVector<SmallVector<Value *>> Operands;
7983
7984public:
7985 PHIHandler() = delete;
7986 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7987 : DT(DT), Main(Main), Phis(Phis),
7988 Operands(Main->getNumIncomingValues(),
7989 SmallVector<Value *>(Phis.size(), nullptr)) {}
7990 void buildOperands() {
7991 constexpr unsigned FastLimit = 4;
7992 if (Main->getNumIncomingValues() <= FastLimit) {
7993 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7994 BasicBlock *InBB = Main->getIncomingBlock(I);
7995 if (!DT.isReachableFromEntry(InBB)) {
7996 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7997 continue;
7998 }
7999 // Prepare the operand vector.
8000 for (auto [Idx, V] : enumerate(Phis)) {
8001 auto *P = dyn_cast<PHINode>(V);
8002 if (!P) {
8003 assert(isa<PoisonValue>(V) &&
8004 "Expected isa instruction or poison value.");
8005 Operands[I][Idx] = V;
8006 continue;
8007 }
8008 if (P->getIncomingBlock(I) == InBB)
8009 Operands[I][Idx] = P->getIncomingValue(I);
8010 else
8011 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
8012 }
8013 }
8014 return;
8015 }
8016 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
8017 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8018 BasicBlock *InBB = Main->getIncomingBlock(I);
8019 if (!DT.isReachableFromEntry(InBB)) {
8020 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8021 continue;
8022 }
8023 Blocks.try_emplace(InBB).first->second.push_back(I);
8024 }
8025 for (auto [Idx, V] : enumerate(Phis)) {
8026 if (isa<PoisonValue>(V)) {
8027 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
8028 Operands[I][Idx] = V;
8029 continue;
8030 }
8031 auto *P = cast<PHINode>(V);
8032 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
8033 BasicBlock *InBB = P->getIncomingBlock(I);
8034 if (InBB == Main->getIncomingBlock(I)) {
8035 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8036 continue;
8037 Operands[I][Idx] = P->getIncomingValue(I);
8038 continue;
8039 }
8040 auto It = Blocks.find(InBB);
8041 if (It == Blocks.end())
8042 continue;
8043 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
8044 }
8045 }
8046 for (const auto &P : Blocks) {
8047 if (P.getSecond().size() <= 1)
8048 continue;
8049 unsigned BasicI = P.getSecond().front();
8050 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
8051 assert(all_of(enumerate(Operands[I]),
8052 [&](const auto &Data) {
8053 return !Data.value() ||
8054 Data.value() == Operands[BasicI][Data.index()];
8055 }) &&
8056 "Expected empty operands list.");
8057 Operands[I] = Operands[BasicI];
8058 }
8059 }
8060 }
8061 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8062};
8063} // namespace
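// A rough usage sketch of PHIHandler, as used by the PHI case in
// buildTree_rec() below: construct it with the dominator tree, the main PHI
// and the bundle, call buildOperands(), then query getOperands(I) for the
// operand vector of each incoming-block position I.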
8064
8065void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8066 const EdgeInfo &UserTreeIdx,
8067 unsigned InterleaveFactor) {
8068 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8069
8070 SmallVector<int> ReuseShuffleIndices;
8071 SmallVector<Value *> UniqueValues;
8072 SmallVector<Value *> NonUniqueValueVL;
8073 auto TryToFindDuplicates = [&](const InstructionsState &S,
8074 bool DoNotFail = false) {
8075 // Check that every instruction appears once in this bundle.
8076 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8077 for (Value *V : VL) {
8078 if (isConstant(V)) {
8079 ReuseShuffleIndices.emplace_back(
8080 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8081 UniqueValues.emplace_back(V);
8082 continue;
8083 }
8084 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8085 ReuseShuffleIndices.emplace_back(Res.first->second);
8086 if (Res.second)
8087 UniqueValues.emplace_back(V);
8088 }
8089 size_t NumUniqueScalarValues = UniqueValues.size();
8090 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8091 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8092 if (NumUniqueScalarValues == VL.size() &&
8093 (VectorizeNonPowerOf2 || IsFullVectors)) {
8094 ReuseShuffleIndices.clear();
8095 } else {
8096 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8097 if ((UserTreeIdx.UserTE &&
8098 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8099 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8100 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8101 "for nodes with padding.\n");
8102 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8103 return false;
8104 }
8105 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8106 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8107 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8108 return isa<UndefValue>(V) || !isConstant(V);
8109 }))) {
8110 if (DoNotFail && UniquePositions.size() > 1 &&
8111 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8112 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8113 // Find the number of elements which form full vectors.
8114 unsigned PWSz = getFullVectorNumberOfElements(
8115 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8116 if (PWSz == VL.size()) {
8117 ReuseShuffleIndices.clear();
8118 } else {
8119 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8120 NonUniqueValueVL.append(
8121 PWSz - UniqueValues.size(),
8122 PoisonValue::get(UniqueValues.front()->getType()));
8123 // Check that the operations, extended with poison values, are still valid
8124 // for vectorization (div/rem are not allowed).
8125 if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8126 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8127 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8128 return false;
8129 }
8130 VL = NonUniqueValueVL;
8131 }
8132 return true;
8133 }
8134 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8135 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8136 return false;
8137 }
8138 VL = UniqueValues;
8139 }
8140 return true;
8141 };
8142
8143 InstructionsState S = getSameOpcode(VL, *TLI);
8144
8145 // Don't go into catchswitch blocks, which can happen with PHIs.
8146 // Such blocks can only have PHIs and the catchswitch. There is no
8147 // place to insert a shuffle if we need to, so just avoid that issue.
8148 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8149 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8150 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8151 return;
8152 }
8153
8154 // Check if this is a duplicate of another entry.
8155 if (S) {
8156 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8157 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8158 << ".\n");
8159 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8160 auto It = MultiNodeScalars.find(S.getMainOp());
8161 if (It != MultiNodeScalars.end()) {
8162 auto *TEIt = find_if(It->getSecond(),
8163 [&](TreeEntry *ME) { return ME->isSame(VL); });
8164 if (TEIt != It->getSecond().end())
8165 E = *TEIt;
8166 else
8167 E = nullptr;
8168 } else {
8169 E = nullptr;
8170 }
8171 }
8172 if (!E) {
8173 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8174 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8175 if (TryToFindDuplicates(S))
8176 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8177 ReuseShuffleIndices);
8178 return;
8179 }
8180 SmallPtrSet<const TreeEntry *, 4> Nodes;
8181 Nodes.insert(getTreeEntry(S.getMainOp()));
8182 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8183 Nodes.insert(E);
8184 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8185 if (any_of(Nodes, [&](const TreeEntry *E) {
8186 if (all_of(E->Scalars,
8187 [&](Value *V) { return Values.contains(V); }))
8188 return true;
8189 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8190 E->Scalars.end());
8191 return (
8192 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8193 })) {
8194 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8195 if (TryToFindDuplicates(S))
8196 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8197 ReuseShuffleIndices);
8198 return;
8199 }
8200 } else {
8201 // Record the reuse of the tree node. FIXME, currently this is only
8202 // used to properly draw the graph rather than for the actual
8203 // vectorization.
8204 E->UserTreeIndices.push_back(UserTreeIdx);
8205 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8206 << ".\n");
8207 return;
8208 }
8209 }
8210 }
8211
8212 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8213 // a load), in which case peek through to include it in the tree, without
8214 // ballooning over-budget.
8215 if (Depth >= RecursionMaxDepth &&
8216 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8217 (match(S.getMainOp(), m_Load(m_Value())) ||
8218 all_of(VL, [&S](const Value *I) {
8219 return match(I,
8220 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8221 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8222 })))) {
8223 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8224 if (TryToFindDuplicates(S))
8225 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8226 ReuseShuffleIndices);
8227 return;
8228 }
8229
8230 // Don't handle scalable vectors
8231 if (S && S.getOpcode() == Instruction::ExtractElement &&
8232 isa<ScalableVectorType>(
8233 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8234 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8235 if (TryToFindDuplicates(S))
8236 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8237 ReuseShuffleIndices);
8238 return;
8239 }
8240
8241 // Don't handle vectors.
8242 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8243 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8244 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8245 return;
8246 }
8247
8248 // If all of the operands are identical or constant we have a simple solution.
8249 // If we deal with insert/extract instructions, they all must have constant
8250 // indices, otherwise we should gather them, not try to vectorize.
8251 // If this is an alternate op node with 2 elements and gathered operands,
8252 // do not vectorize.
8253 auto &&NotProfitableForVectorization = [&S, this,
8254 Depth](ArrayRef<Value *> VL) {
8255 if (!S || !S.isAltShuffle() || VL.size() > 2)
8256 return false;
8257 if (VectorizableTree.size() < MinTreeSize)
8258 return false;
8259 if (Depth >= RecursionMaxDepth - 1)
8260 return true;
8261 // Check if all operands are extracts, part of a vector node, or can build a
8262 // regular vectorizable node.
8263 SmallVector<unsigned, 8> InstsCount;
8264 for (Value *V : VL) {
8265 auto *I = cast<Instruction>(V);
8266 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8267 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8268 }));
8269 }
8270 bool IsCommutative =
8271 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8272 if ((IsCommutative &&
8273 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8274 (!IsCommutative &&
8275 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8276 return true;
8277 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8278 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8279 auto *I1 = cast<Instruction>(VL.front());
8280 auto *I2 = cast<Instruction>(VL.back());
8281 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8282 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8283 I2->getOperand(Op));
8284 if (static_cast<unsigned>(count_if(
8285 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8286 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8287 })) >= S.getMainOp()->getNumOperands() / 2)
8288 return false;
8289 if (S.getMainOp()->getNumOperands() > 2)
8290 return true;
8291 if (IsCommutative) {
8292 // Check permuted operands.
8293 Candidates.clear();
8294 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8295 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8296 I2->getOperand((Op + 1) % E));
8297 if (any_of(
8298 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8299 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8300 }))
8301 return false;
8302 }
8303 return true;
8304 };
8305 SmallVector<unsigned> SortedIndices;
8306 BasicBlock *BB = nullptr;
8307 bool IsScatterVectorizeUserTE =
8308 UserTreeIdx.UserTE &&
8309 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8310 bool AreAllSameBlock = S && allSameBlock(VL);
8311 bool AreScatterAllGEPSameBlock =
8312 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8313 VL.size() > 2 &&
8314 all_of(VL,
8315 [&BB](Value *V) {
8316 auto *I = dyn_cast<GetElementPtrInst>(V);
8317 if (!I)
8318 return doesNotNeedToBeScheduled(V);
8319 if (!BB)
8320 BB = I->getParent();
8321 return BB == I->getParent() && I->getNumOperands() == 2;
8322 }) &&
8323 BB &&
8324 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8325 SortedIndices));
8326 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8327 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8328 (S &&
8329 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8330 S.getMainOp()) &&
8331 !all_of(VL, isVectorLikeInstWithConstOps)) ||
8332 NotProfitableForVectorization(VL)) {
8333 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8334 if (TryToFindDuplicates(S))
8335 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8336 ReuseShuffleIndices);
8337 return;
8338 }
8339
8340 // Don't vectorize ephemeral values.
8341 if (S && !EphValues.empty()) {
8342 for (Value *V : VL) {
8343 if (EphValues.count(V)) {
8344 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8345 << ") is ephemeral.\n");
8346 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8347 return;
8348 }
8349 }
8350 }
8351
8352 // We now know that this is a vector of instructions of the same type from
8353 // the same block.
8354
8355 // Check that none of the instructions in the bundle are already in the tree.
8356 for (Value *V : VL) {
8357 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8358 doesNotNeedToBeScheduled(V))
8359 continue;
8360 if (getTreeEntry(V)) {
8361 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8362 << ") is already in tree.\n");
8363 if (TryToFindDuplicates(S))
8364 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8365 ReuseShuffleIndices);
8366 return;
8367 }
8368 }
8369
8370 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8371 if (UserIgnoreList && !UserIgnoreList->empty()) {
8372 for (Value *V : VL) {
8373 if (UserIgnoreList->contains(V)) {
8374 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8375 if (TryToFindDuplicates(S))
8376 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8377 ReuseShuffleIndices);
8378 return;
8379 }
8380 }
8381 }
8382
8383 // Special processing for sorted pointers for ScatterVectorize node with
8384 // constant indices only.
8385 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8386 assert(VL.front()->getType()->isPointerTy() &&
8387 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8388 "Expected pointers only.");
8389 // Reset S to make it GetElementPtr kind of node.
8390 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8391 assert(It != VL.end() && "Expected at least one GEP.");
8392 S = getSameOpcode(*It, *TLI);
8393 }
8394
8395 // Check that all of the users of the scalars that we want to vectorize are
8396 // schedulable.
8397 Instruction *VL0 = S.getMainOp();
8398 BB = VL0->getParent();
8399
8400 if (S &&
8401 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8402 !DT->isReachableFromEntry(BB))) {
8403 // Don't go into unreachable blocks. They may contain instructions with
8404 // dependency cycles which confuse the final scheduling.
8405 // Do not vectorize EH and non-returning blocks, not profitable in most
8406 // cases.
8407 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8408 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8409 return;
8410 }
8411
8412 // Check that every instruction appears once in this bundle.
8413 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8414 return;
8415
8416 // Perform specific checks for each particular instruction kind.
8417 OrdersType CurrentOrder;
8418 SmallVector<Value *> PointerOps;
8419 TreeEntry::EntryState State = getScalarsVectorizationState(
8420 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8421 if (State == TreeEntry::NeedToGather) {
8422 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8423 ReuseShuffleIndices);
8424 return;
8425 }
8426
8427 auto &BSRef = BlocksSchedules[BB];
8428 if (!BSRef)
8429 BSRef = std::make_unique<BlockScheduling>(BB);
8430
8431 BlockScheduling &BS = *BSRef;
8432
8433 std::optional<ScheduleData *> Bundle =
8434 BS.tryScheduleBundle(UniqueValues, this, S);
8435#ifdef EXPENSIVE_CHECKS
8436 // Make sure we didn't break any internal invariants
8437 BS.verify();
8438#endif
8439 if (!Bundle) {
8440 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8441 assert((!BS.getScheduleData(VL0) ||
8442 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8443 "tryScheduleBundle should cancelScheduling on failure");
8444 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8445 ReuseShuffleIndices);
8446 NonScheduledFirst.insert(VL.front());
8447 if (S.getOpcode() == Instruction::Load &&
8448 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8449 registerNonVectorizableLoads(VL);
8450 return;
8451 }
8452 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8453
8454 unsigned ShuffleOrOp =
8455 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8456 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8457 // Postpone PHI nodes creation
8458 SmallVector<unsigned> PHIOps;
8459 for (unsigned I : seq<unsigned>(Operands.size())) {
8460 ArrayRef<Value *> Op = Operands[I];
8461 if (Op.empty())
8462 continue;
8463 InstructionsState S = getSameOpcode(Op, *TLI);
8464 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8465 buildTree_rec(Op, Depth + 1, {TE, I});
8466 else
8467 PHIOps.push_back(I);
8468 }
8469 for (unsigned I : PHIOps)
8470 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8471 };
8472 switch (ShuffleOrOp) {
8473 case Instruction::PHI: {
8474 auto *PH = cast<PHINode>(VL0);
8475
8476 TreeEntry *TE =
8477 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8478 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8479 TE->dump());
8480
8481 // Keeps the reordered operands to avoid code duplication.
8482 PHIHandler Handler(*DT, PH, VL);
8483 Handler.buildOperands();
8484 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8485 TE->setOperand(I, Handler.getOperands(I));
8486 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8487 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8488 Operands[I] = Handler.getOperands(I);
8489 CreateOperandNodes(TE, Operands);
8490 return;
8491 }
8492 case Instruction::ExtractValue:
8493 case Instruction::ExtractElement: {
8494 if (CurrentOrder.empty()) {
8495 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8496 } else {
8497 LLVM_DEBUG({
8498 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8499 "with order";
8500 for (unsigned Idx : CurrentOrder)
8501 dbgs() << " " << Idx;
8502 dbgs() << "\n";
8503 });
8504 fixupOrderingIndices(CurrentOrder);
8505 }
8506 // Insert new order with initial value 0, if it does not exist,
8507 // otherwise return the iterator to the existing one.
8508 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8509 ReuseShuffleIndices, CurrentOrder);
8510 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8511 "(ExtractValueInst/ExtractElementInst).\n";
8512 TE->dump());
8513 // This is a special case, as it does not gather, but at the same time
8514 // we are not extending buildTree_rec() towards the operands.
8515 TE->setOperand(*this);
8516 return;
8517 }
8518 case Instruction::InsertElement: {
8519 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8520
8521 auto OrdCompare = [](const std::pair<int, int> &P1,
8522 const std::pair<int, int> &P2) {
8523 return P1.first > P2.first;
8524 };
8525 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8526 decltype(OrdCompare)>
8527 Indices(OrdCompare);
8528 for (int I = 0, E = VL.size(); I < E; ++I) {
8529 unsigned Idx = *getElementIndex(VL[I]);
8530 Indices.emplace(Idx, I);
8531 }
8532 OrdersType CurrentOrder(VL.size(), VL.size());
8533 bool IsIdentity = true;
8534 for (int I = 0, E = VL.size(); I < E; ++I) {
8535 CurrentOrder[Indices.top().second] = I;
8536 IsIdentity &= Indices.top().second == I;
8537 Indices.pop();
8538 }
8539 if (IsIdentity)
8540 CurrentOrder.clear();
8541 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8542 {}, CurrentOrder);
8543 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8544 TE->dump());
8545
8546 TE->setOperand(*this);
8547 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8548 return;
8549 }
8550 case Instruction::Load: {
8551 // Check that a vectorized load would load the same memory as a scalar
8552 // load. For example, we don't want to vectorize loads that are smaller
8553 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8554 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8555 // from such a struct, we read/write packed bits disagreeing with the
8556 // unvectorized version.
8557 TreeEntry *TE = nullptr;
8558 fixupOrderingIndices(CurrentOrder);
8559 switch (State) {
8560 case TreeEntry::Vectorize:
8561 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8562 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8563 if (CurrentOrder.empty())
8564 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8565 TE->dump());
8566 else
8567 LLVM_DEBUG(dbgs()
8568 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8569 TE->dump());
8570 break;
8571 case TreeEntry::StridedVectorize:
8572 // Vectorizing non-consecutive loads with strided memory accesses.
8573 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8574 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8575 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8576 TE->dump());
8577 break;
8578 case TreeEntry::ScatterVectorize:
8579 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8580 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8581 UserTreeIdx, ReuseShuffleIndices);
8582 LLVM_DEBUG(
8583 dbgs()
8584 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8585 TE->dump());
8586 break;
8587 case TreeEntry::CombinedVectorize:
8588 case TreeEntry::NeedToGather:
8589 llvm_unreachable("Unexpected loads state.");
8590 }
8591 TE->setOperand(*this);
8592 if (State == TreeEntry::ScatterVectorize)
8593 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8594 return;
8595 }
8596 case Instruction::ZExt:
8597 case Instruction::SExt:
8598 case Instruction::FPToUI:
8599 case Instruction::FPToSI:
8600 case Instruction::FPExt:
8601 case Instruction::PtrToInt:
8602 case Instruction::IntToPtr:
8603 case Instruction::SIToFP:
8604 case Instruction::UIToFP:
8605 case Instruction::Trunc:
8606 case Instruction::FPTrunc:
8607 case Instruction::BitCast: {
8608 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8609 std::make_pair(std::numeric_limits<unsigned>::min(),
8610 std::numeric_limits<unsigned>::max()));
8611 if (ShuffleOrOp == Instruction::ZExt ||
8612 ShuffleOrOp == Instruction::SExt) {
8613 CastMaxMinBWSizes = std::make_pair(
8614 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8615 PrevMaxBW),
8616 std::min<unsigned>(
8617 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8618 PrevMinBW));
8619 } else if (ShuffleOrOp == Instruction::Trunc) {
8620 CastMaxMinBWSizes = std::make_pair(
8621 std::max<unsigned>(
8622 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8623 PrevMaxBW),
8624 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8625 PrevMinBW));
8626 }
8627 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8628 ReuseShuffleIndices);
8629 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8630 TE->dump());
8631
8632 TE->setOperand(*this);
8633 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8634 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8635 if (ShuffleOrOp == Instruction::Trunc) {
8636 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8637 } else if (ShuffleOrOp == Instruction::SIToFP ||
8638 ShuffleOrOp == Instruction::UIToFP) {
8639 unsigned NumSignBits =
8640 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8641 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8642 APInt Mask = DB->getDemandedBits(OpI);
8643 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8644 }
8645 if (NumSignBits * 2 >=
8646 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8647 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8648 }
8649 return;
8650 }
8651 case Instruction::ICmp:
8652 case Instruction::FCmp: {
8653 // Check that all of the compares have the same predicate.
8654 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8655 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8656 ReuseShuffleIndices);
8657 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8658 TE->dump());
8659
8660 ValueList Left, Right;
8661 VLOperands Ops(VL, S, *this);
8662 if (cast<CmpInst>(VL0)->isCommutative()) {
8663 // Commutative predicate - collect + sort operands of the instructions
8664 // so that each side is more likely to have the same opcode.
8665 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8666 "Commutative Predicate mismatch");
8667 Ops.reorder();
8668 Left = Ops.getVL(0);
8669 Right = Ops.getVL(1);
8670 } else {
8671 // Collect operands - commute if it uses the swapped predicate.
8672 for (Value *V : VL) {
8673 if (isa<PoisonValue>(V)) {
8674 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8675 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8676 continue;
8677 }
8678 auto *Cmp = cast<CmpInst>(V);
8679 Value *LHS = Cmp->getOperand(0);
8680 Value *RHS = Cmp->getOperand(1);
8681 if (Cmp->getPredicate() != P0)
8682 std::swap(LHS, RHS);
8683 Left.push_back(LHS);
8684 Right.push_back(RHS);
8685 }
8686 }
8687 TE->setOperand(0, Left);
8688 TE->setOperand(1, Right);
8689 buildTree_rec(Left, Depth + 1, {TE, 0});
8690 buildTree_rec(Right, Depth + 1, {TE, 1});
8691 if (ShuffleOrOp == Instruction::ICmp) {
8692 unsigned NumSignBits0 =
8693 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8694 if (NumSignBits0 * 2 >=
8695 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8696 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8697 unsigned NumSignBits1 =
8698 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8699 if (NumSignBits1 * 2 >=
8700 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8701 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8702 }
8703 return;
8704 }
8705 case Instruction::Select:
8706 case Instruction::FNeg:
8707 case Instruction::Add:
8708 case Instruction::FAdd:
8709 case Instruction::Sub:
8710 case Instruction::FSub:
8711 case Instruction::Mul:
8712 case Instruction::FMul:
8713 case Instruction::UDiv:
8714 case Instruction::SDiv:
8715 case Instruction::FDiv:
8716 case Instruction::URem:
8717 case Instruction::SRem:
8718 case Instruction::FRem:
8719 case Instruction::Shl:
8720 case Instruction::LShr:
8721 case Instruction::AShr:
8722 case Instruction::And:
8723 case Instruction::Or:
8724 case Instruction::Xor:
8725 case Instruction::Freeze: {
8726 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8727 ReuseShuffleIndices);
8728 LLVM_DEBUG(
8729 dbgs() << "SLP: added a new TreeEntry "
8730 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8731 TE->dump());
8732
8733 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8734 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8735 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8736 return;
8737 }
8738 case Instruction::GetElementPtr: {
8739 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8740 ReuseShuffleIndices);
8741 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8742 TE->dump());
8743 SmallVector<ValueList, 2> Operands(2);
8744 // Prepare the operand vector for pointer operands.
8745 for (Value *V : VL) {
8746 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8747 if (!GEP) {
8748 Operands.front().push_back(V);
8749 continue;
8750 }
8751 Operands.front().push_back(GEP->getPointerOperand());
8752 }
8753 TE->setOperand(0, Operands.front());
8754 // Need to cast all indices to the same type before vectorization to
8755 // avoid crash.
8756 // Required to be able to find correct matches between different gather
8757 // nodes and reuse the vectorized values rather than trying to gather them
8758 // again.
8759 int IndexIdx = 1;
8760 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8761 Type *Ty = all_of(VL,
8762 [VL0Ty, IndexIdx](Value *V) {
8763 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8764 if (!GEP)
8765 return true;
8766 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8767 })
8768 ? VL0Ty
8769 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8770 ->getPointerOperandType()
8771 ->getScalarType());
8772 // Prepare the operand vector.
8773 for (Value *V : VL) {
8774 auto *I = dyn_cast<GetElementPtrInst>(V);
8775 if (!I) {
8776 Operands.back().push_back(
8777 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8778 continue;
8779 }
8780 auto *Op = I->getOperand(IndexIdx);
8781 auto *CI = dyn_cast<ConstantInt>(Op);
8782 if (!CI)
8783 Operands.back().push_back(Op);
8784 else
8785 Operands.back().push_back(ConstantFoldIntegerCast(
8786 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8787 }
8788 TE->setOperand(IndexIdx, Operands.back());
8789
8790 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8791 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8792 return;
8793 }
8794 case Instruction::Store: {
8795 bool Consecutive = CurrentOrder.empty();
8796 if (!Consecutive)
8797 fixupOrderingIndices(CurrentOrder);
8798 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8799 ReuseShuffleIndices, CurrentOrder);
8800 if (Consecutive)
8801 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8802 TE->dump());
8803 else
8804 LLVM_DEBUG(
8805 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8806 TE->dump());
8807 TE->setOperand(*this);
8808 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8809 return;
8810 }
8811 case Instruction::Call: {
8812 // Check if the calls are all to the same vectorizable intrinsic or
8813 // library function.
8814 CallInst *CI = cast<CallInst>(VL0);
8815 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8816
8817 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8818 ReuseShuffleIndices);
8819 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8820 TE->dump());
8821 TE->setOperand(*this, isCommutative(VL0));
8822 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8823 // For scalar operands there is no need to create an entry, since there is
8824 // nothing to vectorize.
8825 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
8826 continue;
8827 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8828 }
8829 return;
8830 }
8831 case Instruction::ShuffleVector: {
8832 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8833 ReuseShuffleIndices);
8834 if (S.isAltShuffle()) {
8835 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8836 TE->dump());
8837 } else {
8838 assert(SLPReVec && "Only supported by REVEC.");
8839 LLVM_DEBUG(
8840 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8841 TE->dump());
8842 }
8843
8844 // Reorder operands if reordering would enable vectorization.
8845 auto *CI = dyn_cast<CmpInst>(VL0);
8846 if (CI && any_of(VL, [](Value *V) {
8847 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8848 })) {
8849 auto *MainCI = cast<CmpInst>(S.getMainOp());
8850 auto *AltCI = cast<CmpInst>(S.getAltOp());
8851 CmpInst::Predicate MainP = MainCI->getPredicate();
8852 CmpInst::Predicate AltP = AltCI->getPredicate();
8853 assert(MainP != AltP &&
8854 "Expected different main/alternate predicates.");
8856 // Collect operands - commute if it uses the swapped predicate or
8857 // alternate operation.
8858 for (Value *V : VL) {
8859 if (isa<PoisonValue>(V)) {
8860 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8861 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8862 continue;
8863 }
8864 auto *Cmp = cast<CmpInst>(V);
8865 Value *LHS = Cmp->getOperand(0);
8866 Value *RHS = Cmp->getOperand(1);
8867
8868 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8869 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8870 std::swap(LHS, RHS);
8871 } else {
8872 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8873 std::swap(LHS, RHS);
8874 }
8875 Left.push_back(LHS);
8876 Right.push_back(RHS);
8877 }
8878 TE->setOperand(0, Left);
8879 TE->setOperand(1, Right);
8880 buildTree_rec(Left, Depth + 1, {TE, 0});
8881 buildTree_rec(Right, Depth + 1, {TE, 1});
8882 return;
8883 }
8884
8885 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8886 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8887 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8888 return;
8889 }
8890 default:
8891 break;
8892 }
8893 llvm_unreachable("Unexpected vectorization of the instructions.");
8894}
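// Illustrative example of the simplest path through buildTree_rec(): four
// consecutive stores such as
//   store float %a, ptr %p
//   store float %b, ptr %p1   ; %p1 = getelementptr float, ptr %p, i64 1
//   store float %c, ptr %p2   ; %p2 = getelementptr float, ptr %p, i64 2
//   store float %d, ptr %p3   ; %p3 = getelementptr float, ptr %p, i64 3
// form a single Vectorize entry for the stores, and the recursion then
// descends into the stored values as operand 0.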
8895
8896 unsigned BoUpSLP::canMapToVector(Type *T) const {
8897 unsigned N = 1;
8898 Type *EltTy = T;
8899
8900 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8901 if (EltTy->isEmptyTy())
8902 return 0;
8903 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8904 // Check that struct is homogeneous.
8905 for (const auto *Ty : ST->elements())
8906 if (Ty != *ST->element_begin())
8907 return 0;
8908 N *= ST->getNumElements();
8909 EltTy = *ST->element_begin();
8910 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8911 N *= AT->getNumElements();
8912 EltTy = AT->getElementType();
8913 } else {
8914 auto *VT = cast<FixedVectorType>(EltTy);
8915 N *= VT->getNumElements();
8916 EltTy = VT->getElementType();
8917 }
8918 }
8919
8920 if (!isValidElementType(EltTy))
8921 return 0;
8922 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8923 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8924 VTSize != DL->getTypeStoreSizeInBits(T))
8925 return 0;
8926 return N;
8927}
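// For example, with the default register size limits a homogeneous aggregate
// such as { float, float, float, float } maps to N = 4 (its store size matches
// that of <4 x float>), while a mixed struct like { float, i32 } returns 0.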
8928
8929bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
8930 SmallVectorImpl<unsigned> &CurrentOrder,
8931 bool ResizeAllowed) const {
8932 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8933 assert(It != VL.end() && "Expected at least one extract instruction.");
8934 auto *E0 = cast<Instruction>(*It);
8935 assert(
8936 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8937 "Invalid opcode");
8938 // Check if all of the extracts come from the same vector and from the
8939 // correct offset.
8940 Value *Vec = E0->getOperand(0);
8941
8942 CurrentOrder.clear();
8943
8944 // We have to extract from a vector/aggregate with the same number of elements.
8945 unsigned NElts;
8946 if (E0->getOpcode() == Instruction::ExtractValue) {
8947 NElts = canMapToVector(Vec->getType());
8948 if (!NElts)
8949 return false;
8950 // Check if load can be rewritten as load of vector.
8951 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8952 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8953 return false;
8954 } else {
8955 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8956 }
8957
8958 unsigned E = VL.size();
8959 if (!ResizeAllowed && NElts != E)
8960 return false;
8961 SmallVector<int> Indices(E, PoisonMaskElem);
8962 unsigned MinIdx = NElts, MaxIdx = 0;
8963 for (auto [I, V] : enumerate(VL)) {
8964 auto *Inst = dyn_cast<Instruction>(V);
8965 if (!Inst)
8966 continue;
8967 if (Inst->getOperand(0) != Vec)
8968 return false;
8969 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8970 if (isa<UndefValue>(EE->getIndexOperand()))
8971 continue;
8972 std::optional<unsigned> Idx = getExtractIndex(Inst);
8973 if (!Idx)
8974 return false;
8975 const unsigned ExtIdx = *Idx;
8976 if (ExtIdx >= NElts)
8977 continue;
8978 Indices[I] = ExtIdx;
8979 if (MinIdx > ExtIdx)
8980 MinIdx = ExtIdx;
8981 if (MaxIdx < ExtIdx)
8982 MaxIdx = ExtIdx;
8983 }
8984 if (MaxIdx - MinIdx + 1 > E)
8985 return false;
8986 if (MaxIdx + 1 <= E)
8987 MinIdx = 0;
8988
8989 // Check that all of the indices extract from the correct offset.
8990 bool ShouldKeepOrder = true;
8991 // Assign to all items the initial value E so we can check if the extract
8992 // instruction index was used already.
8993 // Also, later we can check that all the indices are used and we have a
8994 // consecutive access in the extract instructions, by checking that no
8995 // element of CurrentOrder still has value E.
8996 CurrentOrder.assign(E, E);
8997 for (unsigned I = 0; I < E; ++I) {
8998 if (Indices[I] == PoisonMaskElem)
8999 continue;
9000 const unsigned ExtIdx = Indices[I] - MinIdx;
9001 if (CurrentOrder[ExtIdx] != E) {
9002 CurrentOrder.clear();
9003 return false;
9004 }
9005 ShouldKeepOrder &= ExtIdx == I;
9006 CurrentOrder[ExtIdx] = I;
9007 }
9008 if (ShouldKeepOrder)
9009 CurrentOrder.clear();
9010
9011 return ShouldKeepOrder;
9012}
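// For example, extracts of lanes 0..3 of the same <4 x float> source in that
// order clear CurrentOrder and return true, while the permuted order
// 2, 3, 0, 1 returns false with CurrentOrder holding the required reordering.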
9013
9014bool BoUpSLP::areAllUsersVectorized(
9015 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
9016 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
9017 all_of(I->users(), [this](User *U) {
9018 return ScalarToTreeEntry.contains(U) ||
9019 isVectorLikeInstWithConstOps(U) ||
9020 (isa<ExtractElementInst>(U) && MustGather.contains(U));
9021 });
9022}
9023
9024static std::pair<InstructionCost, InstructionCost>
9025 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9026 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9027 ArrayRef<Type *> ArgTys) {
9028 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9029
9030 // Calculate the cost of the scalar and vector calls.
9031 FastMathFlags FMF;
9032 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9033 FMF = FPCI->getFastMathFlags();
9034 SmallVector<const Value *> Arguments(CI->args());
9035 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
9036 dyn_cast<IntrinsicInst>(CI));
9037 auto IntrinsicCost =
9038 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9039
9040 auto Shape = VFShape::get(CI->getFunctionType(),
9041 ElementCount::getFixed(VecTy->getNumElements()),
9042 false /*HasGlobalPred*/);
9043 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9044 auto LibCost = IntrinsicCost;
9045 if (!CI->isNoBuiltin() && VecFunc) {
9046 // Calculate the cost of the vector library call.
9047 // If the corresponding vector call is cheaper, return its cost.
9048 LibCost =
9049 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9050 }
9051 return {IntrinsicCost, LibCost};
9052}
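// Callers compare the two returned costs: e.g. for a 4-lane llvm.sqrt.f32
// bundle, the first member is the cost of the llvm.sqrt.v4f32 intrinsic and
// the second is the cost of a matching vector-library routine, if VFDatabase
// knows of one.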
9053
9054void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9055 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
9056 SmallVectorImpl<Value *> *OpScalars,
9057 SmallVectorImpl<Value *> *AltScalars) const {
9058 unsigned Sz = Scalars.size();
9059 Mask.assign(Sz, PoisonMaskElem);
9060 SmallVector<int> OrderMask;
9061 if (!ReorderIndices.empty())
9062 inversePermutation(ReorderIndices, OrderMask);
9063 for (unsigned I = 0; I < Sz; ++I) {
9064 unsigned Idx = I;
9065 if (!ReorderIndices.empty())
9066 Idx = OrderMask[I];
9067 if (isa<PoisonValue>(Scalars[Idx]))
9068 continue;
9069 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9070 if (IsAltOp(OpInst)) {
9071 Mask[I] = Sz + Idx;
9072 if (AltScalars)
9073 AltScalars->push_back(OpInst);
9074 } else {
9075 Mask[I] = Idx;
9076 if (OpScalars)
9077 OpScalars->push_back(OpInst);
9078 }
9079 }
9080 if (!ReuseShuffleIndices.empty()) {
9081 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9082 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9083 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9084 });
9085 Mask.swap(NewMask);
9086 }
9087}
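// For example, buildAltOpShuffleMask() over Scalars = {add, sub, add, sub}
// with IsAltOp selecting the subs produces Mask = {0, Sz + 1, 2, Sz + 3},
// i.e. even lanes come from the main (add) vector and odd lanes from the
// alternate (sub) vector.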
9088
9089 static bool isAlternateInstruction(const Instruction *I,
9090 const Instruction *MainOp,
9091 const Instruction *AltOp,
9092 const TargetLibraryInfo &TLI) {
9093 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9094 auto *AltCI = cast<CmpInst>(AltOp);
9095 CmpInst::Predicate MainP = MainCI->getPredicate();
9096 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
9097 assert(MainP != AltP && "Expected different main/alternate predicates.");
9098 auto *CI = cast<CmpInst>(I);
9099 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9100 return false;
9101 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9102 return true;
9103 CmpInst::Predicate P = CI->getPredicate();
9104 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9105
9106 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9107 "CmpInst expected to match either main or alternate predicate or "
9108 "their swap.");
9109 return MainP != P && MainP != SwappedP;
9110 }
9111 return I->getOpcode() == AltOp->getOpcode();
9112}
9113
9114TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9115 assert(!Ops.empty());
9116 const auto *Op0 = Ops.front();
9117
9118 const bool IsConstant = all_of(Ops, [](Value *V) {
9119 // TODO: We should allow undef elements here
9120 return isConstant(V) && !isa<UndefValue>(V);
9121 });
9122 const bool IsUniform = all_of(Ops, [=](Value *V) {
9123 // TODO: We should allow undef elements here
9124 return V == Op0;
9125 });
9126 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9127 // TODO: We should allow undef elements here
9128 if (auto *CI = dyn_cast<ConstantInt>(V))
9129 return CI->getValue().isPowerOf2();
9130 return false;
9131 });
9132 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9133 // TODO: We should allow undef elements here
9134 if (auto *CI = dyn_cast<ConstantInt>(V))
9135 return CI->getValue().isNegatedPowerOf2();
9136 return false;
9137 });
9138
9139 TTI::OperandValueKind VK = TTI::OK_AnyValue;
9140 if (IsConstant && IsUniform)
9141 VK = TTI::OK_UniformConstantValue;
9142 else if (IsConstant)
9143 VK = TTI::OK_NonUniformConstantValue;
9144 else if (IsUniform)
9145 VK = TTI::OK_UniformValue;
9146
9147 TTI::OperandValueProperties VP = TTI::OP_None;
9148 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9149 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9150
9151 return {VK, VP};
9152}
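// Example (illustrative only): Ops = {4, 4, 4, 4} is classified as
// {OK_UniformConstantValue, OP_PowerOf2}; Ops = {1, 2, 4, 8} as
// {OK_NonUniformConstantValue, OP_PowerOf2}; and a set of distinct
// non-constant values as {OK_AnyValue, OP_None}.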
9153
9154namespace {
9155/// The base class for shuffle instruction emission and shuffle cost estimation.
9156class BaseShuffleAnalysis {
9157protected:
9158 Type *ScalarTy = nullptr;
9159
9160 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9161
9162 /// V is expected to be a vectorized value.
9163 /// When REVEC is disabled, there is no difference between VF and
9164 /// VNumElements.
9165 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9166 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9167 /// of 8.
9168 unsigned getVF(Value *V) const {
9169 assert(V && "V cannot be nullptr");
9170 assert(isa<FixedVectorType>(V->getType()) &&
9171 "V does not have FixedVectorType");
9172 assert(ScalarTy && "ScalarTy cannot be nullptr");
9173 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9174 unsigned VNumElements =
9175 cast<FixedVectorType>(V->getType())->getNumElements();
9176 assert(VNumElements > ScalarTyNumElements &&
9177 "the number of elements of V is not large enough");
9178 assert(VNumElements % ScalarTyNumElements == 0 &&
9179 "the number of elements of V is not a vectorized value");
9180 return VNumElements / ScalarTyNumElements;
9181 }
9182
9183 /// Checks if the mask is an identity mask.
9184 /// \param IsStrict if it is true, the function returns false if the mask
9185 /// size does not match the vector size.
9186 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9187 bool IsStrict) {
9188 int Limit = Mask.size();
9189 int VF = VecTy->getNumElements();
9190 int Index = -1;
9191 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9192 return true;
9193 if (!IsStrict) {
9194 // Consider extract subvector starting from index 0.
9195 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9196 Index == 0)
9197 return true;
9198 // All VF-size submasks are identity (e.g.
9199 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9200 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9201 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9202 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9203 ShuffleVectorInst::isIdentityMask(Slice, VF);
9204 }))
9205 return true;
9206 }
9207 return false;
9208 }
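 // Examples (illustrative only): for VF = 4, <0, 1, 2, 3> is an identity mask
 // in both modes; <0, 1> is accepted only when IsStrict is false (it is an
 // extract-subvector mask starting at index 0); and
 // <0, 1, 2, 3, poison, poison, poison, poison> is accepted in non-strict mode
 // because every VF-sized slice is either all-poison or an identity.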
9209
9210 /// Tries to combine 2 different masks into a single one.
9211 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9212 /// change the size of the vector, \p LocalVF is the original size of the
9213 /// shuffled vector.
9214 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9215 ArrayRef<int> ExtMask) {
9216 unsigned VF = Mask.size();
9217 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9218 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9219 if (ExtMask[I] == PoisonMaskElem)
9220 continue;
9221 int MaskedIdx = Mask[ExtMask[I] % VF];
9222 NewMask[I] =
9223 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9224 }
9225 Mask.swap(NewMask);
9226 }
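 // Worked example (illustrative only): with LocalVF = 4, Mask = <2, 3, 0, 1>
 // and ExtMask = <1, poison, 3, 2>, the result is <3, poison, 1, 0>: each
 // non-poison ExtMask[I] selects Mask[ExtMask[I] % VF], reduced modulo LocalVF.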
9227
9228 /// Looks through shuffles trying to reduce the final number of shuffles in
9229 /// the code. The function looks through the previously emitted shuffle
9230 /// instructions and properly marks indices in the mask as undef.
9231 /// For example, given the code
9232 /// \code
9233 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9234 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9235 /// \endcode
9236 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9237 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9238 /// <0, 1, 2, 3> for the shuffle.
9239 /// If 2 operands are of different size, the smallest one will be resized and
9240 /// the mask recalculated properly.
9241 /// For example, given the code
9242 /// \code
9243 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9244 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9245 /// \endcode
9246 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9247 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9248 /// <0, 1, 2, 3> for the shuffle.
9249 /// So, it tries to transform permutations to simple vector merge, if
9250 /// possible.
9251 /// \param V The input vector which must be shuffled using the given \p Mask.
9252 /// If the better candidate is found, \p V is set to this best candidate
9253 /// vector.
9254 /// \param Mask The input mask for the shuffle. If the best candidate is found
9255 /// during looking-through-shuffles attempt, it is updated accordingly.
9256 /// \param SinglePermute true if the shuffle operation is originally a
9257 /// single-value-permutation. In this case the look-through-shuffles procedure
9258 /// may look for resizing shuffles as the best candidates.
9259 /// \return true if the shuffle results in the non-resizing identity shuffle
9260 /// (and thus can be ignored), false - otherwise.
9261 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9262 bool SinglePermute) {
9263 Value *Op = V;
9264 ShuffleVectorInst *IdentityOp = nullptr;
9265 SmallVector<int> IdentityMask;
9266 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9267 // Exit if not a fixed vector type or changing size shuffle.
9268 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9269 if (!SVTy)
9270 break;
9271 // Remember the identity or broadcast mask, if it is not a resizing
9272 // shuffle. If no better candidates are found, this Op and Mask will be
9273 // used in the final shuffle.
9274 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9275 if (!IdentityOp || !SinglePermute ||
9276 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9277 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9278 IdentityMask.size()))) {
9279 IdentityOp = SV;
9280 // Store the current mask in IdentityMask so that we do not lose this
9281 // info if IdentityOp is selected as the best candidate for the
9282 // permutation.
9283 IdentityMask.assign(Mask);
9284 }
9285 }
9286 // Remember the broadcast mask. If no better candidates are found, this Op
9287 // and Mask will be used in the final shuffle.
9288 // Zero splat can be used as identity too, since it might be used with
9289 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9290 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
9291 // is expensive, and the analysis finds out that the source vector is just
9292 // a broadcast, the original mask can be transformed into the identity mask
9293 // <0, 1, 2, 3>.
9294 // \code
9295 // %0 = shuffle %v, poison, zeroinitializer
9296 // %res = shuffle %0, poison, <3, 1, 2, 0>
9297 // \endcode
9298 // may be transformed to
9299 // \code
9300 // %0 = shuffle %v, poison, zeroinitializer
9301 // %res = shuffle %0, poison, <0, 1, 2, 3>
9302 // \endcode
9303 if (SV->isZeroEltSplat()) {
9304 IdentityOp = SV;
9305 IdentityMask.assign(Mask);
9306 }
9307 int LocalVF = Mask.size();
9308 if (auto *SVOpTy =
9309 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9310 LocalVF = SVOpTy->getNumElements();
9311 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9312 for (auto [Idx, I] : enumerate(Mask)) {
9313 if (I == PoisonMaskElem ||
9314 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9315 continue;
9316 ExtMask[Idx] = SV->getMaskValue(I);
9317 }
9318 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9319 SV->getOperand(0),
9320 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9321 .all();
9322 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9323 SV->getOperand(1),
9324 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9325 .all();
9326 if (!IsOp1Undef && !IsOp2Undef) {
9327 // Update mask and mark undef elems.
9328 for (int &I : Mask) {
9329 if (I == PoisonMaskElem)
9330 continue;
9331 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9332 PoisonMaskElem)
9333 I = PoisonMaskElem;
9334 }
9335 break;
9336 }
9337 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9338 combineMasks(LocalVF, ShuffleMask, Mask);
9339 Mask.swap(ShuffleMask);
9340 if (IsOp2Undef)
9341 Op = SV->getOperand(0);
9342 else
9343 Op = SV->getOperand(1);
9344 }
9345 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9346 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9347 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9348 if (IdentityOp) {
9349 V = IdentityOp;
9350 assert(Mask.size() == IdentityMask.size() &&
9351 "Expected masks of same sizes.");
9352 // Clear known poison elements.
9353 for (auto [I, Idx] : enumerate(Mask))
9354 if (Idx == PoisonMaskElem)
9355 IdentityMask[I] = PoisonMaskElem;
9356 Mask.swap(IdentityMask);
9357 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9358 return SinglePermute &&
9359 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9360 /*IsStrict=*/true) ||
9361 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9362 Shuffle->isZeroEltSplat() &&
9363 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9364 }
9365 V = Op;
9366 return false;
9367 }
9368 V = Op;
9369 return true;
9370 }
9371
9372 /// Smart shuffle instruction emission, walks through shuffles trees and
9373 /// tries to find the best matching vector for the actual shuffle
9374 /// instruction.
9375 template <typename T, typename ShuffleBuilderTy>
9376 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9377 ShuffleBuilderTy &Builder) {
9378 assert(V1 && "Expected at least one vector value.");
9379 if (V2)
9380 Builder.resizeToMatch(V1, V2);
9381 int VF = Mask.size();
9382 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9383 VF = FTy->getNumElements();
9384 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9385 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9386 .all()) {
9387 // Peek through shuffles.
9388 Value *Op1 = V1;
9389 Value *Op2 = V2;
9390 int VF =
9391 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9392 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9393 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9394 for (int I = 0, E = Mask.size(); I < E; ++I) {
9395 if (Mask[I] < VF)
9396 CombinedMask1[I] = Mask[I];
9397 else
9398 CombinedMask2[I] = Mask[I] - VF;
9399 }
9400 Value *PrevOp1;
9401 Value *PrevOp2;
9402 do {
9403 PrevOp1 = Op1;
9404 PrevOp2 = Op2;
9405 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9406 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9407 // Check if we have 2 resizing shuffles - need to peek through operands
9408 // again.
9409 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9410 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9411 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9412 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9413 if (I == PoisonMaskElem)
9414 continue;
9415 ExtMask1[Idx] = SV1->getMaskValue(I);
9416 }
9417 SmallBitVector UseMask1 = buildUseMask(
9418 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9419 ->getNumElements(),
9420 ExtMask1, UseMask::SecondArg);
9421 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9422 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9423 if (I == PoisonMaskElem)
9424 continue;
9425 ExtMask2[Idx] = SV2->getMaskValue(I);
9426 }
9427 SmallBitVector UseMask2 = buildUseMask(
9428 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9429 ->getNumElements(),
9430 ExtMask2, UseMask::SecondArg);
9431 if (SV1->getOperand(0)->getType() ==
9432 SV2->getOperand(0)->getType() &&
9433 SV1->getOperand(0)->getType() != SV1->getType() &&
9434 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9435 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9436 Op1 = SV1->getOperand(0);
9437 Op2 = SV2->getOperand(0);
9438 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9439 int LocalVF = ShuffleMask1.size();
9440 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9441 LocalVF = FTy->getNumElements();
9442 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9443 CombinedMask1.swap(ShuffleMask1);
9444 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9445 LocalVF = ShuffleMask2.size();
9446 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9447 LocalVF = FTy->getNumElements();
9448 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9449 CombinedMask2.swap(ShuffleMask2);
9450 }
9451 }
9452 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9453 Builder.resizeToMatch(Op1, Op2);
9454 VF = std::max(cast<VectorType>(Op1->getType())
9455 ->getElementCount()
9456 .getKnownMinValue(),
9457 cast<VectorType>(Op2->getType())
9458 ->getElementCount()
9459 .getKnownMinValue());
9460 for (int I = 0, E = Mask.size(); I < E; ++I) {
9461 if (CombinedMask2[I] != PoisonMaskElem) {
9462 assert(CombinedMask1[I] == PoisonMaskElem &&
9463 "Expected undefined mask element");
9464 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9465 }
9466 }
9467 if (Op1 == Op2 &&
9468 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9469 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9470 isa<ShuffleVectorInst>(Op1) &&
9471 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9472 ArrayRef(CombinedMask1))))
9473 return Builder.createIdentity(Op1);
9474 return Builder.createShuffleVector(
9475 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9476 CombinedMask1);
9477 }
9478 if (isa<PoisonValue>(V1))
9479 return Builder.createPoison(
9480 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9481 SmallVector<int> NewMask(Mask);
9482 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9483 assert(V1 && "Expected non-null value after looking through shuffles.");
9484
9485 if (!IsIdentity)
9486 return Builder.createShuffleVector(V1, NewMask);
9487 return Builder.createIdentity(V1);
9488 }
9489
9490 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
9491 /// shuffle emission.
9492 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
9493 ArrayRef<int> Mask) {
9494 for (unsigned I : seq<unsigned>(CommonMask.size()))
9495 if (Mask[I] != PoisonMaskElem)
9496 CommonMask[I] = I;
9497 }
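 // Example (illustrative only): after emitting a shuffle for
 // Mask = <2, poison, 0, 1>, lanes 0, 2 and 3 of CommonMask are rewritten to
 // 0, 2 and 3 respectively (each now refers to its own position in the
 // just-emitted shuffle's result), while lane 1 is left unchanged.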
9498};
9499} // namespace
9500
9501/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
9502static std::pair<InstructionCost, InstructionCost>
9503getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9504 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9505 Type *ScalarTy, VectorType *VecTy) {
9506 InstructionCost ScalarCost = 0;
9507 InstructionCost VecCost = 0;
9508 // Here we differentiate two cases: (1) when Ptrs represent a regular
9509 // vectorization tree node (as they are pointer arguments of scattered
9510 // loads) or (2) when Ptrs are the arguments of loads or stores being
9511 // vectorized as a plain wide unit-stride load/store since all the
9512 // loads/stores are known to be from/to adjacent locations.
9513 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9514 // Case 2: estimate costs for pointer related costs when vectorizing to
9515 // a wide load/store.
9516 // Scalar cost is estimated as a set of pointers with known relationship
9517 // between them.
9518 // For vector code we will use BasePtr as argument for the wide load/store
9519 // but we also need to account all the instructions which are going to
9520 // stay in vectorized code due to uses outside of these scalar
9521 // loads/stores.
9522 ScalarCost = TTI.getPointersChainCost(
9523 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9524 CostKind);
9525
9526 SmallVector<const Value *> PtrsRetainedInVecCode;
9527 for (Value *V : Ptrs) {
9528 if (V == BasePtr) {
9529 PtrsRetainedInVecCode.push_back(V);
9530 continue;
9531 }
9532 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9533 // For simplicity, assume Ptr stays in vectorized code if it's not a
9534 // GEP instruction. We don't care since its cost is considered free.
9535 // TODO: We should check for any uses outside of vectorizable tree
9536 // rather than just single use.
9537 if (!Ptr || !Ptr->hasOneUse())
9538 PtrsRetainedInVecCode.push_back(V);
9539 }
9540
9541 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9542 // If all pointers stay in vectorized code then we don't have
9543 // any savings on that.
9544 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9545 }
9546 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9547 TTI::PointersChainInfo::getKnownStride(),
9548 VecTy, CostKind);
9549 } else {
9550 // Case 1: Ptrs are the arguments of loads that we are going to transform
9551 // into masked gather load intrinsic.
9552 // All the scalar GEPs will be removed as a result of vectorization.
9553 // For any external uses of some lanes extract element instructions will
9554 // be generated (which cost is estimated separately).
9555 TTI::PointersChainInfo PtrsInfo =
9556 all_of(Ptrs,
9557 [](const Value *V) {
9558 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9559 return Ptr && !Ptr->hasAllConstantIndices();
9560 })
9561 ? TTI::PointersChainInfo::getUnknownStride()
9562 : TTI::PointersChainInfo::getKnownStride();
9563
9564 ScalarCost =
9565 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9566 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9567 if (!BaseGEP) {
9568 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9569 if (It != Ptrs.end())
9570 BaseGEP = cast<GEPOperator>(*It);
9571 }
9572 if (BaseGEP) {
9573 SmallVector<const Value *> Indices(BaseGEP->indices());
9574 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9575 BaseGEP->getPointerOperand(), Indices, VecTy,
9576 CostKind);
9577 }
9578 }
9579
9580 return std::make_pair(ScalarCost, VecCost);
9581}
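// Illustrative note (interpretation, hedged): the returned pair is
// (ScalarCost, VecCost) for the pointer computations; a caller typically adds
// ScalarCost to the scalar side and VecCost to the vector side when comparing
// the two forms of the memory access.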
9582
9583void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9584 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9585 "Expected gather node without reordering.");
9586 MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
9587 SmallSet<size_t, 2> LoadKeyUsed;
9588
9589 // Do not reorder the node if it is small (just 2 elements), all-constant,
9590 // or if all its instructions already have the same opcode.
9591 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9592 all_of(TE.Scalars, isConstant))
9593 return;
9594
9595 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9596 return VectorizableTree[Idx]->isSame(TE.Scalars);
9597 }))
9598 return;
9599
9600 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9601 Key = hash_combine(hash_value(LI->getParent()), Key);
9602 Value *Ptr =
9603 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9604 if (LoadKeyUsed.contains(Key)) {
9605 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9606 if (LIt != LoadsMap.end()) {
9607 for (LoadInst *RLI : LIt->second) {
9608 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9609 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9610 /*StrictCheck=*/true))
9611 return hash_value(RLI->getPointerOperand());
9612 }
9613 for (LoadInst *RLI : LIt->second) {
9614 if (arePointersCompatible(RLI->getPointerOperand(),
9615 LI->getPointerOperand(), *TLI)) {
9616 hash_code SubKey = hash_value(RLI->getPointerOperand());
9617 return SubKey;
9618 }
9619 }
9620 if (LIt->second.size() > 2) {
9621 hash_code SubKey =
9622 hash_value(LIt->second.back()->getPointerOperand());
9623 return SubKey;
9624 }
9625 }
9626 }
9627 LoadKeyUsed.insert(Key);
9628 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9629 return hash_value(LI->getPointerOperand());
9630 };
9631 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9632 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
9633 bool IsOrdered = true;
9634 unsigned NumInstructions = 0;
9635 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9636 // nodes.
9637 for (auto [I, V] : enumerate(TE.Scalars)) {
9638 size_t Key = 1, Idx = 1;
9639 if (auto *Inst = dyn_cast<Instruction>(V);
9640 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9641 !isDeleted(Inst) && !isVectorized(V)) {
9642 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9643 /*AllowAlternate=*/false);
9644 ++NumInstructions;
9645 }
9646 auto &Container = SortedValues[Key];
9647 if (IsOrdered && !KeyToIndex.contains(V) &&
9648 !(isa<Constant, ExtractElementInst>(V) ||
9650 ((Container.contains(Idx) &&
9651 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9652 (!Container.empty() && !Container.contains(Idx) &&
9653 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9654 IsOrdered = false;
9655 auto &KTI = KeyToIndex[V];
9656 if (KTI.empty())
9657 Container[Idx].push_back(V);
9658 KTI.push_back(I);
9659 }
9660 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9661 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9662 if (!IsOrdered && NumInstructions > 1) {
9663 unsigned Cnt = 0;
9664 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9665 for (const auto &D : SortedValues) {
9666 for (const auto &P : D.second) {
9667 unsigned Sz = 0;
9668 for (Value *V : P.second) {
9669 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9670 for (auto [K, Idx] : enumerate(Indices)) {
9671 TE.ReorderIndices[Cnt + K] = Idx;
9672 TE.Scalars[Cnt + K] = V;
9673 }
9674 Sz += Indices.size();
9675 Cnt += Indices.size();
9676 }
9677 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9678 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9679 *TTI, TE.Scalars.front()->getType(), Sz);
9680 SubVectors.emplace_back(Cnt - Sz, SubVF);
9681 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9682 DemandedElts.clearBit(I);
9683 } else if (!P.second.empty() && isConstant(P.second.front())) {
9684 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9685 DemandedElts.clearBit(I);
9686 }
9687 }
9688 }
9689 }
9690 // Reuses always require shuffles, so consider it as profitable.
9691 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9692 return;
9693 // Do simple cost estimation.
9694 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9695 InstructionCost Cost = TTI::TCC_Free;
9696 auto *ScalarTy = TE.Scalars.front()->getType();
9697 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9698 for (auto [Idx, Sz] : SubVectors) {
9699 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9700 Idx, getWidenedType(ScalarTy, Sz));
9701 }
9702 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9703 assert(SLPReVec && "Only supported by REVEC.");
9704 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9705 // of CreateInsertElement.
9706 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9707 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9708 if (DemandedElts[I])
9709 Cost +=
9710 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9711 CostKind, I * ScalarTyNumElements, FTy);
9712 } else {
9713 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9714 /*Extract=*/false, CostKind);
9715 }
9716 int Sz = TE.Scalars.size();
9717 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9718 TE.ReorderIndices.end());
9719 for (unsigned I : seq<unsigned>(Sz)) {
9720 Value *V = TE.getOrdered(I);
9721 if (isa<PoisonValue>(V)) {
9722 ReorderMask[I] = PoisonMaskElem;
9723 } else if (isConstant(V) || DemandedElts[I]) {
9724 ReorderMask[I] = I + TE.ReorderIndices.size();
9725 }
9726 }
9727 Cost += ::getShuffleCost(*TTI,
9728 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9729 ? TTI::SK_PermuteTwoSrc
9730 : TTI::SK_PermuteSingleSrc,
9731 VecTy, ReorderMask);
9732 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9733 ReorderMask.assign(Sz, PoisonMaskElem);
9734 for (unsigned I : seq<unsigned>(Sz)) {
9735 Value *V = TE.getOrdered(I);
9736 if (isConstant(V)) {
9737 DemandedElts.clearBit(I);
9738 if (!isa<PoisonValue>(V))
9739 ReorderMask[I] = I;
9740 } else {
9741 ReorderMask[I] = I + Sz;
9742 }
9743 }
9744 InstructionCost BVCost = TTI->getScalarizationOverhead(
9745 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9746 if (!DemandedElts.isAllOnes())
9747 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9748 if (Cost >= BVCost) {
9749 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9750 reorderScalars(TE.Scalars, Mask);
9751 TE.ReorderIndices.clear();
9752 }
9753}
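// Illustrative summary: Cost models the clustered/reordered gather (subvector
// inserts plus one permute) and BVCost models a plain build-vector of the
// node; when Cost >= BVCost the scalars are permuted once by the computed mask
// and ReorderIndices is cleared, so no explicit reordering survives on the
// node.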
9754
9755void BoUpSLP::transformNodes() {
9756 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9757 BaseGraphSize = VectorizableTree.size();
9758 // Turn graph transforming mode on and off, when done.
9759 class GraphTransformModeRAAI {
9760 bool &SavedIsGraphTransformMode;
9761
9762 public:
9763 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9764 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9765 IsGraphTransformMode = true;
9766 }
9767 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9768 } TransformContext(IsGraphTransformMode);
9769 // Operands are profitable if they are:
9770 // 1. At least one constant
9771 // or
9772 // 2. Splats
9773 // or
9774 // 3. Results in good vectorization opportunity, i.e. may generate vector
9775 // nodes and reduce cost of the graph.
9776 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9777 const InstructionsState &S) {
9778 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9779 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9780 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9781 I2->getOperand(Op));
9782 return all_of(
9783 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9784 return all_of(Cand,
9785 [](const std::pair<Value *, Value *> &P) {
9786 return isa<Constant>(P.first) ||
9787 isa<Constant>(P.second) || P.first == P.second;
9788 }) ||
9789 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9790 });
9791 };
9792
9793 // Try to reorder gather nodes for better vectorization opportunities.
9794 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9795 TreeEntry &E = *VectorizableTree[Idx];
9796 if (E.isGather())
9797 reorderGatherNode(E);
9798 }
9799
9800 // The tree may grow here, so iterate over the nodes built before.
9801 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9802 TreeEntry &E = *VectorizableTree[Idx];
9803 if (E.isGather()) {
9804 ArrayRef<Value *> VL = E.Scalars;
9805 const unsigned Sz = getVectorElementSize(VL.front());
9806 unsigned MinVF = getMinVF(2 * Sz);
9807 // Do not try partial vectorization for small nodes (<= 2 elements), for
9808 // nodes with the same opcode in the same parent block, or for all constants.
9809 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9810 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9811 E.isAltShuffle() || !allSameBlock(VL)) ||
9812 allConstant(VL) || isSplat(VL))
9813 continue;
9814 // Try to find vectorizable sequences and transform them into a series of
9815 // insertvector instructions.
9816 unsigned StartIdx = 0;
9817 unsigned End = VL.size();
9818 for (unsigned VF = getFloorFullVectorNumberOfElements(
9819 *TTI, VL.front()->getType(), VL.size() - 1);
9820 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9821 *TTI, VL.front()->getType(), VF - 1)) {
9822 if (StartIdx + VF > End)
9823 continue;
9824 SmallVector<std::pair<unsigned, unsigned>> Slices;
9825 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9826 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9827 // If any instruction is vectorized already - do not try again.
9828 // Reuse the existing node, if it fully matches the slice.
9829 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9830 SE || getTreeEntry(Slice.back())) {
9831 if (!SE)
9832 continue;
9833 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9834 continue;
9835 }
9836 // Constants are already handled effectively - skip.
9837 if (allConstant(Slice))
9838 continue;
9839 // Do not try to vectorize small splats (smaller than a vector register
9840 // and with only a single non-undef element).
9841 bool IsSplat = isSplat(Slice);
9842 if (Slices.empty() || !IsSplat ||
9843 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9844 Slice.front()->getType(), VF)),
9845 1U, VF - 1) !=
9846 std::clamp(TTI->getNumberOfParts(getWidenedType(
9847 Slice.front()->getType(), 2 * VF)),
9848 1U, 2 * VF)) ||
9849 count(Slice, Slice.front()) ==
9850 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9851 : 1)) {
9852 if (IsSplat)
9853 continue;
9854 InstructionsState S = getSameOpcode(Slice, *TLI);
9855 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9856 (S.getOpcode() == Instruction::Load &&
9857 areKnownNonVectorizableLoads(Slice)) ||
9858 (S.getOpcode() != Instruction::Load &&
9859 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9860 continue;
9861 if (VF == 2) {
9862 // Try to vectorize reduced values or if all users are vectorized.
9863 // For expensive instructions extra extracts might be profitable.
9864 if ((!UserIgnoreList || E.Idx != 0) &&
9865 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9866 TTI::TCC_Expensive &&
9867 !all_of(Slice, [&](Value *V) {
9868 if (isa<PoisonValue>(V))
9869 return true;
9870 return areAllUsersVectorized(cast<Instruction>(V),
9871 UserIgnoreList);
9872 }))
9873 continue;
9874 if (S.getOpcode() == Instruction::Load) {
9875 OrdersType Order;
9876 SmallVector<Value *> PointerOps;
9877 LoadsState Res =
9878 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9879 // Do not vectorize gathers.
9880 if (Res == LoadsState::ScatterVectorize ||
9881 Res == LoadsState::Gather) {
9882 if (Res == LoadsState::Gather) {
9883 registerNonVectorizableLoads(Slice);
9884 // If reductions and the scalars from the root node are
9885 // analyzed - mark as non-vectorizable reduction.
9886 if (UserIgnoreList && E.Idx == 0)
9887 analyzedReductionVals(Slice);
9888 }
9889 continue;
9890 }
9891 } else if (S.getOpcode() == Instruction::ExtractElement ||
9892 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9893 TTI::TCC_Expensive &&
9894 !CheckOperandsProfitability(
9895 S.getMainOp(),
9896 cast<Instruction>(*find_if(reverse(Slice),
9897 IsaPred<Instruction>)),
9898 S))) {
9899 // Do not vectorize extractelements (handled effectively
9900 // already). Do not vectorize non-profitable instructions (with
9901 // low cost and non-vectorizable operands).
9902 continue;
9903 }
9904 }
9905 }
9906 Slices.emplace_back(Cnt, Slice.size());
9907 }
9908 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9909 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9910 if (StartIdx == Cnt)
9911 StartIdx = Cnt + Sz;
9912 if (End == Cnt + Sz)
9913 End = Cnt;
9914 };
9915 for (auto [Cnt, Sz] : Slices) {
9916 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9917 // If any instruction is vectorized already - do not try again.
9918 if (TreeEntry *SE = getTreeEntry(Slice.front());
9919 SE || getTreeEntry(Slice.back())) {
9920 if (!SE)
9921 continue;
9922 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9923 continue;
9924 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9925 AddCombinedNode(SE->Idx, Cnt, Sz);
9926 continue;
9927 }
9928 unsigned PrevSize = VectorizableTree.size();
9929 [[maybe_unused]] unsigned PrevEntriesSize =
9930 LoadEntriesToVectorize.size();
9931 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9932 if (PrevSize + 1 == VectorizableTree.size() &&
9933 VectorizableTree[PrevSize]->isGather() &&
9934 VectorizableTree[PrevSize]->hasState() &&
9935 VectorizableTree[PrevSize]->getOpcode() !=
9936 Instruction::ExtractElement &&
9937 !isSplat(Slice)) {
9938 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9939 analyzedReductionVals(Slice);
9940 VectorizableTree.pop_back();
9941 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9942 "LoadEntriesToVectorize expected to remain the same");
9943 continue;
9944 }
9945 AddCombinedNode(PrevSize, Cnt, Sz);
9946 }
9947 }
9948 // Restore ordering, if no extra vectorization happened.
9949 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9950 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9951 reorderScalars(E.Scalars, Mask);
9952 E.ReorderIndices.clear();
9953 }
9954 }
9955 if (!E.hasState())
9956 continue;
9957 switch (E.getOpcode()) {
9958 case Instruction::Load: {
9959 // No need to reorder masked gather loads, just reorder the scalar
9960 // operands.
9961 if (E.State != TreeEntry::Vectorize)
9962 break;
9963 Type *ScalarTy = E.getMainOp()->getType();
9964 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9965 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9966 // Check if profitable to represent consecutive load + reverse as strided
9967 // load with stride -1.
9968 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9969 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9970 SmallVector<int> Mask;
9971 inversePermutation(E.ReorderIndices, Mask);
9972 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9973 InstructionCost OriginalVecCost =
9974 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9975 BaseLI->getPointerAddressSpace(), CostKind,
9976 TTI::OperandValueInfo()) +
9977 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9978 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9979 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9980 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9981 if (StridedCost < OriginalVecCost)
9982 // Strided load is more profitable than consecutive load + reverse -
9983 // transform the node to strided load.
9984 E.State = TreeEntry::StridedVectorize;
9985 }
9986 break;
9987 }
9988 case Instruction::Store: {
9989 Type *ScalarTy =
9990 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9991 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9992 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9993 // Check if profitable to represent consecutive store + reverse as strided
9994 // store with stride -1.
9995 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9996 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9997 SmallVector<int> Mask;
9998 inversePermutation(E.ReorderIndices, Mask);
9999 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
10000 InstructionCost OriginalVecCost =
10001 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
10002 BaseSI->getPointerAddressSpace(), CostKind,
10003 TTI::OperandValueInfo()) +
10004 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
10005 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
10006 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10007 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
10008 if (StridedCost < OriginalVecCost)
10009 // Strided store is more profitable than reverse + consecutive store -
10010 // transform the node to strided store.
10011 E.State = TreeEntry::StridedVectorize;
10012 } else if (!E.ReorderIndices.empty()) {
10013 // Check for interleaved stores.
10014 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
10015 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
10016 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
10017 if (Mask.size() < 4)
10018 return 0u;
10019 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
10020 if (ShuffleVectorInst::isInterleaveMask(
10021 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
10022 TTI.isLegalInterleavedAccessType(
10023 VecTy, Factor, BaseSI->getAlign(),
10024 BaseSI->getPointerAddressSpace()))
10025 return Factor;
10026 }
10027
10028 return 0u;
10029 };
10030 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10031 unsigned InterleaveFactor = IsInterleaveMask(Mask);
10032 if (InterleaveFactor != 0)
10033 E.setInterleave(InterleaveFactor);
10034 }
10035 break;
10036 }
10037 case Instruction::Select: {
10038 if (E.State != TreeEntry::Vectorize)
10039 break;
10040 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
10041 if (MinMaxID == Intrinsic::not_intrinsic)
10042 break;
10043 // This node is a minmax node.
10044 E.CombinedOp = TreeEntry::MinMax;
10045 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10046 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10047 CondEntry->State == TreeEntry::Vectorize) {
10048 // The condition node is part of the combined minmax node.
10049 CondEntry->State = TreeEntry::CombinedVectorize;
10050 }
10051 break;
10052 }
10053 default:
10054 break;
10055 }
10056 }
10057
10058 if (LoadEntriesToVectorize.empty()) {
10059 // Single load node - exit.
10060 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10061 VectorizableTree.front()->getOpcode() == Instruction::Load)
10062 return;
10063 // Small graph with small VF - exit.
10064 constexpr unsigned SmallTree = 3;
10065 constexpr unsigned SmallVF = 2;
10066 if ((VectorizableTree.size() <= SmallTree &&
10067 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10068 (VectorizableTree.size() <= 2 && UserIgnoreList))
10069 return;
10070
10071 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10072 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
10073 getCanonicalGraphSize() <= SmallTree &&
10074 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10075 [](const std::unique_ptr<TreeEntry> &TE) {
10076 return TE->isGather() && TE->hasState() &&
10077 TE->getOpcode() == Instruction::Load &&
10078 !allSameBlock(TE->Scalars);
10079 }) == 1)
10080 return;
10081 }
10082
10083 // A list of loads to be gathered during the vectorization process. We can
10084 // try to vectorize them at the end, if profitable.
10085 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
10086 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
10087 GatheredLoads;
10088
10089 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10090 TreeEntry &E = *TE;
10091 if (E.isGather() &&
10092 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10093 (!E.hasState() && any_of(E.Scalars,
10094 [&](Value *V) {
10095 return isa<LoadInst>(V) &&
10096 !isVectorized(V) &&
10097 !isDeleted(cast<Instruction>(V));
10098 }))) &&
10099 !isSplat(E.Scalars)) {
10100 for (Value *V : E.Scalars) {
10101 auto *LI = dyn_cast<LoadInst>(V);
10102 if (!LI)
10103 continue;
10104 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10105 continue;
10106 gatherPossiblyVectorizableLoads(
10107 *this, V, *DL, *SE, *TTI,
10108 GatheredLoads[std::make_tuple(
10109 LI->getParent(),
10110 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
10111 LI->getType())]);
10112 }
10113 }
10114 }
10115 // Try to vectorize gathered loads if this is not just a gather of loads.
10116 if (!GatheredLoads.empty())
10117 tryToVectorizeGatheredLoads(GatheredLoads);
10118}
10119
10120/// Merges shuffle masks and emits final shuffle instruction, if required. It
10121/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
10122/// when the actual shuffle instruction is generated only if this is actually
10123/// required. Otherwise, the shuffle instruction emission is delayed till the
10124/// end of the process, to reduce the number of emitted instructions and further
10125/// analysis/transformations.
10126class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10127 bool IsFinalized = false;
10128 SmallVector<int> CommonMask;
10129 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10130 const TargetTransformInfo &TTI;
10131 InstructionCost Cost = 0;
10132 SmallDenseSet<Value *> VectorizedVals;
10133 BoUpSLP &R;
10134 SmallPtrSetImpl<Value *> &CheckedExtracts;
10135 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10136 /// While set, we are still trying to estimate the cost for the same nodes and
10137 /// can delay the actual cost estimation (virtual shuffle instruction emission).
10138 /// This may help to better estimate the cost if the same nodes must be permuted
10139 /// and allows moving most of the long shuffle cost estimation to TTI.
10140 bool SameNodesEstimated = true;
10141
10142 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10143 if (Ty->getScalarType()->isPointerTy()) {
10144 Constant *Res = ConstantExpr::getIntToPtr(
10145 Constant::getAllOnesValue(
10146 IntegerType::get(Ty->getContext(),
10147 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10148 Ty->getScalarType());
10149 if (auto *VTy = dyn_cast<VectorType>(Ty))
10150 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10151 return Res;
10152 }
10153 return Constant::getAllOnesValue(Ty);
10154 }
10155
10156 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10157 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10158 return TTI::TCC_Free;
10159 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10160 InstructionCost GatherCost = 0;
10161 SmallVector<Value *> Gathers(VL);
10162 if (!Root && isSplat(VL)) {
10163 // Found a broadcast of a single scalar; calculate the cost as
10164 // a broadcast.
10165 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10166 assert(It != VL.end() && "Expected at least one non-undef value.");
10167 // Add broadcast for non-identity shuffle only.
10168 bool NeedShuffle =
10169 count(VL, *It) > 1 &&
10170 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10171 if (!NeedShuffle) {
10172 if (isa<FixedVectorType>(ScalarTy)) {
10173 assert(SLPReVec && "FixedVectorType is not expected.");
10174 return TTI.getShuffleCost(
10175 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10176 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10177 cast<FixedVectorType>(ScalarTy));
10178 }
10179 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10180 CostKind, std::distance(VL.begin(), It),
10181 PoisonValue::get(VecTy), *It);
10182 }
10183
10184 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10185 transform(VL, ShuffleMask.begin(), [](Value *V) {
10186 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10187 });
10188 InstructionCost InsertCost =
10189 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10190 PoisonValue::get(VecTy), *It);
10191 return InsertCost + ::getShuffleCost(TTI,
10192 TTI::SK_Broadcast,
10193 VecTy, ShuffleMask, CostKind,
10194 /*Index=*/0, /*SubTp=*/nullptr,
10195 /*Args=*/*It);
10196 }
10197 return GatherCost +
10198 (all_of(Gathers, IsaPred<UndefValue>)
10199 ? TTI::TCC_Free
10200 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10201 ScalarTy));
10202 };
10203
10204 /// Compute the cost of creating a vector containing the extracted values from
10205 /// \p VL.
10206 InstructionCost
10207 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10208 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10209 unsigned NumParts) {
10210 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10211 unsigned NumElts =
10212 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10213 auto *EE = dyn_cast<ExtractElementInst>(V);
10214 if (!EE)
10215 return Sz;
10216 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10217 if (!VecTy)
10218 return Sz;
10219 return std::max(Sz, VecTy->getNumElements());
10220 });
10221 // FIXME: this must be moved to TTI for better estimation.
10222 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10223 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10224 SmallVectorImpl<unsigned> &Indices)
10225 -> std::optional<TTI::ShuffleKind> {
10226 if (NumElts <= EltsPerVector)
10227 return std::nullopt;
10228 int OffsetReg0 =
10229 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10230 [](int S, int I) {
10231 if (I == PoisonMaskElem)
10232 return S;
10233 return std::min(S, I);
10234 }),
10235 EltsPerVector);
10236 int OffsetReg1 = OffsetReg0;
10237 DenseSet<int> RegIndices;
10238 // Check if we are trying to permute the same single/2 input vectors.
10239 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10240 int FirstRegId = -1;
10241 Indices.assign(1, OffsetReg0);
10242 for (auto [Pos, I] : enumerate(Mask)) {
10243 if (I == PoisonMaskElem)
10244 continue;
10245 int Idx = I - OffsetReg0;
10246 int RegId =
10247 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10248 if (FirstRegId < 0)
10249 FirstRegId = RegId;
10250 RegIndices.insert(RegId);
10251 if (RegIndices.size() > 2)
10252 return std::nullopt;
10253 if (RegIndices.size() == 2) {
10254 ShuffleKind = TTI::SK_PermuteTwoSrc;
10255 if (Indices.size() == 1) {
10256 OffsetReg1 = alignDown(
10257 std::accumulate(
10258 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10259 [&](int S, int I) {
10260 if (I == PoisonMaskElem)
10261 return S;
10262 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10263 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10264 if (RegId == FirstRegId)
10265 return S;
10266 return std::min(S, I);
10267 }),
10268 EltsPerVector);
10269 Indices.push_back(OffsetReg1 % NumElts);
10270 }
10271 Idx = I - OffsetReg1;
10272 }
10273 I = (Idx % NumElts) % EltsPerVector +
10274 (RegId == FirstRegId ? 0 : EltsPerVector);
10275 }
10276 return ShuffleKind;
10277 };
10278 InstructionCost Cost = 0;
10279
10280 // Process extracts in blocks of EltsPerVector to check if the source vector
10281 // operand can be re-used directly. If not, add the cost of creating a
10282 // shuffle to extract the values into a vector register.
10283 for (unsigned Part : seq<unsigned>(NumParts)) {
10284 if (!ShuffleKinds[Part])
10285 continue;
10286 ArrayRef<int> MaskSlice = Mask.slice(
10287 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10288 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10289 copy(MaskSlice, SubMask.begin());
10290 SmallVector<unsigned, 2> Indices;
10291 std::optional<TTI::ShuffleKind> RegShuffleKind =
10292 CheckPerRegistersShuffle(SubMask, Indices);
10293 if (!RegShuffleKind) {
10294 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10295 !ShuffleVectorInst::isIdentityMask(
10296 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10297 Cost +=
10298 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10299 getWidenedType(ScalarTy, NumElts), MaskSlice);
10300 continue;
10301 }
10302 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10303 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10304 Cost +=
10305 ::getShuffleCost(TTI, *RegShuffleKind,
10306 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10307 }
10308 const unsigned BaseVF = getFullVectorNumberOfElements(
10309 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10310 for (unsigned Idx : Indices) {
10311 assert((Idx + EltsPerVector) <= BaseVF &&
10312 "SK_ExtractSubvector index out of range");
10313 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10314 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10315 Idx, getWidenedType(ScalarTy, EltsPerVector));
10316 }
10317 // Second attempt to check if just a permute is estimated cheaper than a
10318 // subvector extract.
10319 SubMask.assign(NumElts, PoisonMaskElem);
10320 copy(MaskSlice, SubMask.begin());
10321 InstructionCost OriginalCost = ::getShuffleCost(
10322 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10323 if (OriginalCost < Cost)
10324 Cost = OriginalCost;
10325 }
10326 return Cost;
10327 }
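 // Worked example (illustrative only): for 8 extracted values split into
 // NumParts = 2 parts of EltsPerVector = 4, a sub-mask such as <4, 5, 6, 7>
 // that maps onto a single source register is normalized to an identity and
 // charged only as an SK_ExtractSubvector at that register's offset, while a
 // sub-mask touching more than two registers falls back to the full-width
 // shuffle cost for that part.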
10328 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10329 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10330 /// elements.
10331 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10332 ArrayRef<int> Mask, unsigned Part,
10333 unsigned SliceSize) {
10334 if (SameNodesEstimated) {
10335 // Delay the cost estimation if the same nodes are being reshuffled.
10336 // If we already requested the cost of reshuffling of E1 and E2 before, no
10337 // need to estimate another cost with the sub-Mask, instead include this
10338 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10339 // estimation.
10340 if ((InVectors.size() == 2 &&
10341 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10342 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10343 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10344 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10345 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10346 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10347 "Expected all poisoned elements.");
10348 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10349 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10350 return;
10351 }
10352 // Found non-matching nodes - need to estimate the cost for the matched
10353 // nodes and transform the mask.
10354 Cost += createShuffle(InVectors.front(),
10355 InVectors.size() == 1 ? nullptr : InVectors.back(),
10356 CommonMask);
10357 transformMaskAfterShuffle(CommonMask, CommonMask);
10358 } else if (InVectors.size() == 2) {
10359 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10360 transformMaskAfterShuffle(CommonMask, CommonMask);
10361 }
10362 SameNodesEstimated = false;
10363 if (!E2 && InVectors.size() == 1) {
10364 unsigned VF = E1.getVectorFactor();
10365 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10366 VF = std::max(VF,
10367 cast<FixedVectorType>(V1->getType())->getNumElements());
10368 } else {
10369 const auto *E = cast<const TreeEntry *>(InVectors.front());
10370 VF = std::max(VF, E->getVectorFactor());
10371 }
10372 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10373 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10374 CommonMask[Idx] = Mask[Idx] + VF;
10375 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10376 transformMaskAfterShuffle(CommonMask, CommonMask);
10377 } else {
10378 auto P = InVectors.front();
10379 Cost += createShuffle(&E1, E2, Mask);
10380 unsigned VF = Mask.size();
10381 if (Value *V1 = P.dyn_cast<Value *>()) {
10382 VF = std::max(VF,
10383 getNumElements(V1->getType()));
10384 } else {
10385 const auto *E = cast<const TreeEntry *>(P);
10386 VF = std::max(VF, E->getVectorFactor());
10387 }
10388 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10389 if (Mask[Idx] != PoisonMaskElem)
10390 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10391 Cost += createShuffle(P, InVectors.front(), CommonMask);
10392 transformMaskAfterShuffle(CommonMask, CommonMask);
10393 }
10394 }
10395
10396 class ShuffleCostBuilder {
10397 const TargetTransformInfo &TTI;
10398
10399 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10400 int Index = -1;
10401 return Mask.empty() ||
10402 (VF == Mask.size() &&
10403 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10404 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10405 Index == 0);
10406 }
10407
10408 public:
10409 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10410 ~ShuffleCostBuilder() = default;
10411 InstructionCost createShuffleVector(Value *V1, Value *,
10412 ArrayRef<int> Mask) const {
10413 // Empty mask or identity mask are free.
10414 unsigned VF =
10415 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10416 if (isEmptyOrIdentity(Mask, VF))
10417 return TTI::TCC_Free;
10418 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10419 cast<VectorType>(V1->getType()), Mask);
10420 }
10421 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10422 // Empty mask or identity mask are free.
10423 unsigned VF =
10424 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10425 if (isEmptyOrIdentity(Mask, VF))
10426 return TTI::TCC_Free;
10427 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10428 cast<VectorType>(V1->getType()), Mask);
10429 }
10430 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10431 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10432 return TTI::TCC_Free;
10433 }
10434 void resizeToMatch(Value *&, Value *&) const {}
10435 };
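 // Illustrative note: ShuffleCostBuilder is effectively the cost-model
 // counterpart of the instruction-emitting shuffle builder used at codegen
 // time; identity/empty masks and poison results are modeled as TCC_Free, so
 // only genuine single- or two-source permutes contribute to the estimated
 // cost.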
10436
10437 /// Smart shuffle instruction emission, walks through shuffles trees and
10438 /// tries to find the best matching vector for the actual shuffle
10439 /// instruction.
10440 InstructionCost
10441 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10442 const PointerUnion<Value *, const TreeEntry *> &P2,
10443 ArrayRef<int> Mask) {
10444 ShuffleCostBuilder Builder(TTI);
10445 SmallVector<int> CommonMask(Mask);
10446 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10447 unsigned CommonVF = Mask.size();
10448 InstructionCost ExtraCost = 0;
10449 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10450 unsigned VF) -> InstructionCost {
10451 if (E.isGather() && allConstant(E.Scalars))
10452 return TTI::TCC_Free;
10453 Type *EScalarTy = E.Scalars.front()->getType();
10454 bool IsSigned = true;
10455 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10456 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10457 IsSigned = It->second.second;
10458 }
10459 if (EScalarTy != ScalarTy) {
10460 unsigned CastOpcode = Instruction::Trunc;
10461 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10462 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10463 if (DstSz > SrcSz)
10464 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10465 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10466 getWidenedType(EScalarTy, VF),
10467 TTI::CastContextHint::None, CostKind);
10468 }
10469 return TTI::TCC_Free;
10470 };
10471 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10472 if (isa<Constant>(V))
10473 return TTI::TCC_Free;
10474 auto *VecTy = cast<VectorType>(V->getType());
10475 Type *EScalarTy = VecTy->getElementType();
10476 if (EScalarTy != ScalarTy) {
10477 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10478 unsigned CastOpcode = Instruction::Trunc;
10479 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10480 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10481 if (DstSz > SrcSz)
10482 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10483 return TTI.getCastInstrCost(
10484 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10485 VecTy, TTI::CastContextHint::None, CostKind);
10486 }
10487 return TTI::TCC_Free;
10488 };
10489 if (!V1 && !V2 && !P2.isNull()) {
10490 // Shuffle 2 entry nodes.
10491 const TreeEntry *E = cast<const TreeEntry *>(P1);
10492 unsigned VF = E->getVectorFactor();
10493 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10494 CommonVF = std::max(VF, E2->getVectorFactor());
10495 assert(all_of(Mask,
10496 [=](int Idx) {
10497 return Idx < 2 * static_cast<int>(CommonVF);
10498 }) &&
10499 "All elements in mask must be less than 2 * CommonVF.");
10500 if (E->Scalars.size() == E2->Scalars.size()) {
10501 SmallVector<int> EMask = E->getCommonMask();
10502 SmallVector<int> E2Mask = E2->getCommonMask();
10503 if (!EMask.empty() || !E2Mask.empty()) {
10504 for (int &Idx : CommonMask) {
10505 if (Idx == PoisonMaskElem)
10506 continue;
10507 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10508 Idx = EMask[Idx];
10509 else if (Idx >= static_cast<int>(CommonVF))
10510 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10511 E->Scalars.size();
10512 }
10513 }
10514 CommonVF = E->Scalars.size();
10515 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10516 GetNodeMinBWAffectedCost(*E2, CommonVF);
10517 } else {
10518 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10519 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10520 }
10521 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10522 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10523 } else if (!V1 && P2.isNull()) {
10524 // Shuffle single entry node.
10525 const TreeEntry *E = cast<const TreeEntry *>(P1);
10526 unsigned VF = E->getVectorFactor();
10527 CommonVF = VF;
10528 assert(
10529 all_of(Mask,
10530 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10531 "All elements in mask must be less than CommonVF.");
10532 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10533 SmallVector<int> EMask = E->getCommonMask();
10534 assert(!EMask.empty() && "Expected non-empty common mask.");
10535 for (int &Idx : CommonMask) {
10536 if (Idx != PoisonMaskElem)
10537 Idx = EMask[Idx];
10538 }
10539 CommonVF = E->Scalars.size();
10540 } else if (unsigned Factor = E->getInterleaveFactor();
10541 Factor > 0 && E->Scalars.size() != Mask.size() &&
10542 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10543 Factor)) {
10544 // Deinterleaved nodes are free.
10545 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10546 }
10547 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10548 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10549 // Not identity/broadcast? Try to see if the original vector is better.
10550 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10551 CommonVF == CommonMask.size() &&
10552 any_of(enumerate(CommonMask),
10553 [](const auto &&P) {
10554 return P.value() != PoisonMaskElem &&
10555 static_cast<unsigned>(P.value()) != P.index();
10556 }) &&
10557 any_of(CommonMask,
10558 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10559 SmallVector<int> ReorderMask;
10560 inversePermutation(E->ReorderIndices, ReorderMask);
10561 ::addMask(CommonMask, ReorderMask);
10562 }
10563 } else if (V1 && P2.isNull()) {
10564 // Shuffle single vector.
10565 ExtraCost += GetValueMinBWAffectedCost(V1);
10566 CommonVF = getVF(V1);
10567 assert(
10568 all_of(Mask,
10569 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10570 "All elements in mask must be less than CommonVF.");
10571 } else if (V1 && !V2) {
10572 // Shuffle vector and tree node.
10573 unsigned VF = getVF(V1);
10574 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10575 CommonVF = std::max(VF, E2->getVectorFactor());
10576 assert(all_of(Mask,
10577 [=](int Idx) {
10578 return Idx < 2 * static_cast<int>(CommonVF);
10579 }) &&
10580 "All elements in mask must be less than 2 * CommonVF.");
10581 if (E2->Scalars.size() == VF && VF != CommonVF) {
10582 SmallVector<int> E2Mask = E2->getCommonMask();
10583 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10584 for (int &Idx : CommonMask) {
10585 if (Idx == PoisonMaskElem)
10586 continue;
10587 if (Idx >= static_cast<int>(CommonVF))
10588 Idx = E2Mask[Idx - CommonVF] + VF;
10589 }
10590 CommonVF = VF;
10591 }
10592 ExtraCost += GetValueMinBWAffectedCost(V1);
10593 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10594 ExtraCost += GetNodeMinBWAffectedCost(
10595 *E2, std::min(CommonVF, E2->getVectorFactor()));
10596 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10597 } else if (!V1 && V2) {
10598 // Shuffle vector and tree node.
10599 unsigned VF = getVF(V2);
10600 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10601 CommonVF = std::max(VF, E1->getVectorFactor());
10602 assert(all_of(Mask,
10603 [=](int Idx) {
10604 return Idx < 2 * static_cast<int>(CommonVF);
10605 }) &&
10606 "All elements in mask must be less than 2 * CommonVF.");
10607 if (E1->Scalars.size() == VF && VF != CommonVF) {
10608 SmallVector<int> E1Mask = E1->getCommonMask();
10609 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10610 for (int &Idx : CommonMask) {
10611 if (Idx == PoisonMaskElem)
10612 continue;
10613 if (Idx >= static_cast<int>(CommonVF))
10614 Idx = E1Mask[Idx - CommonVF] + VF;
10615 else
10616 Idx = E1Mask[Idx];
10617 }
10618 CommonVF = VF;
10619 }
10620 ExtraCost += GetNodeMinBWAffectedCost(
10621 *E1, std::min(CommonVF, E1->getVectorFactor()));
10622 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10623 ExtraCost += GetValueMinBWAffectedCost(V2);
10624 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10625 } else {
10626 assert(V1 && V2 && "Expected both vectors.");
10627 unsigned VF = getVF(V1);
10628 CommonVF = std::max(VF, getVF(V2));
10629 assert(all_of(Mask,
10630 [=](int Idx) {
10631 return Idx < 2 * static_cast<int>(CommonVF);
10632 }) &&
10633 "All elements in mask must be less than 2 * CommonVF.");
10634 ExtraCost +=
10635 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10636 if (V1->getType() != V2->getType()) {
10637 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10638 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10639 } else {
10640 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10641 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10642 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10643 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10644 }
10645 }
10646 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10647 assert(SLPReVec && "FixedVectorType is not expected.");
10648 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
10649 CommonMask);
10650 }
10651 InVectors.front() =
10652 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10653 if (InVectors.size() == 2)
10654 InVectors.pop_back();
10655 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10656 V1, V2, CommonMask, Builder);
10657 }
10658
10659public:
10660 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10661 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10662 SmallPtrSetImpl<Value *> &CheckedExtracts)
10663 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10664 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10665 CheckedExtracts(CheckedExtracts) {}
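/// Accounts for extractelement instructions in a gather node: models the
/// gather as a shuffle of the vectors the scalars are extracted from,
/// subtracts the cost of extracts that are expected to become dead after
/// vectorization, and returns the common vector base (or a widened
/// placeholder when several bases are combined across parts).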
10666 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10667 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10668 unsigned NumParts, bool &UseVecBaseAsInput) {
10669 UseVecBaseAsInput = false;
10670 if (Mask.empty())
10671 return nullptr;
10672 Value *VecBase = nullptr;
10673 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10674 if (!E->ReorderIndices.empty()) {
10675 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10676 E->ReorderIndices.end());
10677 reorderScalars(VL, ReorderMask);
10678 }
10679 // Check if it can be considered reused if same extractelements were
10680 // vectorized already.
10681 bool PrevNodeFound = any_of(
10682 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10683 [&](const std::unique_ptr<TreeEntry> &TE) {
10684 return ((TE->hasState() && !TE->isAltShuffle() &&
10685 TE->getOpcode() == Instruction::ExtractElement) ||
10686 TE->isGather()) &&
10687 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10688 return VL.size() > Data.index() &&
10689 (Mask[Data.index()] == PoisonMaskElem ||
10690 isa<UndefValue>(VL[Data.index()]) ||
10691 Data.value() == VL[Data.index()]);
10692 });
10693 });
10694 SmallPtrSet<Value *, 4> UniqueBases;
10695 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10696 for (unsigned Part : seq<unsigned>(NumParts)) {
10697 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10698 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10699 for (auto [I, V] :
10700 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10701 // Ignore non-extractelement scalars.
10702 if (isa<UndefValue>(V) ||
10703 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10704 continue;
10705 // If all users of the instruction are going to be vectorized and this
10706 // instruction itself is not going to be vectorized, consider this
10707 // instruction as dead and remove its cost from the final cost of the
10708 // vectorized tree.
10709 // Also, avoid adjusting the cost for extractelements with multiple uses
10710 // in different graph entries.
10711 auto *EE = cast<ExtractElementInst>(V);
10712 VecBase = EE->getVectorOperand();
10713 UniqueBases.insert(VecBase);
10714 const TreeEntry *VE = R.getTreeEntry(V);
10715 if (!CheckedExtracts.insert(V).second ||
10716 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10717 any_of(EE->users(),
10718 [&](User *U) {
10719 return isa<GetElementPtrInst>(U) &&
10720 !R.areAllUsersVectorized(cast<Instruction>(U),
10721 &VectorizedVals);
10722 }) ||
10723 (VE && VE != E))
10724 continue;
10725 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10726 if (!EEIdx)
10727 continue;
10728 unsigned Idx = *EEIdx;
10729 // Take credit for instruction that will become dead.
10730 if (EE->hasOneUse() || !PrevNodeFound) {
10731 Instruction *Ext = EE->user_back();
10732 if (isa<SExtInst, ZExtInst>(Ext) &&
10733 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10734 // Use getExtractWithExtendCost() to calculate the cost of
10735 // extractelement/ext pair.
10736 Cost -=
10737 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10738 EE->getVectorOperandType(), Idx);
10739 // Add back the cost of s|zext which is subtracted separately.
10740 Cost += TTI.getCastInstrCost(
10741 Ext->getOpcode(), Ext->getType(), EE->getType(),
10742 TTI::getCastContextHint(Ext), CostKind, Ext);
10743 continue;
10744 }
10745 }
10746 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10747 CostKind, Idx);
10748 }
10749 }
10750 // Check that the gather of extractelements can be represented as just a
10751 // shuffle of one or two vectors the scalars are extracted from.
10752 // We found a bunch of extractelement instructions that must be gathered
10753 // into a vector and can be represented as a permutation of elements from
10754 // a single input vector or from two input vectors.
10755 // Skipped for reuse if the same extractelements were already vectorized.
10756 if (!PrevNodeFound)
10757 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10758 InVectors.assign(1, E);
10759 CommonMask.assign(Mask.begin(), Mask.end());
10760 transformMaskAfterShuffle(CommonMask, CommonMask);
10761 SameNodesEstimated = false;
10762 if (NumParts != 1 && UniqueBases.size() != 1) {
10763 UseVecBaseAsInput = true;
10764 VecBase =
10765 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10766 }
10767 return VecBase;
10768 }
10769 /// Checks if the specified entry \p E needs to be delayed because of its
10770 /// dependency nodes.
10771 std::optional<InstructionCost>
10772 needToDelay(const TreeEntry *,
10773 ArrayRef<SmallVector<const TreeEntry *>>) const {
10774 // No need to delay the cost estimation during analysis.
10775 return std::nullopt;
10776 }
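/// Adds 2 input tree entries and the mask for their shuffling.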
10777 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10778 if (&E1 == &E2) {
10779 assert(all_of(Mask,
10780 [&](int Idx) {
10781 return Idx < static_cast<int>(E1.getVectorFactor());
10782 }) &&
10783 "Expected single vector shuffle mask.");
10784 add(E1, Mask);
10785 return;
10786 }
10787 if (InVectors.empty()) {
10788 CommonMask.assign(Mask.begin(), Mask.end());
10789 InVectors.assign({&E1, &E2});
10790 return;
10791 }
10792 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10793 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10794 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10795 if (NumParts == 0 || NumParts >= Mask.size() ||
10796 MaskVecTy->getNumElements() % NumParts != 0 ||
10797 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10798 MaskVecTy->getNumElements() / NumParts))
10799 NumParts = 1;
10800 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10801 const auto *It =
10802 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10803 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10804 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10805 }
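/// Adds a single input tree entry and the mask for its shuffling.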
10806 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10807 if (InVectors.empty()) {
10808 CommonMask.assign(Mask.begin(), Mask.end());
10809 InVectors.assign(1, &E1);
10810 return;
10811 }
10812 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10813 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10814 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10815 if (NumParts == 0 || NumParts >= Mask.size() ||
10816 MaskVecTy->getNumElements() % NumParts != 0 ||
10817 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10818 MaskVecTy->getNumElements() / NumParts))
10819 NumParts = 1;
10820 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10821 const auto *It =
10822 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10823 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10824 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10825 if (!SameNodesEstimated && InVectors.size() == 1)
10826 InVectors.emplace_back(&E1);
10827 }
10828 /// Adds 2 input vectors and the mask for their shuffling.
10829 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10830 // May come only for shuffling of 2 vectors with extractelements, already
10831 // handled in adjustExtracts.
10832 assert(InVectors.size() == 1 &&
10833 all_of(enumerate(CommonMask),
10834 [&](auto P) {
10835 if (P.value() == PoisonMaskElem)
10836 return Mask[P.index()] == PoisonMaskElem;
10837 auto *EI = cast<ExtractElementInst>(
10838 cast<const TreeEntry *>(InVectors.front())
10839 ->getOrdered(P.index()));
10840 return EI->getVectorOperand() == V1 ||
10841 EI->getVectorOperand() == V2;
10842 }) &&
10843 "Expected extractelement vectors.");
10844 }
10845 /// Adds another one input vector and the mask for the shuffling.
10846 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10847 if (InVectors.empty()) {
10848 assert(CommonMask.empty() && !ForExtracts &&
10849 "Expected empty input mask/vectors.");
10850 CommonMask.assign(Mask.begin(), Mask.end());
10851 InVectors.assign(1, V1);
10852 return;
10853 }
10854 if (ForExtracts) {
10855 // No need to add vectors here, already handled them in adjustExtracts.
10856 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10857 !CommonMask.empty() &&
10858 all_of(enumerate(CommonMask),
10859 [&](auto P) {
10860 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10861 ->getOrdered(P.index());
10862 if (P.value() == PoisonMaskElem)
10863 return P.value() == Mask[P.index()] ||
10864 isa<UndefValue>(Scalar);
10865 if (isa<Constant>(V1))
10866 return true;
10867 auto *EI = cast<ExtractElementInst>(Scalar);
10868 return EI->getVectorOperand() == V1;
10869 }) &&
10870 "Expected only tree entry for extractelement vectors.");
10871 return;
10872 }
10873 assert(!InVectors.empty() && !CommonMask.empty() &&
10874 "Expected only tree entries from extracts/reused buildvectors.");
10875 unsigned VF = getVF(V1);
10876 if (InVectors.size() == 2) {
10877 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10878 transformMaskAfterShuffle(CommonMask, CommonMask);
10879 VF = std::max<unsigned>(VF, CommonMask.size());
10880 } else if (const auto *InTE =
10881 InVectors.front().dyn_cast<const TreeEntry *>()) {
10882 VF = std::max(VF, InTE->getVectorFactor());
10883 } else {
10884 VF = std::max(
10885 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10886 ->getNumElements());
10887 }
10888 InVectors.push_back(V1);
10889 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10890 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10891 CommonMask[Idx] = Mask[Idx] + VF;
10892 }
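/// Accumulates the cost of building a vector from \p VL and returns a
/// constant placeholder that stands in for the would-be buildvector result.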
10893 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10894 Value *Root = nullptr) {
10895 Cost += getBuildVectorCost(VL, Root);
10896 if (!Root) {
10897 // FIXME: Need to find a way to avoid use of getNullValue here.
10898 SmallVector<Constant *> Vals;
10899 unsigned VF = VL.size();
10900 if (MaskVF != 0)
10901 VF = std::min(VF, MaskVF);
10902 for (Value *V : VL.take_front(VF)) {
10903 if (isa<UndefValue>(V)) {
10904 Vals.push_back(cast<Constant>(V));
10905 continue;
10906 }
10907 Vals.push_back(Constant::getNullValue(V->getType()));
10908 }
10909 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10910 assert(SLPReVec && "FixedVectorType is not expected.");
10911 // When REVEC is enabled, we need to expand vector types into scalar
10912 // types.
10913 unsigned VecTyNumElements = VecTy->getNumElements();
10914 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10915 for (auto [I, V] : enumerate(Vals)) {
10916 Type *ScalarTy = V->getType()->getScalarType();
10917 Constant *NewVal;
10918 if (isa<PoisonValue>(V))
10919 NewVal = PoisonValue::get(ScalarTy);
10920 else if (isa<UndefValue>(V))
10921 NewVal = UndefValue::get(ScalarTy);
10922 else
10923 NewVal = Constant::getNullValue(ScalarTy);
10924 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10925 NewVal);
10926 }
10927 Vals.swap(NewVals);
10928 }
10929 return ConstantVector::get(Vals);
10930 }
10931 return ConstantVector::getSplat(
10932 ElementCount::getFixed(
10933 cast<FixedVectorType>(Root->getType())->getNumElements()),
10934 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10935 }
10936 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
10937 /// Finalize emission of the shuffles.
10938 InstructionCost
10939 finalize(ArrayRef<int> ExtMask,
10940 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10941 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10942 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10943 IsFinalized = true;
10944 if (Action) {
10945 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10946 if (InVectors.size() == 2)
10947 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10948 else
10949 Cost += createShuffle(Vec, nullptr, CommonMask);
10950 transformMaskAfterShuffle(CommonMask, CommonMask);
10951 assert(VF > 0 &&
10952 "Expected vector length for the final value before action.");
10953 Value *V = cast<Value *>(Vec);
10954 Action(V, CommonMask);
10955 InVectors.front() = V;
10956 }
10957 if (!SubVectors.empty()) {
10958 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10959 if (InVectors.size() == 2)
10960 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10961 else
10962 Cost += createShuffle(Vec, nullptr, CommonMask);
10963 transformMaskAfterShuffle(CommonMask, CommonMask);
10964 // Add subvectors permutation cost.
10965 if (!SubVectorsMask.empty()) {
10966 assert(SubVectorsMask.size() <= CommonMask.size() &&
10967 "Expected same size of masks for subvectors and common mask.");
10968 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10969 copy(SubVectorsMask, SVMask.begin());
10970 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10971 if (I2 != PoisonMaskElem) {
10972 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10973 I1 = I2 + CommonMask.size();
10974 }
10975 }
10976 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10977 getWidenedType(ScalarTy, CommonMask.size()),
10978 SVMask, CostKind);
10979 }
10980 for (auto [E, Idx] : SubVectors) {
10981 Type *EScalarTy = E->Scalars.front()->getType();
10982 bool IsSigned = true;
10983 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10984 EScalarTy =
10985 IntegerType::get(EScalarTy->getContext(), It->second.first);
10986 IsSigned = It->second.second;
10987 }
10988 if (ScalarTy != EScalarTy) {
10989 unsigned CastOpcode = Instruction::Trunc;
10990 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10991 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10992 if (DstSz > SrcSz)
10993 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10994 Cost += TTI.getCastInstrCost(
10995 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10996 getWidenedType(EScalarTy, E->getVectorFactor()),
10997 TTI::CastContextHint::None, CostKind);
10998 }
10999 Cost += ::getShuffleCost(
11000 TTI, TTI::SK_InsertSubvector,
11001 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
11002 getWidenedType(ScalarTy, E->getVectorFactor()));
11003 if (!CommonMask.empty()) {
11004 std::iota(std::next(CommonMask.begin(), Idx),
11005 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
11006 Idx);
11007 }
11008 }
11009 }
11010
11011 if (!ExtMask.empty()) {
11012 if (CommonMask.empty()) {
11013 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11014 } else {
11015 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11016 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11017 if (ExtMask[I] == PoisonMaskElem)
11018 continue;
11019 NewMask[I] = CommonMask[ExtMask[I]];
11020 }
11021 CommonMask.swap(NewMask);
11022 }
11023 }
11024 if (CommonMask.empty()) {
11025 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11026 return Cost;
11027 }
11028 return Cost +
11029 createShuffle(InVectors.front(),
11030 InVectors.size() == 2 ? InVectors.back() : nullptr,
11031 CommonMask);
11032 }
11033
11034 ~ShuffleCostEstimator() {
11035 assert((IsFinalized || CommonMask.empty()) &&
11036 "Shuffle construction must be finalized.");
11037 }
11038};
11039
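/// Returns the tree entry that defines operand \p Idx of entry \p E, either a
/// matched vectorized node or the corresponding gather node.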
11040const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11041 unsigned Idx) const {
11042 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
11043 return VE;
11044 const auto *It =
11045 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11046 return TE->isGather() &&
11047 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11048 return EI.EdgeIdx == Idx && EI.UserTE == E;
11049 }) != TE->UserTreeIndices.end();
11050 });
11051 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
11052 return It->get();
11053}
11054
11055TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
11056 if (TE.State == TreeEntry::ScatterVectorize ||
11057 TE.State == TreeEntry::StridedVectorize)
11058 return TTI::CastContextHint::GatherScatter;
11059 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11060 !TE.isAltShuffle()) {
11061 if (TE.ReorderIndices.empty())
11062 return TTI::CastContextHint::Normal;
11063 SmallVector<int> Mask;
11064 inversePermutation(TE.ReorderIndices, Mask);
11065 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11066 return TTI::CastContextHint::Reversed;
11067 }
11068 return TTI::CastContextHint::None;
11069 }
11070
11071/// Builds the arguments types vector for the given call instruction with the
11072/// given \p ID for the specified vector factor.
11073 static SmallVector<Type *>
11074 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
11075 const unsigned VF, unsigned MinBW,
11076 const TargetTransformInfo *TTI) {
11077 SmallVector<Type *> ArgTys;
11078 for (auto [Idx, Arg] : enumerate(CI->args())) {
11079 if (ID != Intrinsic::not_intrinsic) {
11080 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
11081 ArgTys.push_back(Arg->getType());
11082 continue;
11083 }
11084 if (MinBW > 0) {
11085 ArgTys.push_back(
11086 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11087 continue;
11088 }
11089 }
11090 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11091 }
11092 return ArgTys;
11093}
11094
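/// Estimates the cost of vectorizing entry \p E, returned as the vector cost
/// minus the cost of the scalar instructions it replaces; for gather nodes
/// this is the cost of building the vector. Negative values mean the entry is
/// profitable to vectorize.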
11095 InstructionCost
11096 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11097 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11098 ArrayRef<Value *> VL = E->Scalars;
11099
11100 Type *ScalarTy = getValueType(VL[0]);
11101 if (!isValidElementType(ScalarTy))
11102 return InstructionCost::getInvalid();
11103 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11104
11105 // If we have computed a smaller type for the expression, update VecTy so
11106 // that the costs will be accurate.
11107 auto It = MinBWs.find(E);
11108 Type *OrigScalarTy = ScalarTy;
11109 if (It != MinBWs.end()) {
11110 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11111 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11112 if (VecTy)
11113 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11114 }
11115 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11116 unsigned EntryVF = E->getVectorFactor();
11117 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11118
11119 if (E->isGather()) {
11120 if (allConstant(VL))
11121 return 0;
11122 if (isa<InsertElementInst>(VL[0]))
11123 return InstructionCost::getInvalid();
11124 if (isa<CmpInst>(VL.front()))
11125 ScalarTy = VL.front()->getType();
11126 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11127 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11128 }
11129 InstructionCost CommonCost = 0;
11130 SmallVector<int> Mask;
11131 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11132 !isReverseOrder(E->ReorderIndices))) {
11133 SmallVector<int> NewMask;
11134 if (E->getOpcode() == Instruction::Store) {
11135 // For stores the order is actually a mask.
11136 NewMask.resize(E->ReorderIndices.size());
11137 copy(E->ReorderIndices, NewMask.begin());
11138 } else {
11139 inversePermutation(E->ReorderIndices, NewMask);
11140 }
11141 ::addMask(Mask, NewMask);
11142 }
11143 if (!E->ReuseShuffleIndices.empty())
11144 ::addMask(Mask, E->ReuseShuffleIndices);
11145 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11146 CommonCost =
11147 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11148 assert((E->State == TreeEntry::Vectorize ||
11149 E->State == TreeEntry::ScatterVectorize ||
11150 E->State == TreeEntry::StridedVectorize) &&
11151 "Unhandled state");
11152 assert(E->getOpcode() &&
11153 ((allSameType(VL) && allSameBlock(VL)) ||
11154 (E->getOpcode() == Instruction::GetElementPtr &&
11155 E->getMainOp()->getType()->isPointerTy())) &&
11156 "Invalid VL");
11157 Instruction *VL0 = E->getMainOp();
11158 unsigned ShuffleOrOp =
11159 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11160 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11161 ShuffleOrOp = E->CombinedOp;
11162 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11163 const unsigned Sz = UniqueValues.size();
11164 SmallBitVector UsedScalars(Sz, false);
11165 for (unsigned I = 0; I < Sz; ++I) {
11166 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11167 continue;
11168 UsedScalars.set(I);
11169 }
11170 auto GetCastContextHint = [&](Value *V) {
11171 if (const TreeEntry *OpTE = getTreeEntry(V))
11172 return getCastContextHint(*OpTE);
11173 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11174 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11175 !SrcState.isAltShuffle())
11176 return TTI::CastContextHint::GatherScatter;
11177 return TTI::CastContextHint::None;
11178 };
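// GetCostDiff returns (vector cost) - (scalar cost) for this node: the scalar
// cost is summed over the unique scalars owned by the entry, and the common
// reorder/reuse shuffle cost is passed to the vector cost via CommonCost.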
11179 auto GetCostDiff =
11180 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11181 function_ref<InstructionCost(InstructionCost)> VectorCost) {
11182 // Calculate the cost of this instruction.
11183 InstructionCost ScalarCost = 0;
11184 if (isa<CastInst, CallInst>(VL0)) {
11185 // For some of the instructions no need to calculate cost for each
11186 // particular instruction, we can use the cost of the single
11187 // instruction x total number of scalar instructions.
11188 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11189 } else {
11190 for (unsigned I = 0; I < Sz; ++I) {
11191 if (UsedScalars.test(I))
11192 continue;
11193 ScalarCost += ScalarEltCost(I);
11194 }
11195 }
11196
11197 InstructionCost VecCost = VectorCost(CommonCost);
11198 // Check if the current node must be resized, if the parent node is not
11199 // resized.
11200 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11201 E->Idx != 0 &&
11202 (E->getOpcode() != Instruction::Load ||
11203 !E->UserTreeIndices.empty())) {
11204 const EdgeInfo &EI =
11205 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11206 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11207 });
11208 if (EI.UserTE->getOpcode() != Instruction::Select ||
11209 EI.EdgeIdx != 0) {
11210 auto UserBWIt = MinBWs.find(EI.UserTE);
11211 Type *UserScalarTy =
11212 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11213 if (UserBWIt != MinBWs.end())
11214 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11215 UserBWIt->second.first);
11216 if (ScalarTy != UserScalarTy) {
11217 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11218 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11219 unsigned VecOpcode;
11220 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11221 if (BWSz > SrcBWSz)
11222 VecOpcode = Instruction::Trunc;
11223 else
11224 VecOpcode =
11225 It->second.second ? Instruction::SExt : Instruction::ZExt;
11226 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11227 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11228 CostKind);
11229 }
11230 }
11231 }
11232 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11233 ScalarCost, "Calculated costs for Tree"));
11234 return VecCost - ScalarCost;
11235 };
11236 // Calculate cost difference from vectorizing set of GEPs.
11237 // Negative value means vectorizing is profitable.
11238 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11239 assert((E->State == TreeEntry::Vectorize ||
11240 E->State == TreeEntry::StridedVectorize) &&
11241 "Entry state expected to be Vectorize or StridedVectorize here.");
11242 InstructionCost ScalarCost = 0;
11243 InstructionCost VecCost = 0;
11244 std::tie(ScalarCost, VecCost) = getGEPCosts(
11245 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11246 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11247 "Calculated GEPs cost for Tree"));
11248
11249 return VecCost - ScalarCost;
11250 };
11251
11252 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11253 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11254 if (MinMaxID == Intrinsic::not_intrinsic)
11255 return InstructionCost::getInvalid();
11256 Type *CanonicalType = Ty;
11257 if (CanonicalType->isPtrOrPtrVectorTy())
11258 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11259 CanonicalType->getContext(),
11260 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11261
11262 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11263 {CanonicalType, CanonicalType});
11264 InstructionCost IntrinsicCost =
11265 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11266 // If the selects are the only uses of the compares, they will be
11267 // dead and we can adjust the cost by removing their cost.
11268 if (VI && SelectOnly) {
11269 assert((!Ty->isVectorTy() || SLPReVec) &&
11270 "Expected only for scalar type.");
11271 auto *CI = cast<CmpInst>(VI->getOperand(0));
11272 IntrinsicCost -= TTI->getCmpSelInstrCost(
11273 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11274 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11275 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11276 }
11277 return IntrinsicCost;
11278 };
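// Per-opcode cost estimation: most cases below define a GetScalarCost /
// GetVectorCost pair and return their difference through GetCostDiff.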
11279 switch (ShuffleOrOp) {
11280 case Instruction::PHI: {
11281 // Count reused scalars.
11282 InstructionCost ScalarCost = 0;
11283 SmallPtrSet<const TreeEntry *, 4> CountedOps;
11284 for (Value *V : UniqueValues) {
11285 auto *PHI = dyn_cast<PHINode>(V);
11286 if (!PHI)
11287 continue;
11288
11289 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11290 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11291 Value *Op = PHI->getIncomingValue(I);
11292 Operands[I] = Op;
11293 }
11294 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11295 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11296 if (!OpTE->ReuseShuffleIndices.empty())
11297 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11298 OpTE->Scalars.size());
11299 }
11300
11301 return CommonCost - ScalarCost;
11302 }
11303 case Instruction::ExtractValue:
11304 case Instruction::ExtractElement: {
11305 auto GetScalarCost = [&](unsigned Idx) {
11306 if (isa<PoisonValue>(UniqueValues[Idx]))
11307 return InstructionCost(TTI::TCC_Free);
11308
11309 auto *I = cast<Instruction>(UniqueValues[Idx]);
11310 VectorType *SrcVecTy;
11311 if (ShuffleOrOp == Instruction::ExtractElement) {
11312 auto *EE = cast<ExtractElementInst>(I);
11313 SrcVecTy = EE->getVectorOperandType();
11314 } else {
11315 auto *EV = cast<ExtractValueInst>(I);
11316 Type *AggregateTy = EV->getAggregateOperand()->getType();
11317 unsigned NumElts;
11318 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11319 NumElts = ATy->getNumElements();
11320 else
11321 NumElts = AggregateTy->getStructNumElements();
11322 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11323 }
11324 if (I->hasOneUse()) {
11325 Instruction *Ext = I->user_back();
11326 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11327 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11328 // Use getExtractWithExtendCost() to calculate the cost of
11329 // extractelement/ext pair.
11330 InstructionCost Cost = TTI->getExtractWithExtendCost(
11331 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11332 // Subtract the cost of s|zext which is subtracted separately.
11333 Cost -= TTI->getCastInstrCost(
11334 Ext->getOpcode(), Ext->getType(), I->getType(),
11335 TTI::getCastContextHint(Ext), CostKind);
11336 return Cost;
11337 }
11338 }
11339 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11340 CostKind, *getExtractIndex(I));
11341 };
11342 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11343 return GetCostDiff(GetScalarCost, GetVectorCost);
11344 }
11345 case Instruction::InsertElement: {
11346 assert(E->ReuseShuffleIndices.empty() &&
11347 "Unique insertelements only are expected.");
11348 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11349 unsigned const NumElts = SrcVecTy->getNumElements();
11350 unsigned const NumScalars = VL.size();
11351
11352 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11353
11354 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11355 unsigned OffsetBeg = *getElementIndex(VL.front());
11356 unsigned OffsetEnd = OffsetBeg;
11357 InsertMask[OffsetBeg] = 0;
11358 for (auto [I, V] : enumerate(VL.drop_front())) {
11359 unsigned Idx = *getElementIndex(V);
11360 if (OffsetBeg > Idx)
11361 OffsetBeg = Idx;
11362 else if (OffsetEnd < Idx)
11363 OffsetEnd = Idx;
11364 InsertMask[Idx] = I + 1;
11365 }
11366 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11367 if (NumOfParts > 0 && NumOfParts < NumElts)
11368 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11369 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11370 VecScalarsSz;
11371 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11372 unsigned InsertVecSz = std::min<unsigned>(
11373 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11374 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11375 bool IsWholeSubvector =
11376 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11377 // Check if we can safely insert a subvector. If it is not possible, just
11378 // generate a whole-sized vector and shuffle the source vector and the new
11379 // subvector.
11380 if (OffsetBeg + InsertVecSz > VecSz) {
11381 // Align OffsetBeg to generate correct mask.
11382 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11383 InsertVecSz = VecSz;
11384 }
11385
11386 APInt DemandedElts = APInt::getZero(NumElts);
11387 // TODO: Add support for Instruction::InsertValue.
11388 SmallVector<int> Mask;
11389 if (!E->ReorderIndices.empty()) {
11390 inversePermutation(E->ReorderIndices, Mask);
11391 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11392 } else {
11393 Mask.assign(VecSz, PoisonMaskElem);
11394 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11395 }
11396 bool IsIdentity = true;
11397 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11398 Mask.swap(PrevMask);
11399 for (unsigned I = 0; I < NumScalars; ++I) {
11400 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11401 DemandedElts.setBit(InsertIdx);
11402 IsIdentity &= InsertIdx - OffsetBeg == I;
11403 Mask[InsertIdx - OffsetBeg] = I;
11404 }
11405 assert(Offset < NumElts && "Failed to find vector index offset");
11406
11408 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11409 /*Insert*/ true, /*Extract*/ false,
11410 CostKind);
11411
11412 // First cost - resize to actual vector size if not identity shuffle or
11413 // need to shift the vector.
11414 // Do not calculate the cost if the actual size is the register size and
11415 // we can merge this shuffle with the following SK_Select.
11416 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11417 if (!IsIdentity)
11418 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
11419 InsertVecTy, Mask);
11420 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11421 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11422 }));
11423 // Second cost - permutation with subvector, if some elements are from the
11424 // initial vector or inserting a subvector.
11425 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11426 // subvector of ActualVecTy.
11427 SmallBitVector InMask =
11428 isUndefVector(FirstInsert->getOperand(0),
11429 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11430 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11431 if (InsertVecSz != VecSz) {
11432 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11433 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11434 CostKind, OffsetBeg - Offset, InsertVecTy);
11435 } else {
11436 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11437 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11438 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11439 I <= End; ++I)
11440 if (Mask[I] != PoisonMaskElem)
11441 Mask[I] = I + VecSz;
11442 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11443 Mask[I] =
11444 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11445 Cost +=
11446 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11447 }
11448 }
11449 return Cost;
11450 }
11451 case Instruction::ZExt:
11452 case Instruction::SExt:
11453 case Instruction::FPToUI:
11454 case Instruction::FPToSI:
11455 case Instruction::FPExt:
11456 case Instruction::PtrToInt:
11457 case Instruction::IntToPtr:
11458 case Instruction::SIToFP:
11459 case Instruction::UIToFP:
11460 case Instruction::Trunc:
11461 case Instruction::FPTrunc:
11462 case Instruction::BitCast: {
11463 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11464 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11465 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11466 unsigned Opcode = ShuffleOrOp;
11467 unsigned VecOpcode = Opcode;
11468 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11469 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11470 // Check if the values are candidates to demote.
11471 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11472 if (SrcIt != MinBWs.end()) {
11473 SrcBWSz = SrcIt->second.first;
11474 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11475 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11476 SrcVecTy =
11477 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11478 }
11479 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11480 if (BWSz == SrcBWSz) {
11481 VecOpcode = Instruction::BitCast;
11482 } else if (BWSz < SrcBWSz) {
11483 VecOpcode = Instruction::Trunc;
11484 } else if (It != MinBWs.end()) {
11485 assert(BWSz > SrcBWSz && "Invalid cast!");
11486 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11487 } else if (SrcIt != MinBWs.end()) {
11488 assert(BWSz > SrcBWSz && "Invalid cast!");
11489 VecOpcode =
11490 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11491 }
11492 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11493 !SrcIt->second.second) {
11494 VecOpcode = Instruction::UIToFP;
11495 }
11496 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11497 assert(Idx == 0 && "Expected 0 index only");
11498 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11499 VL0->getOperand(0)->getType(),
11500 TTI::getCastContextHint(VL0), CostKind);
11501 };
11502 auto GetVectorCost = [=](InstructionCost CommonCost) {
11503 // Do not count cost here if minimum bitwidth is in effect and it is just
11504 // a bitcast (here it is just a noop).
11505 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11506 return CommonCost;
11507 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11508 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11509
11510 bool IsArithmeticExtendedReduction =
11511 E->Idx == 0 && UserIgnoreList &&
11512 all_of(*UserIgnoreList, [](Value *V) {
11513 auto *I = cast<Instruction>(V);
11514 return is_contained({Instruction::Add, Instruction::FAdd,
11515 Instruction::Mul, Instruction::FMul,
11516 Instruction::And, Instruction::Or,
11517 Instruction::Xor},
11518 I->getOpcode());
11519 });
11520 if (IsArithmeticExtendedReduction &&
11521 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11522 return CommonCost;
11523 return CommonCost +
11524 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11525 VecOpcode == Opcode ? VI : nullptr);
11526 };
11527 return GetCostDiff(GetScalarCost, GetVectorCost);
11528 }
11529 case Instruction::FCmp:
11530 case Instruction::ICmp:
11531 case Instruction::Select: {
11532 CmpPredicate VecPred, SwappedVecPred;
11533 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11534 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11535 match(VL0, MatchCmp))
11536 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11537 else
11538 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11539 ? CmpInst::BAD_FCMP_PREDICATE
11540 : CmpInst::BAD_ICMP_PREDICATE;
11541 auto GetScalarCost = [&](unsigned Idx) {
11542 if (isa<PoisonValue>(UniqueValues[Idx]))
11543 return InstructionCost(TTI::TCC_Free);
11544
11545 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11546 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11547 ? CmpInst::BAD_FCMP_PREDICATE
11548 : CmpInst::BAD_ICMP_PREDICATE;
11549 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11550 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11551 !match(VI, MatchCmp)) ||
11552 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11553 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11554 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11555 ? CmpInst::BAD_FCMP_PREDICATE
11556 : CmpInst::BAD_ICMP_PREDICATE;
11557
11558 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
11559 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11560 CostKind, getOperandInfo(VI->getOperand(0)),
11561 getOperandInfo(VI->getOperand(1)), VI);
11562 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11563 if (IntrinsicCost.isValid())
11564 ScalarCost = IntrinsicCost;
11565
11566 return ScalarCost;
11567 };
11568 auto GetVectorCost = [&](InstructionCost CommonCost) {
11569 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11570
11571 InstructionCost VecCost =
11572 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11573 CostKind, getOperandInfo(E->getOperand(0)),
11574 getOperandInfo(E->getOperand(1)), VL0);
11575 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11576 auto *CondType =
11577 getWidenedType(SI->getCondition()->getType(), VL.size());
11578 unsigned CondNumElements = CondType->getNumElements();
11579 unsigned VecTyNumElements = getNumElements(VecTy);
11580 assert(VecTyNumElements >= CondNumElements &&
11581 VecTyNumElements % CondNumElements == 0 &&
11582 "Cannot vectorize Instruction::Select");
11583 if (CondNumElements != VecTyNumElements) {
11584 // When the return type is i1 but the source is fixed vector type, we
11585 // need to duplicate the condition value.
11586 VecCost += ::getShuffleCost(
11587 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11588 createReplicatedMask(VecTyNumElements / CondNumElements,
11589 CondNumElements));
11590 }
11591 }
11592 return VecCost + CommonCost;
11593 };
11594 return GetCostDiff(GetScalarCost, GetVectorCost);
11595 }
11596 case TreeEntry::MinMax: {
11597 auto GetScalarCost = [&](unsigned Idx) {
11598 return GetMinMaxCost(OrigScalarTy);
11599 };
11600 auto GetVectorCost = [&](InstructionCost CommonCost) {
11601 InstructionCost VecCost = GetMinMaxCost(VecTy);
11602 return VecCost + CommonCost;
11603 };
11604 return GetCostDiff(GetScalarCost, GetVectorCost);
11605 }
11606 case Instruction::FNeg:
11607 case Instruction::Add:
11608 case Instruction::FAdd:
11609 case Instruction::Sub:
11610 case Instruction::FSub:
11611 case Instruction::Mul:
11612 case Instruction::FMul:
11613 case Instruction::UDiv:
11614 case Instruction::SDiv:
11615 case Instruction::FDiv:
11616 case Instruction::URem:
11617 case Instruction::SRem:
11618 case Instruction::FRem:
11619 case Instruction::Shl:
11620 case Instruction::LShr:
11621 case Instruction::AShr:
11622 case Instruction::And:
11623 case Instruction::Or:
11624 case Instruction::Xor: {
11625 auto GetScalarCost = [&](unsigned Idx) {
11626 if (isa<PoisonValue>(UniqueValues[Idx]))
11627 return InstructionCost(TTI::TCC_Free);
11628
11629 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11630 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11631 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11632 TTI::OperandValueInfo Op2Info =
11633 TTI::getOperandInfo(VI->getOperand(OpIdx));
11634 SmallVector<const Value *> Operands(VI->operand_values());
11635 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11636 Op1Info, Op2Info, Operands, VI);
11637 };
11638 auto GetVectorCost = [=](InstructionCost CommonCost) {
11639 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11640 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11641 ArrayRef<Value *> Ops = E->getOperand(I);
11642 if (all_of(Ops, [&](Value *Op) {
11643 auto *CI = dyn_cast<ConstantInt>(Op);
11644 return CI && CI->getValue().countr_one() >= It->second.first;
11645 }))
11646 return CommonCost;
11647 }
11648 }
11649 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11650 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11651 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11652 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11653 Op2Info, {}, nullptr, TLI) +
11654 CommonCost;
11655 };
11656 return GetCostDiff(GetScalarCost, GetVectorCost);
11657 }
11658 case Instruction::GetElementPtr: {
11659 return CommonCost + GetGEPCostDiff(VL, VL0);
11660 }
11661 case Instruction::Load: {
11662 auto GetScalarCost = [&](unsigned Idx) {
11663 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11664 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11665 VI->getAlign(), VI->getPointerAddressSpace(),
11666 CostKind, TTI::OperandValueInfo(), VI);
11667 };
11668 auto *LI0 = cast<LoadInst>(VL0);
11669 auto GetVectorCost = [&](InstructionCost CommonCost) {
11670 InstructionCost VecLdCost;
11671 switch (E->State) {
11672 case TreeEntry::Vectorize:
11673 if (unsigned Factor = E->getInterleaveFactor()) {
11674 VecLdCost = TTI->getInterleavedMemoryOpCost(
11675 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11676 LI0->getPointerAddressSpace(), CostKind);
11677
11678 } else {
11679 VecLdCost = TTI->getMemoryOpCost(
11680 Instruction::Load, VecTy, LI0->getAlign(),
11681 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11682 }
11683 break;
11684 case TreeEntry::StridedVectorize: {
11685 Align CommonAlignment =
11686 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11687 VecLdCost = TTI->getStridedMemoryOpCost(
11688 Instruction::Load, VecTy, LI0->getPointerOperand(),
11689 /*VariableMask=*/false, CommonAlignment, CostKind);
11690 break;
11691 }
11692 case TreeEntry::ScatterVectorize: {
11693 Align CommonAlignment =
11694 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11695 VecLdCost = TTI->getGatherScatterOpCost(
11696 Instruction::Load, VecTy, LI0->getPointerOperand(),
11697 /*VariableMask=*/false, CommonAlignment, CostKind);
11698 break;
11699 }
11700 case TreeEntry::CombinedVectorize:
11701 case TreeEntry::NeedToGather:
11702 llvm_unreachable("Unexpected vectorization state.");
11703 }
11704 return VecLdCost + CommonCost;
11705 };
11706
11707 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11708 // If this node generates a masked gather load, then it is not a terminal
11709 // node, hence the address operand cost is estimated separately.
11710 if (E->State == TreeEntry::ScatterVectorize)
11711 return Cost;
11712
11713 // Estimate cost of GEPs since this tree node is a terminator.
11714 SmallVector<Value *> PointerOps(VL.size());
11715 for (auto [I, V] : enumerate(VL))
11716 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11717 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11718 }
11719 case Instruction::Store: {
11720 bool IsReorder = !E->ReorderIndices.empty();
11721 auto GetScalarCost = [=](unsigned Idx) {
11722 auto *VI = cast<StoreInst>(VL[Idx]);
11723 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11724 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11725 VI->getAlign(), VI->getPointerAddressSpace(),
11726 CostKind, OpInfo, VI);
11727 };
11728 auto *BaseSI =
11729 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11730 auto GetVectorCost = [=](InstructionCost CommonCost) {
11731 // We know that we can merge the stores. Calculate the cost.
11732 InstructionCost VecStCost;
11733 if (E->State == TreeEntry::StridedVectorize) {
11734 Align CommonAlignment =
11735 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11736 VecStCost = TTI->getStridedMemoryOpCost(
11737 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11738 /*VariableMask=*/false, CommonAlignment, CostKind);
11739 } else {
11740 assert(E->State == TreeEntry::Vectorize &&
11741 "Expected either strided or consecutive stores.");
11742 if (unsigned Factor = E->getInterleaveFactor()) {
11743 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11744 "No reused shuffles expected");
11745 CommonCost = 0;
11746 VecStCost = TTI->getInterleavedMemoryOpCost(
11747 Instruction::Store, VecTy, Factor, std::nullopt,
11748 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11749 } else {
11750 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11751 VecStCost = TTI->getMemoryOpCost(
11752 Instruction::Store, VecTy, BaseSI->getAlign(),
11753 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11754 }
11755 }
11756 return VecStCost + CommonCost;
11757 };
11758 SmallVector<Value *> PointerOps(VL.size());
11759 for (auto [I, V] : enumerate(VL)) {
11760 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11761 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11762 }
11763
11764 return GetCostDiff(GetScalarCost, GetVectorCost) +
11765 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11766 }
11767 case Instruction::Call: {
11768 auto GetScalarCost = [&](unsigned Idx) {
11769 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11770 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11771 if (ID != Intrinsic::not_intrinsic) {
11772 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11773 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11774 }
11775 return TTI->getCallInstrCost(CI->getCalledFunction(),
11776 CI->getFunctionType()->getReturnType(),
11777 CI->getFunctionType()->params(), CostKind);
11778 };
11779 auto GetVectorCost = [=](InstructionCost CommonCost) {
11780 auto *CI = cast<CallInst>(VL0);
11781 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11782 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
11783 CI, ID, VecTy->getNumElements(),
11784 It != MinBWs.end() ? It->second.first : 0, TTI);
11785 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11786 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11787 };
11788 return GetCostDiff(GetScalarCost, GetVectorCost);
11789 }
11790 case Instruction::ShuffleVector: {
11791 if (!SLPReVec || E->isAltShuffle())
11792 assert(E->isAltShuffle() &&
11793 ((Instruction::isBinaryOp(E->getOpcode()) &&
11794 Instruction::isBinaryOp(E->getAltOpcode())) ||
11795 (Instruction::isCast(E->getOpcode()) &&
11796 Instruction::isCast(E->getAltOpcode())) ||
11797 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11798 "Invalid Shuffle Vector Operand");
11799 // Try to find the previous shuffle node with the same operands and same
11800 // main/alternate ops.
11801 auto TryFindNodeWithEqualOperands = [=]() {
11802 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11803 if (TE.get() == E)
11804 break;
11805 if (TE->hasState() && TE->isAltShuffle() &&
11806 ((TE->getOpcode() == E->getOpcode() &&
11807 TE->getAltOpcode() == E->getAltOpcode()) ||
11808 (TE->getOpcode() == E->getAltOpcode() &&
11809 TE->getAltOpcode() == E->getOpcode())) &&
11810 TE->hasEqualOperands(*E))
11811 return true;
11812 }
11813 return false;
11814 };
11815 auto GetScalarCost = [&](unsigned Idx) {
11816 if (isa<PoisonValue>(UniqueValues[Idx]))
11817 return InstructionCost(TTI::TCC_Free);
11818
11819 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11820 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11821 (void)E;
11822 return TTI->getInstructionCost(VI, CostKind);
11823 };
11824 // Need to clear CommonCost since the final shuffle cost is included into
11825 // vector cost.
11826 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11827 // VecCost is equal to sum of the cost of creating 2 vectors
11828 // and the cost of creating shuffle.
11829 InstructionCost VecCost = 0;
11830 if (TryFindNodeWithEqualOperands()) {
11831 LLVM_DEBUG({
11832 dbgs() << "SLP: diamond match for alternate node found.\n";
11833 E->dump();
11834 });
11835 // No need to add new vector costs here since we're going to reuse
11836 // same main/alternate vector ops, just do different shuffling.
11837 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11838 VecCost =
11839 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11840 VecCost +=
11841 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11842 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11843 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11844 VecCost = TTIRef.getCmpSelInstrCost(
11845 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11846 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11847 VL0);
11848 VecCost += TTIRef.getCmpSelInstrCost(
11849 E->getOpcode(), VecTy, MaskTy,
11850 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11851 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11852 E->getAltOp());
11853 } else {
11854 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11855 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11856 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11857 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11858 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11859 unsigned SrcBWSz =
11860 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11861 if (SrcIt != MinBWs.end()) {
11862 SrcBWSz = SrcIt->second.first;
11863 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11864 SrcTy = getWidenedType(SrcSclTy, VL.size());
11865 }
11866 if (BWSz <= SrcBWSz) {
11867 if (BWSz < SrcBWSz)
11868 VecCost =
11869 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11870 TTI::CastContextHint::None, CostKind);
11871 LLVM_DEBUG({
11872 dbgs()
11873 << "SLP: alternate extension, which should be truncated.\n";
11874 E->dump();
11875 });
11876 return VecCost;
11877 }
11878 }
11879 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11880 TTI::CastContextHint::None, CostKind);
11881 VecCost +=
11882 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11883 TTI::CastContextHint::None, CostKind);
11884 }
11885 SmallVector<int> Mask;
11886 E->buildAltOpShuffleMask(
11887 [&](Instruction *I) {
11888 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11889 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11890 *TLI);
11891 },
11892 Mask);
11893 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
11894 FinalVecTy, Mask, CostKind);
11895 // Patterns like [fadd,fsub] can be combined into a single instruction
11896 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11897 // need to take into account their order when looking for the most used
11898 // order.
11899 unsigned Opcode0 = E->getOpcode();
11900 unsigned Opcode1 = E->getAltOpcode();
11901 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11902 // If this pattern is supported by the target then we consider the
11903 // order.
11904 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11905 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11906 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11907 return AltVecCost < VecCost ? AltVecCost : VecCost;
11908 }
11909 // TODO: Check the reverse order too.
11910 return VecCost;
11911 };
11912 if (SLPReVec && !E->isAltShuffle())
11913 return GetCostDiff(
11914 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11915 // If a group uses mask in order, the shufflevector can be
11916 // eliminated by instcombine. Then the cost is 0.
11917 assert(isa<ShuffleVectorInst>(VL.front()) &&
11918 "Not supported shufflevector usage.");
11919 auto *SV = cast<ShuffleVectorInst>(VL.front());
11920 unsigned SVNumElements =
11921 cast<FixedVectorType>(SV->getOperand(0)->getType())
11922 ->getNumElements();
11923 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11924 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11925 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11926 int NextIndex = 0;
11927 if (!all_of(Group, [&](Value *V) {
11928 assert(isa<ShuffleVectorInst>(V) &&
11929 "Not supported shufflevector usage.");
11930 auto *SV = cast<ShuffleVectorInst>(V);
11931 int Index;
11932 [[maybe_unused]] bool IsExtractSubvectorMask =
11933 SV->isExtractSubvectorMask(Index);
11934 assert(IsExtractSubvectorMask &&
11935 "Not supported shufflevector usage.");
11936 if (NextIndex != Index)
11937 return false;
11938 NextIndex += SV->getShuffleMask().size();
11939 return true;
11940 }))
11941 return ::getShuffleCost(
11942 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
11943 calculateShufflevectorMask(E->Scalars));
11944 }
11945 return TTI::TCC_Free;
11946 });
11947 return GetCostDiff(GetScalarCost, GetVectorCost);
11948 }
11949 case Instruction::Freeze:
11950 return CommonCost;
11951 default:
11952 llvm_unreachable("Unknown instruction");
11953 }
11954}
11955
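/// Returns true for trees of height 1 or 2 that are known to be fully
/// vectorizable without a full cost model, e.g. a vectorizable root fed by a
/// gather of constants, a splat, or shuffle-forming extractelements.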
11956bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11957 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11958 << VectorizableTree.size() << " is fully vectorizable.\n");
11959
11960 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11961 SmallVector<int> Mask;
11962 return TE->isGather() &&
11963 !any_of(TE->Scalars,
11964 [this](Value *V) { return EphValues.contains(V); }) &&
11965 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11966 TE->Scalars.size() < Limit ||
11967 (((TE->hasState() &&
11968 TE->getOpcode() == Instruction::ExtractElement) ||
11969 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11970 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11971 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
11972 !TE->isAltShuffle()) ||
11973 any_of(TE->Scalars, IsaPred<LoadInst>));
11974 };
11975
11976 // We only handle trees of heights 1 and 2.
11977 if (VectorizableTree.size() == 1 &&
11978 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11979 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11980 (ForReduction &&
11981 AreVectorizableGathers(VectorizableTree[0].get(),
11982 VectorizableTree[0]->Scalars.size()) &&
11983 VectorizableTree[0]->getVectorFactor() > 2)))
11984 return true;
11985
11986 if (VectorizableTree.size() != 2)
11987 return false;
11988
11989 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11990 // with a second gather node if it has fewer scalar operands than the
11991 // initial tree element (it may be profitable to shuffle the second gather)
11992 // or they are extractelements, which form shuffle.
11993 SmallVector<int> Mask;
11994 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11995 AreVectorizableGathers(VectorizableTree[1].get(),
11996 VectorizableTree[0]->Scalars.size()))
11997 return true;
11998
11999 // Gathering cost would be too much for tiny trees.
12000 if (VectorizableTree[0]->isGather() ||
12001 (VectorizableTree[1]->isGather() &&
12002 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
12003 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
12004 return false;
12005
12006 return true;
12007}
12008
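// Load-combine detection: an 'or' tree of zero-extended (and optionally
// shifted-by-multiples-of-8) loads is typically matched by the backend into a
// single wide load, so these checks are used to avoid vectorizing patterns
// the backend can already combine.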
12009static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
12010 TargetTransformInfo *TTI,
12011 bool MustMatchOrInst) {
12012 // Look past the root to find a source value. Arbitrarily follow the
12013 // path through operand 0 of any 'or'. Also, peek through optional
12014 // shift-left-by-multiple-of-8-bits.
12015 Value *ZextLoad = Root;
12016 const APInt *ShAmtC;
12017 bool FoundOr = false;
12018 while (!isa<ConstantExpr>(ZextLoad) &&
12019 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
12020 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
12021 ShAmtC->urem(8) == 0))) {
12022 auto *BinOp = cast<BinaryOperator>(ZextLoad);
12023 ZextLoad = BinOp->getOperand(0);
12024 if (BinOp->getOpcode() == Instruction::Or)
12025 FoundOr = true;
12026 }
12027 // Check if the input is an extended load of the required or/shift expression.
12028 Value *Load;
12029 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12030 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12031 return false;
12032
12033 // Require that the total load bit width is a legal integer type.
12034 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12035 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12036 Type *SrcTy = Load->getType();
12037 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12038 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12039 return false;
12040
12041 // Everything matched - assume that we can fold the whole sequence using
12042 // load combining.
12043 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
12044 << *(cast<Instruction>(Root)) << "\n");
12045
12046 return true;
12047}
12048
12049 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
12050 if (RdxKind != RecurKind::Or)
12051 return false;
12052
12053 unsigned NumElts = VectorizableTree[0]->Scalars.size();
12054 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12055 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
12056 /* MatchOr */ false);
12057}
12058
12060 // Peek through a final sequence of stores and check if all operations are
12061 // likely to be load-combined.
12062 unsigned NumElts = Stores.size();
12063 for (Value *Scalar : Stores) {
12064 Value *X;
12065 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
12066 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
12067 return false;
12068 }
12069 return true;
12070}
12071
12072bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
12073 if (!DebugCounter::shouldExecute(VectorizedGraphs))
12074 return true;
12075
12076 // Graph is empty - do nothing.
12077 if (VectorizableTree.empty()) {
12078 assert(ExternalUses.empty() && "We shouldn't have any external users");
12079
12080 return true;
12081 }
12082
12083 // No need to vectorize inserts of gathered values.
12084 if (VectorizableTree.size() == 2 &&
12085 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12086 VectorizableTree[1]->isGather() &&
12087 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12088 !(isSplat(VectorizableTree[1]->Scalars) ||
12089 allConstant(VectorizableTree[1]->Scalars))))
12090 return true;
12091
12092   // If the graph includes only PHI nodes and gathers, it is definitely not
12093   // profitable to vectorize, so we can skip it as long as the cost threshold is
12094   // left at its default. The cost of vectorized PHI nodes is almost always 0 plus
12095   // the cost of gathers/buildvectors.
12096 constexpr int Limit = 4;
12097 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12098 !VectorizableTree.empty() &&
12099 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12100 return (TE->isGather() &&
12101 (!TE->hasState() ||
12102 TE->getOpcode() != Instruction::ExtractElement) &&
12103 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12104 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
12105 }))
12106 return true;
12107
12108 // We can vectorize the tree if its size is greater than or equal to the
12109 // minimum size specified by the MinTreeSize command line option.
12110 if (VectorizableTree.size() >= MinTreeSize)
12111 return false;
12112
12113 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12114 // can vectorize it if we can prove it fully vectorizable.
12115 if (isFullyVectorizableTinyTree(ForReduction))
12116 return false;
12117
12118   // Check if any of the gather nodes forms an insertelement buildvector
12119   // somewhere.
12120 bool IsAllowedSingleBVNode =
12121 VectorizableTree.size() > 1 ||
12122 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12123 !VectorizableTree.front()->isAltShuffle() &&
12124 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12125 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12126 allSameBlock(VectorizableTree.front()->Scalars));
12127 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12128 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12129 return isa<ExtractElementInst, UndefValue>(V) ||
12130 (IsAllowedSingleBVNode &&
12131 !V->hasNUsesOrMore(UsesLimit) &&
12132 any_of(V->users(), IsaPred<InsertElementInst>));
12133 });
12134 }))
12135 return false;
12136
12137 if (VectorizableTree.back()->isGather() &&
12138 VectorizableTree.back()->hasState() &&
12139 VectorizableTree.back()->isAltShuffle() &&
12140 VectorizableTree.back()->getVectorFactor() > 2 &&
12141 allSameBlock(VectorizableTree.back()->Scalars) &&
12142 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12144 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12145 VectorizableTree.back()->getVectorFactor()),
12146 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12147 /*Insert=*/true, /*Extract=*/false,
12149 return false;
12150
12151 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12152 // vectorizable.
12153 return true;
12154}
12155
12158 constexpr unsigned SmallTree = 3;
12159 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12160 getCanonicalGraphSize() <= SmallTree &&
12161 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12162 [](const std::unique_ptr<TreeEntry> &TE) {
12163 return TE->isGather() && TE->hasState() &&
12164 TE->getOpcode() == Instruction::Load &&
12165 !allSameBlock(TE->Scalars);
12166 }) == 1)
12167 return true;
12168 return false;
12169 }
12170 bool Res = false;
12171 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12172 TreeEntry &E = *VectorizableTree[Idx];
12173 if (!E.isGather())
12174 continue;
12175 if (E.hasState() && E.getOpcode() != Instruction::Load)
12176 return false;
12177 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12178 continue;
12179 Res = true;
12180 }
12181 return Res;
12182}
12183
12185 // Walk from the bottom of the tree to the top, tracking which values are
12186 // live. When we see a call instruction that is not part of our tree,
12187 // query TTI to see if there is a cost to keeping values live over it
12188 // (for example, if spills and fills are required).
12189 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12191
12193 Instruction *PrevInst = nullptr;
12194
12195 // The entries in VectorizableTree are not necessarily ordered by their
12196 // position in basic blocks. Collect them and order them by dominance so later
12197 // instructions are guaranteed to be visited first. For instructions in
12198 // different basic blocks, we only scan to the beginning of the block, so
12199 // their order does not matter, as long as all instructions in a basic block
12200 // are grouped together. Using dominance ensures a deterministic order.
12201 SmallVector<Instruction *, 16> OrderedScalars;
12202 for (const auto &TEPtr : VectorizableTree) {
12203 if (TEPtr->State != TreeEntry::Vectorize)
12204 continue;
12205 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12206 if (!Inst)
12207 continue;
12208 OrderedScalars.push_back(Inst);
12209 }
12210 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12211 auto *NodeA = DT->getNode(A->getParent());
12212 auto *NodeB = DT->getNode(B->getParent());
12213 assert(NodeA && "Should only process reachable instructions");
12214 assert(NodeB && "Should only process reachable instructions");
12215 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12216 "Different nodes should have different DFS numbers");
12217 if (NodeA != NodeB)
12218 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12219 return B->comesBefore(A);
12220 });
12221
12222 for (Instruction *Inst : OrderedScalars) {
12223 if (!PrevInst) {
12224 PrevInst = Inst;
12225 continue;
12226 }
12227
12228 // Update LiveValues.
12229 LiveValues.erase(PrevInst);
12230 for (auto &J : PrevInst->operands()) {
12231 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12232 LiveValues.insert(cast<Instruction>(&*J));
12233 }
12234
12235 LLVM_DEBUG({
12236 dbgs() << "SLP: #LV: " << LiveValues.size();
12237 for (auto *X : LiveValues)
12238 dbgs() << " " << X->getName();
12239 dbgs() << ", Looking at ";
12240 Inst->dump();
12241 });
12242
12243 // Now find the sequence of instructions between PrevInst and Inst.
12244 unsigned NumCalls = 0;
12245 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12246 PrevInstIt =
12247 PrevInst->getIterator().getReverse();
12248 while (InstIt != PrevInstIt) {
12249 if (PrevInstIt == PrevInst->getParent()->rend()) {
12250 PrevInstIt = Inst->getParent()->rbegin();
12251 continue;
12252 }
12253
12254 auto NoCallIntrinsic = [this](Instruction *I) {
12255 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12256 if (II->isAssumeLikeIntrinsic())
12257 return true;
12258 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12259 InstructionCost IntrCost =
12261 InstructionCost CallCost =
12262 TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
12264 if (IntrCost < CallCost)
12265 return true;
12266 }
12267 return false;
12268 };
12269
12270 // Debug information does not impact spill cost.
12271 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12272 &*PrevInstIt != PrevInst)
12273 NumCalls++;
12274
12275 ++PrevInstIt;
12276 }
12277
12278 if (NumCalls) {
12280 for (auto *II : LiveValues) {
12281 auto *ScalarTy = II->getType();
12282 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12283 ScalarTy = VectorTy->getElementType();
12284 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12285 }
12286 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12287 }
12288
12289 PrevInst = Inst;
12290 }
12291
12292 return Cost;
12293}
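// Illustrative sketch, not part of the original source: the situation
// getSpillCost() is pricing. In the hypothetical snippet below the two
// vectorizable statements are separated by an opaque call, so the <2 x double>
// temporary produced by the first statement stays live across the call and, on
// targets with caller-saved vector registers, has to be spilled and reloaded.
// The function and parameter names are made up for illustration only.
//
//   void example(double *X, double *Y, double *Z, void (*Escape)(void)) {
//     double A0 = X[0] + Y[0], A1 = X[1] + Y[1]; // vectorizable adds
//     Escape();                                  // keeps the adds' results live
//     Z[0] = A0 * 3.0;                           // vectorizable muls
//     Z[1] = A1 * 3.0;
//   }
//
// The walk above counts such calls in NumCalls and charges
// NumCalls * TTI->getCostOfKeepingLiveOverCall(V) for the live vector types.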
12294
12295 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the
12296 /// buildvector sequence.
12298 const InsertElementInst *IE2) {
12299 if (IE1 == IE2)
12300 return false;
12301 const auto *I1 = IE1;
12302 const auto *I2 = IE2;
12303 const InsertElementInst *PrevI1;
12304 const InsertElementInst *PrevI2;
12305 unsigned Idx1 = *getElementIndex(IE1);
12306 unsigned Idx2 = *getElementIndex(IE2);
12307 do {
12308 if (I2 == IE1)
12309 return true;
12310 if (I1 == IE2)
12311 return false;
12312 PrevI1 = I1;
12313 PrevI2 = I2;
12314 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12315 getElementIndex(I1).value_or(Idx2) != Idx2)
12316 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12317 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12318 getElementIndex(I2).value_or(Idx1) != Idx1)
12319 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12320 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12321 llvm_unreachable("Two different buildvectors not expected.");
12322}
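// Illustrative sketch, not part of the original source: for a buildvector
// chain such as
//
//   %v0 = insertelement <2 x float> poison, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0,    float %b, i32 1
//
// isFirstInsertElement(%v0, %v1) walks both arguments backwards through
// operand 0; starting from %v1 it reaches %v0 (the first argument) and
// therefore returns true, i.e. %v0 appears earlier in the buildvector
// sequence and is followed by %v1.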
12323
12324namespace {
12325 /// Returns the incoming Value * if the requested type is Value * too, or a
12326 /// default-constructed value otherwise.
12327struct ValueSelect {
12328 template <typename U>
12329 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12330 return V;
12331 }
12332 template <typename U>
12333 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12334 return U();
12335 }
12336};
12337} // namespace
12338
12339 /// Analyzes the provided shuffle masks and performs the requested actions on
12340 /// the vectors with the given shuffle masks. It tries to do this in
12341 /// several steps.
12342 /// 1. If the Base vector is not an undef vector, resize the very first mask to
12343 /// a common VF and perform the action for 2 input vectors (including the
12344 /// non-undef Base). Other shuffle masks are combined with the result of this
12345 /// first stage and processed as a shuffle of 2 elements.
12346 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12347 /// the action only for 1 vector with the given mask, if it is not the identity
12348 /// mask.
12349 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12350 /// vectors, combining the masks properly between the steps.
12351template <typename T>
12353 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12354 function_ref<unsigned(T *)> GetVF,
12355 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12357 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12358 SmallVector<int> Mask(ShuffleMask.begin()->second);
12359 auto VMIt = std::next(ShuffleMask.begin());
12360 T *Prev = nullptr;
12361 SmallBitVector UseMask =
12362 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12363 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12364 if (!IsBaseUndef.all()) {
12365 // Base is not undef, need to combine it with the next subvectors.
12366 std::pair<T *, bool> Res =
12367 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12368 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12369 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12370 if (Mask[Idx] == PoisonMaskElem)
12371 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12372 else
12373 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12374 }
12375 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
12376 assert((!V || GetVF(V) == Mask.size()) &&
12377 "Expected base vector of VF number of elements.");
12378 Prev = Action(Mask, {nullptr, Res.first});
12379 } else if (ShuffleMask.size() == 1) {
12380 // Base is undef and only 1 vector is shuffled - perform the action only for
12381 // single vector, if the mask is not the identity mask.
12382 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12383 /*ForSingleMask=*/true);
12384 if (Res.second)
12385 // Identity mask is found.
12386 Prev = Res.first;
12387 else
12388 Prev = Action(Mask, {ShuffleMask.begin()->first});
12389 } else {
12390     // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
12391     // shuffles step by step, combining the shuffles between the steps.
12392 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12393 unsigned Vec2VF = GetVF(VMIt->first);
12394 if (Vec1VF == Vec2VF) {
12395 // No need to resize the input vectors since they are of the same size, we
12396 // can shuffle them directly.
12397 ArrayRef<int> SecMask = VMIt->second;
12398 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12399 if (SecMask[I] != PoisonMaskElem) {
12400 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12401 Mask[I] = SecMask[I] + Vec1VF;
12402 }
12403 }
12404 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12405 } else {
12406 // Vectors of different sizes - resize and reshuffle.
12407 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12408 /*ForSingleMask=*/false);
12409 std::pair<T *, bool> Res2 =
12410 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12411 ArrayRef<int> SecMask = VMIt->second;
12412 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12413 if (Mask[I] != PoisonMaskElem) {
12414 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12415 if (Res1.second)
12416 Mask[I] = I;
12417 } else if (SecMask[I] != PoisonMaskElem) {
12418 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12419 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12420 }
12421 }
12422 Prev = Action(Mask, {Res1.first, Res2.first});
12423 }
12424 VMIt = std::next(VMIt);
12425 }
12426 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
12427 // Perform requested actions for the remaining masks/vectors.
12428 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12429 // Shuffle other input vectors, if any.
12430 std::pair<T *, bool> Res =
12431 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12432 ArrayRef<int> SecMask = VMIt->second;
12433 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12434 if (SecMask[I] != PoisonMaskElem) {
12435 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12436 "Multiple uses of scalars.");
12437 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12438 } else if (Mask[I] != PoisonMaskElem) {
12439 Mask[I] = I;
12440 }
12441 }
12442 Prev = Action(Mask, {Prev, Res.first});
12443 }
12444 return Prev;
12445}
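// Illustrative sketch, not part of the original source: how the two-vector
// step above merges masks when both inputs already have the common VF. With
// VF = 4, a first mask {0, poison, 2, poison} selecting from Vec1 and a
// second mask {poison, 1, poison, 3} selecting from Vec2, the loop over
// SecMask rewrites each non-poison lane of the second mask as
// SecMask[I] + Vec1VF, yielding the combined two-source mask {0, 5, 2, 7}
// that Action() is then invoked with for {Vec1, Vec2}.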
12446
12447namespace {
12448/// Data type for handling buildvector sequences with the reused scalars from
12449/// other tree entries.
12450template <typename T> struct ShuffledInsertData {
12451 /// List of insertelements to be replaced by shuffles.
12452 SmallVector<InsertElementInst *> InsertElements;
12453 /// The parent vectors and shuffle mask for the given list of inserts.
12455};
12456} // namespace
12457
12460 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12461 << VectorizableTree.size() << ".\n");
12462
12463 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12464
12465 SmallPtrSet<Value *, 4> CheckedExtracts;
12466 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12467 TreeEntry &TE = *VectorizableTree[I];
12468     // No need to count the cost for combined entries; they are combined, so
12469     // just skip their cost.
12470 if (TE.State == TreeEntry::CombinedVectorize) {
12471 LLVM_DEBUG(
12472 dbgs() << "SLP: Skipping cost for combined node that starts with "
12473 << *TE.Scalars[0] << ".\n";
12474 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12475 continue;
12476 }
12477 if (TE.isGather() && TE.hasState()) {
12478 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12479 E && E->getVectorFactor() == TE.getVectorFactor() &&
12480 E->isSame(TE.Scalars)) {
12481 // Some gather nodes might be absolutely the same as some vectorizable
12482 // nodes after reordering, need to handle it.
12483 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12484 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12485 << "SLP: Current total cost = " << Cost << "\n");
12486 continue;
12487 }
12488 }
12489
12490     // Exclude the cost of gathered loads nodes which are not used. These nodes
12491     // were built as part of the final attempt to vectorize gathered loads.
12492 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12493 "Expected gather nodes with users only.");
12494
12495 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12496 Cost += C;
12497 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12498 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12499 << "SLP: Current total cost = " << Cost << "\n");
12500 }
12501
12502 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12503 InstructionCost ExtractCost = 0;
12505 SmallVector<APInt> DemandedElts;
12506 SmallDenseSet<Value *, 4> UsedInserts;
12508 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12510 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12511   // Keep track of {Scalar, Index, User} tuples.
12512 // On AArch64, this helps in fusing a mov instruction, associated with
12513 // extractelement, with fmul in the backend so that extractelement is free.
12515 for (ExternalUser &EU : ExternalUses) {
12516 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12517 }
12518 for (ExternalUser &EU : ExternalUses) {
12519 // Uses by ephemeral values are free (because the ephemeral value will be
12520 // removed prior to code generation, and so the extraction will be
12521 // removed as well).
12522 if (EphValues.count(EU.User))
12523 continue;
12524
12525     // Skip uses in unreachable blocks or in EH pads (rarely executed), or in
12526     // blocks terminated with an unreachable instruction.
12527 if (BasicBlock *UserParent =
12528 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12529 UserParent &&
12530 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12531 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12532 continue;
12533
12534 // We only add extract cost once for the same scalar.
12535 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12536 !ExtractCostCalculated.insert(EU.Scalar).second)
12537 continue;
12538
12539 // No extract cost for vector "scalar"
12540 if (isa<FixedVectorType>(EU.Scalar->getType()))
12541 continue;
12542
12543     // If the found user is an insertelement, do not calculate the extract cost
12544     // but try to detect it as a final shuffled/identity match.
12545 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12546 VU && VU->getOperand(1) == EU.Scalar) {
12547 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12548 if (!UsedInserts.insert(VU).second)
12549 continue;
12550 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12551 if (InsertIdx) {
12552 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12553 auto *It = find_if(
12554 ShuffledInserts,
12555 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12556 // Checks if 2 insertelements are from the same buildvector.
12557 InsertElementInst *VecInsert = Data.InsertElements.front();
12559 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12560 Value *Op0 = II->getOperand(0);
12561 if (getTreeEntry(II) && !getTreeEntry(Op0))
12562 return nullptr;
12563 return Op0;
12564 });
12565 });
12566 int VecId = -1;
12567 if (It == ShuffledInserts.end()) {
12568 auto &Data = ShuffledInserts.emplace_back();
12569 Data.InsertElements.emplace_back(VU);
12570 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12571 VecId = ShuffledInserts.size() - 1;
12572 auto It = MinBWs.find(ScalarTE);
12573 if (It != MinBWs.end() &&
12574 VectorCasts
12575 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12576 .second) {
12577 unsigned BWSz = It->second.first;
12578 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12579 unsigned VecOpcode;
12580 if (DstBWSz < BWSz)
12581 VecOpcode = Instruction::Trunc;
12582 else
12583 VecOpcode =
12584 It->second.second ? Instruction::SExt : Instruction::ZExt;
12587 VecOpcode, FTy,
12588 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12589 FTy->getNumElements()),
12591 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12592 << " for extending externally used vector with "
12593 "non-equal minimum bitwidth.\n");
12594 Cost += C;
12595 }
12596 } else {
12597 if (isFirstInsertElement(VU, It->InsertElements.front()))
12598 It->InsertElements.front() = VU;
12599 VecId = std::distance(ShuffledInserts.begin(), It);
12600 }
12601 int InIdx = *InsertIdx;
12602 SmallVectorImpl<int> &Mask =
12603 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12604 if (Mask.empty())
12605 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12606 Mask[InIdx] = EU.Lane;
12607 DemandedElts[VecId].setBit(InIdx);
12608 continue;
12609 }
12610 }
12611 }
12612
12614 // If we plan to rewrite the tree in a smaller type, we will need to sign
12615 // extend the extracted value back to the original type. Here, we account
12616 // for the extract and the added cost of the sign extend if needed.
12617 InstructionCost ExtraCost = TTI::TCC_Free;
12618 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12619 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12620 auto It = MinBWs.find(Entry);
12621 if (It != MinBWs.end()) {
12622 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12623 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12624 ? Instruction::ZExt
12625 : Instruction::SExt;
12626 VecTy = getWidenedType(MinTy, BundleWidth);
12627 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12628 VecTy, EU.Lane);
12629 } else {
12630 ExtraCost =
12631 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12632 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12633 }
12634 // Leave the scalar instructions as is if they are cheaper than extracts.
12635 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12636 Entry->getOpcode() == Instruction::Load) {
12637 // Checks if the user of the external scalar is phi in loop body.
12638 auto IsPhiInLoop = [&](const ExternalUser &U) {
12639 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12640 auto *I = cast<Instruction>(U.Scalar);
12641 const Loop *L = LI->getLoopFor(Phi->getParent());
12642 return L && (Phi->getParent() == I->getParent() ||
12643 L == LI->getLoopFor(I->getParent()));
12644 }
12645 return false;
12646 };
12647 if (!ValueToExtUses) {
12648 ValueToExtUses.emplace();
12649 for_each(enumerate(ExternalUses), [&](const auto &P) {
12650 // Ignore phis in loops.
12651 if (IsPhiInLoop(P.value()))
12652 return;
12653
12654 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12655 });
12656 }
12657       // We can use the original instruction if no operands are vectorized or they
12658       // are already marked as externally used.
12659 auto *Inst = cast<Instruction>(EU.Scalar);
12660 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12661 auto OperandIsScalar = [&](Value *V) {
12662 if (!getTreeEntry(V)) {
12663             // Some extractelements might not be vectorized, but
12664             // transformed into a shuffle and removed from the function;
12665             // consider that here.
12666 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12667 return !EE->hasOneUse() || !MustGather.contains(EE);
12668 return true;
12669 }
12670 return ValueToExtUses->contains(V);
12671 };
12672 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12673 bool CanBeUsedAsScalarCast = false;
12674 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12675 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12676 Op && all_of(Op->operands(), OperandIsScalar)) {
12677 InstructionCost OpCost =
12678 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12680 : 0;
12681 if (ScalarCost + OpCost <= ExtraCost) {
12682 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12683 ScalarCost += OpCost;
12684 }
12685 }
12686 }
12687 if (CanBeUsedAsScalar) {
12688 bool KeepScalar = ScalarCost <= ExtraCost;
12689         // Try to keep the original scalar if the user is a phi node from the same
12690         // block as the root phis currently being vectorized. This preserves better
12691         // ordering info for the PHIs being vectorized.
12692 bool IsProfitablePHIUser =
12693 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12694 VectorizableTree.front()->Scalars.size() > 2)) &&
12695 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12696 !Inst->hasNUsesOrMore(UsesLimit) &&
12697 none_of(Inst->users(),
12698 [&](User *U) {
12699 auto *PHIUser = dyn_cast<PHINode>(U);
12700 return (!PHIUser ||
12701 PHIUser->getParent() !=
12702 cast<Instruction>(
12703 VectorizableTree.front()->getMainOp())
12704 ->getParent()) &&
12705 !getTreeEntry(U);
12706 }) &&
12707 count_if(Entry->Scalars, [&](Value *V) {
12708 return ValueToExtUses->contains(V);
12709 }) <= 2;
12710 if (IsProfitablePHIUser) {
12711 KeepScalar = true;
12712 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12713 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12714 (!GatheredLoadsEntriesFirst.has_value() ||
12715 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12716 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12717 return ValueToExtUses->contains(V);
12718 });
12719 auto It = ExtractsCount.find(Entry);
12720 if (It != ExtractsCount.end()) {
12721 assert(ScalarUsesCount >= It->getSecond().size() &&
12722 "Expected total number of external uses not less than "
12723 "number of scalar uses.");
12724 ScalarUsesCount -= It->getSecond().size();
12725 }
12726           // Keep the original scalar if the number of externally used instructions
12727           // in the same entry is not a power of 2. It may enable some extra
12728           // vectorization for now.
12729 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12730 }
12731 if (KeepScalar) {
12732 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12733 for_each(Inst->operands(), [&](Value *V) {
12734 auto It = ValueToExtUses->find(V);
12735 if (It != ValueToExtUses->end()) {
12736 // Replace all uses to avoid compiler crash.
12737 ExternalUses[It->second].User = nullptr;
12738 }
12739 });
12740 ExtraCost = ScalarCost;
12741 if (!IsPhiInLoop(EU))
12742 ExtractsCount[Entry].insert(Inst);
12743 if (CanBeUsedAsScalarCast) {
12744 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12745 // Update the users of the operands of the cast operand to avoid
12746 // compiler crash.
12747 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12748 for_each(IOp->operands(), [&](Value *V) {
12749 auto It = ValueToExtUses->find(V);
12750 if (It != ValueToExtUses->end()) {
12751 // Replace all uses to avoid compiler crash.
12752 ExternalUses[It->second].User = nullptr;
12753 }
12754 });
12755 }
12756 }
12757 }
12758 }
12759 }
12760
12761 ExtractCost += ExtraCost;
12762 }
12763 // Insert externals for extract of operands of casts to be emitted as scalars
12764 // instead of extractelement.
12765 for (Value *V : ScalarOpsFromCasts) {
12766 ExternalUsesAsOriginalScalar.insert(V);
12767 if (const TreeEntry *E = getTreeEntry(V)) {
12768 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12769 }
12770 }
12771 // Add reduced value cost, if resized.
12772 if (!VectorizedVals.empty()) {
12773 const TreeEntry &Root = *VectorizableTree.front();
12774 auto BWIt = MinBWs.find(&Root);
12775 if (BWIt != MinBWs.end()) {
12776 Type *DstTy = Root.Scalars.front()->getType();
12777 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12778 unsigned SrcSz =
12779 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12780 if (OriginalSz != SrcSz) {
12781 unsigned Opcode = Instruction::Trunc;
12782 if (OriginalSz > SrcSz)
12783 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12784 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12785 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12786 assert(SLPReVec && "Only supported by REVEC.");
12787 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12788 }
12789 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12792 }
12793 }
12794 }
12795
12796 InstructionCost SpillCost = getSpillCost();
12797 Cost += SpillCost + ExtractCost;
12798 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12799 bool) {
12800 InstructionCost C = 0;
12801 unsigned VF = Mask.size();
12802 unsigned VecVF = TE->getVectorFactor();
12803 if (VF != VecVF &&
12804 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12806 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12807 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12808 OrigMask.begin());
12810 getWidenedType(TE->getMainOp()->getType(), VecVF),
12811 OrigMask);
12812 LLVM_DEBUG(
12813 dbgs() << "SLP: Adding cost " << C
12814 << " for final shuffle of insertelement external users.\n";
12815 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12816 Cost += C;
12817 return std::make_pair(TE, true);
12818 }
12819 return std::make_pair(TE, false);
12820 };
12821 // Calculate the cost of the reshuffled vectors, if any.
12822 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12823 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12824 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12825 unsigned VF = 0;
12826 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12828 assert((TEs.size() == 1 || TEs.size() == 2) &&
12829 "Expected exactly 1 or 2 tree entries.");
12830 if (TEs.size() == 1) {
12831 if (VF == 0)
12832 VF = TEs.front()->getVectorFactor();
12833 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12834 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12835 !all_of(enumerate(Mask), [=](const auto &Data) {
12836 return Data.value() == PoisonMaskElem ||
12837 (Data.index() < VF &&
12838 static_cast<int>(Data.index()) == Data.value());
12839 })) {
12842 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12843 << " for final shuffle of insertelement "
12844 "external users.\n";
12845 TEs.front()->dump();
12846 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12847 Cost += C;
12848 }
12849 } else {
12850 if (VF == 0) {
12851 if (TEs.front() &&
12852 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12853 VF = TEs.front()->getVectorFactor();
12854 else
12855 VF = Mask.size();
12856 }
12857 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12860 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12861 << " for final shuffle of vector node and external "
12862 "insertelement users.\n";
12863 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12864 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12865 Cost += C;
12866 }
12867 VF = Mask.size();
12868 return TEs.back();
12869 };
12870 (void)performExtractsShuffleAction<const TreeEntry>(
12871 MutableArrayRef(Vector.data(), Vector.size()), Base,
12872 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12873 EstimateShufflesCost);
12875 cast<FixedVectorType>(
12876 ShuffledInserts[I].InsertElements.front()->getType()),
12877 DemandedElts[I],
12878 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12879 Cost -= InsertCost;
12880 }
12881
12882 // Add the cost for reduced value resize (if required).
12883 if (ReductionBitWidth != 0) {
12884 assert(UserIgnoreList && "Expected reduction tree.");
12885 const TreeEntry &E = *VectorizableTree.front();
12886 auto It = MinBWs.find(&E);
12887 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12888 unsigned SrcSize = It->second.first;
12889 unsigned DstSize = ReductionBitWidth;
12890 unsigned Opcode = Instruction::Trunc;
12891 if (SrcSize < DstSize) {
12892 bool IsArithmeticExtendedReduction =
12893 all_of(*UserIgnoreList, [](Value *V) {
12894 auto *I = cast<Instruction>(V);
12895 return is_contained({Instruction::Add, Instruction::FAdd,
12896 Instruction::Mul, Instruction::FMul,
12897 Instruction::And, Instruction::Or,
12898 Instruction::Xor},
12899 I->getOpcode());
12900 });
12901 if (IsArithmeticExtendedReduction)
12902 Opcode =
12903 Instruction::BitCast; // Handle it by getExtendedReductionCost
12904 else
12905 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12906 }
12907 if (Opcode != Instruction::BitCast) {
12908 auto *SrcVecTy =
12909 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12910 auto *DstVecTy =
12911 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12912 TTI::CastContextHint CCH = getCastContextHint(E);
12913 InstructionCost CastCost;
12914 switch (E.getOpcode()) {
12915 case Instruction::SExt:
12916 case Instruction::ZExt:
12917 case Instruction::Trunc: {
12918 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12919 CCH = getCastContextHint(*OpTE);
12920 break;
12921 }
12922 default:
12923 break;
12924 }
12925 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12927 Cost += CastCost;
12928 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12929 << " for final resize for reduction from " << SrcVecTy
12930 << " to " << DstVecTy << "\n";
12931 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12932 }
12933 }
12934 }
12935
12936#ifndef NDEBUG
12937 SmallString<256> Str;
12938 {
12940 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12941 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12942 << "SLP: Total Cost = " << Cost << ".\n";
12943 }
12944 LLVM_DEBUG(dbgs() << Str);
12945 if (ViewSLPTree)
12946 ViewGraph(this, "SLP" + F->getName(), false, Str);
12947#endif
12948
12949 return Cost;
12950}
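// Illustrative sketch, not part of the original source: the keep-scalar
// decision in the external-use loop above, in numbers. Suppose a vectorized
// load also has a scalar user outside the tree. If extracting its lane from
// the vector would cost 2 (ExtraCost) while re-materializing the scalar load
// costs 1 (ScalarCost), then KeepScalar starts out true (the PHI and
// power-of-two heuristics that follow may still adjust it); if it stays true,
// the value is recorded in ExternalUsesAsOriginalScalar so codegen reuses the
// original scalar, and the cheaper ScalarCost is charged to ExtractCost
// instead of the extract cost.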
12951
12952 /// Tries to find extractelement instructions with constant indices from a fixed
12953 /// vector type and gathers such instructions into a bunch, which is highly likely
12954 /// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12955 /// successful, the matched scalars are replaced by poison values in \p VL for
12956 /// future analysis.
12957std::optional<TTI::ShuffleKind>
12958BoUpSLP::tryToGatherSingleRegisterExtractElements(
12960 // Scan list of gathered scalars for extractelements that can be represented
12961 // as shuffles.
12963 SmallVector<int> UndefVectorExtracts;
12964 for (int I = 0, E = VL.size(); I < E; ++I) {
12965 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12966 if (!EI) {
12967 if (isa<UndefValue>(VL[I]))
12968 UndefVectorExtracts.push_back(I);
12969 continue;
12970 }
12971 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12972 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12973 continue;
12974 std::optional<unsigned> Idx = getExtractIndex(EI);
12975 // Undefined index.
12976 if (!Idx) {
12977 UndefVectorExtracts.push_back(I);
12978 continue;
12979 }
12980 if (Idx >= VecTy->getNumElements()) {
12981 UndefVectorExtracts.push_back(I);
12982 continue;
12983 }
12984 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12985 ExtractMask.reset(*Idx);
12986 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12987 UndefVectorExtracts.push_back(I);
12988 continue;
12989 }
12990 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12991 }
12992 // Sort the vector operands by the maximum number of uses in extractelements.
12994 VectorOpToIdx.takeVector();
12995 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12996 return P1.second.size() > P2.second.size();
12997 });
12998 // Find the best pair of the vectors or a single vector.
12999 const int UndefSz = UndefVectorExtracts.size();
13000 unsigned SingleMax = 0;
13001 unsigned PairMax = 0;
13002 if (!Vectors.empty()) {
13003 SingleMax = Vectors.front().second.size() + UndefSz;
13004 if (Vectors.size() > 1) {
13005 auto *ItNext = std::next(Vectors.begin());
13006 PairMax = SingleMax + ItNext->second.size();
13007 }
13008 }
13009 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
13010 return std::nullopt;
13011   // Check whether it is better to perform a shuffle of 2 vectors or just of a
13012   // single vector.
13013 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
13014 SmallVector<Value *> GatheredExtracts(
13015 VL.size(), PoisonValue::get(VL.front()->getType()));
13016 if (SingleMax >= PairMax && SingleMax) {
13017 for (int Idx : Vectors.front().second)
13018 std::swap(GatheredExtracts[Idx], VL[Idx]);
13019 } else if (!Vectors.empty()) {
13020 for (unsigned Idx : {0, 1})
13021 for (int Idx : Vectors[Idx].second)
13022 std::swap(GatheredExtracts[Idx], VL[Idx]);
13023 }
13024 // Add extracts from undefs too.
13025 for (int Idx : UndefVectorExtracts)
13026 std::swap(GatheredExtracts[Idx], VL[Idx]);
13027 // Check that gather of extractelements can be represented as just a
13028 // shuffle of a single/two vectors the scalars are extracted from.
13029 std::optional<TTI::ShuffleKind> Res =
13030 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13031 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
13032 // TODO: try to check other subsets if possible.
13033 // Restore the original VL if attempt was not successful.
13034 copy(SavedVL, VL.begin());
13035 return std::nullopt;
13036 }
13037 // Restore unused scalars from mask, if some of the extractelements were not
13038 // selected for shuffle.
13039 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
13040 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13041 isa<UndefValue>(GatheredExtracts[I])) {
13042 std::swap(VL[I], GatheredExtracts[I]);
13043 continue;
13044 }
13045 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13046 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13047 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13048 is_contained(UndefVectorExtracts, I))
13049 continue;
13050 }
13051 return Res;
13052}
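// Illustrative sketch, not part of the original source: a gather that the
// routine above can describe as a single shuffle. For the hypothetical
// scalars
//
//   VL = { extractelement <4 x i32> %v, i32 0,
//          extractelement <4 x i32> %v, i32 2,
//          extractelement <4 x i32> %v, i32 3,
//          undef }
//
// every non-undef element is extracted from the same fixed vector %v with a
// constant index, so the whole gather is representable as a single-source
// shuffle with Mask = {0, 2, 3, poison}, and the matched scalars are then
// replaced by poison in VL for the follow-up analysis.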
13053
13054 /// Tries to find extractelement instructions with constant indices from a fixed
13055 /// vector type and gathers such instructions into a bunch, which is highly likely
13056 /// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
13057 /// successful, the matched scalars are replaced by poison values in \p VL for
13058 /// future analysis.
13060BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13062 unsigned NumParts) const {
13063 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
13064 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13065 Mask.assign(VL.size(), PoisonMaskElem);
13066 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13067 for (unsigned Part : seq<unsigned>(NumParts)) {
13068 // Scan list of gathered scalars for extractelements that can be represented
13069 // as shuffles.
13071 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13072 SmallVector<int> SubMask;
13073 std::optional<TTI::ShuffleKind> Res =
13074 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13075 ShufflesRes[Part] = Res;
13076 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13077 }
13078 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13079 return Res.has_value();
13080 }))
13081 ShufflesRes.clear();
13082 return ShufflesRes;
13083}
13084
13085std::optional<TargetTransformInfo::ShuffleKind>
13086BoUpSLP::isGatherShuffledSingleRegisterEntry(
13087 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13088 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13089 Entries.clear();
13090 // TODO: currently checking only for Scalars in the tree entry, need to count
13091 // reused elements too for better cost estimation.
13092 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13093 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13094 : TE->UserTreeIndices.front();
13095 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13096 const BasicBlock *TEInsertBlock = nullptr;
13097 // Main node of PHI entries keeps the correct order of operands/incoming
13098 // blocks.
13099 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13100 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13101 TEInsertPt = TEInsertBlock->getTerminator();
13102 } else {
13103 TEInsertBlock = TEInsertPt->getParent();
13104 }
13105 if (!DT->isReachableFromEntry(TEInsertBlock))
13106 return std::nullopt;
13107 auto *NodeUI = DT->getNode(TEInsertBlock);
13108 assert(NodeUI && "Should only process reachable instructions");
13109 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13110 auto CheckOrdering = [&](const Instruction *InsertPt) {
13111 // Argument InsertPt is an instruction where vector code for some other
13112 // tree entry (one that shares one or more scalars with TE) is going to be
13113 // generated. This lambda returns true if insertion point of vector code
13114 // for the TE dominates that point (otherwise dependency is the other way
13115 // around). The other node is not limited to be of a gather kind. Gather
13116 // nodes are not scheduled and their vector code is inserted before their
13117 // first user. If user is PHI, that is supposed to be at the end of a
13118 // predecessor block. Otherwise it is the last instruction among scalars of
13119 // the user node. So, instead of checking dependency between instructions
13120 // themselves, we check dependency between their insertion points for vector
13121 // code (since each scalar instruction ends up as a lane of a vector
13122 // instruction).
13123 const BasicBlock *InsertBlock = InsertPt->getParent();
13124 auto *NodeEUI = DT->getNode(InsertBlock);
13125 if (!NodeEUI)
13126 return false;
13127 assert((NodeUI == NodeEUI) ==
13128 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13129 "Different nodes should have different DFS numbers");
13130 // Check the order of the gather nodes users.
13131 if (TEInsertPt->getParent() != InsertBlock &&
13132 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13133 return false;
13134 if (TEInsertPt->getParent() == InsertBlock &&
13135 TEInsertPt->comesBefore(InsertPt))
13136 return false;
13137 return true;
13138 };
13139   // Find all tree entries used by the gathered values. If no common entries are
13140   // found - not a shuffle.
13141   // Here we build a set of tree nodes for each gathered value and try to
13142   // find the intersection between these sets. If we have at least one common
13143   // tree node for each gathered value - we have just a permutation of a
13144   // single vector. If we have 2 different sets, we're in a situation where we
13145   // have a permutation of 2 input vectors.
13147 DenseMap<Value *, int> UsedValuesEntry;
13148 for (Value *V : VL) {
13149 if (isConstant(V))
13150 continue;
13151 // Build a list of tree entries where V is used.
13153 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13154 if (TEPtr == TE || TEPtr->Idx == 0)
13155 continue;
13156 assert(any_of(TEPtr->Scalars,
13157 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13158 "Must contain at least single gathered value.");
13159 assert(TEPtr->UserTreeIndices.size() == 1 &&
13160 "Expected only single user of a gather node.");
13161 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13162
13163 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13164 const Instruction *InsertPt =
13165 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13166 : &getLastInstructionInBundle(UseEI.UserTE);
13167 if (TEInsertPt == InsertPt) {
13168         // If 2 gathers are operands of the same entry (regardless of whether the
13169         // user is a PHI or not), compare operand indices and use the earlier one
13170         // as the base.
13171 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13172 continue;
13173 // If the user instruction is used for some reason in different
13174 // vectorized nodes - make it depend on index.
13175 if (TEUseEI.UserTE != UseEI.UserTE &&
13176 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13177 continue;
13178 }
13179
13180 // Check if the user node of the TE comes after user node of TEPtr,
13181 // otherwise TEPtr depends on TE.
13182 if ((TEInsertBlock != InsertPt->getParent() ||
13183 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13184 !CheckOrdering(InsertPt))
13185 continue;
13186 VToTEs.insert(TEPtr);
13187 }
13188 if (const TreeEntry *VTE = getTreeEntry(V)) {
13189 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13190 if (VTE->State != TreeEntry::Vectorize) {
13191 auto It = MultiNodeScalars.find(V);
13192 if (It == MultiNodeScalars.end())
13193 continue;
13194 VTE = *It->getSecond().begin();
13195 // Iterate through all vectorized nodes.
13196 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13197 return MTE->State == TreeEntry::Vectorize;
13198 });
13199 if (MIt == It->getSecond().end())
13200 continue;
13201 VTE = *MIt;
13202 }
13203 }
13204 if (none_of(TE->CombinedEntriesWithIndices,
13205 [&](const auto &P) { return P.first == VTE->Idx; })) {
13206 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13207 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13208 continue;
13209 }
13210 VToTEs.insert(VTE);
13211 }
13212 if (VToTEs.empty())
13213 continue;
13214 if (UsedTEs.empty()) {
13215 // The first iteration, just insert the list of nodes to vector.
13216 UsedTEs.push_back(VToTEs);
13217 UsedValuesEntry.try_emplace(V, 0);
13218 } else {
13219 // Need to check if there are any previously used tree nodes which use V.
13220       // If there are no such nodes, consider that we have another input
13221       // vector.
13222 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13223 unsigned Idx = 0;
13224 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13225 // Do we have a non-empty intersection of previously listed tree entries
13226 // and tree entries using current V?
13227 set_intersect(VToTEs, Set);
13228 if (!VToTEs.empty()) {
13229 // Yes, write the new subset and continue analysis for the next
13230 // scalar.
13231 Set.swap(VToTEs);
13232 break;
13233 }
13234 VToTEs = SavedVToTEs;
13235 ++Idx;
13236 }
13237 // No non-empty intersection found - need to add a second set of possible
13238 // source vectors.
13239 if (Idx == UsedTEs.size()) {
13240 // If the number of input vectors is greater than 2 - not a permutation,
13241         // fall back to the regular gather.
13242 // TODO: support multiple reshuffled nodes.
13243 if (UsedTEs.size() == 2)
13244 continue;
13245 UsedTEs.push_back(SavedVToTEs);
13246 Idx = UsedTEs.size() - 1;
13247 }
13248 UsedValuesEntry.try_emplace(V, Idx);
13249 }
13250 }
13251
13252 if (UsedTEs.empty()) {
13253 Entries.clear();
13254 return std::nullopt;
13255 }
13256
13257 unsigned VF = 0;
13258 if (UsedTEs.size() == 1) {
13259 // Keep the order to avoid non-determinism.
13260 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13261 UsedTEs.front().end());
13262 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13263 return TE1->Idx < TE2->Idx;
13264 });
13265     // First, try to find a perfect match in another gather node.
13266 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13267 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13268 });
13269 if (It != FirstEntries.end() &&
13270 ((*It)->getVectorFactor() == VL.size() ||
13271 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13272 TE->ReuseShuffleIndices.size() == VL.size() &&
13273 (*It)->isSame(TE->Scalars)))) {
13274 Entries.push_back(*It);
13275 if ((*It)->getVectorFactor() == VL.size()) {
13276 std::iota(std::next(Mask.begin(), Part * VL.size()),
13277 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13278 } else {
13279 SmallVector<int> CommonMask = TE->getCommonMask();
13280 copy(CommonMask, Mask.begin());
13281 }
13282 // Clear undef scalars.
13283 for (unsigned I : seq<unsigned>(VL.size()))
13284 if (isa<PoisonValue>(VL[I]))
13285 Mask[Part * VL.size() + I] = PoisonMaskElem;
13287 }
13288     // No perfect match, just a shuffle, so choose the first node from the
13289     // tree.
13290 Entries.push_back(FirstEntries.front());
13291 VF = FirstEntries.front()->getVectorFactor();
13292 } else {
13293 // Try to find nodes with the same vector factor.
13294 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13295 // Keep the order of tree nodes to avoid non-determinism.
13297 for (const TreeEntry *TE : UsedTEs.front()) {
13298 unsigned VF = TE->getVectorFactor();
13299 auto It = VFToTE.find(VF);
13300 if (It != VFToTE.end()) {
13301 if (It->second->Idx > TE->Idx)
13302 It->getSecond() = TE;
13303 continue;
13304 }
13305 VFToTE.try_emplace(VF, TE);
13306 }
13307 // Same, keep the order to avoid non-determinism.
13308 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13309 UsedTEs.back().end());
13310 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13311 return TE1->Idx < TE2->Idx;
13312 });
13313 for (const TreeEntry *TE : SecondEntries) {
13314 auto It = VFToTE.find(TE->getVectorFactor());
13315 if (It != VFToTE.end()) {
13316 VF = It->first;
13317 Entries.push_back(It->second);
13318 Entries.push_back(TE);
13319 break;
13320 }
13321 }
13322 // No 2 source vectors with the same vector factor - just choose 2 with max
13323 // index.
13324 if (Entries.empty()) {
13325 Entries.push_back(*llvm::max_element(
13326 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13327 return TE1->Idx < TE2->Idx;
13328 }));
13329 Entries.push_back(SecondEntries.front());
13330 VF = std::max(Entries.front()->getVectorFactor(),
13331 Entries.back()->getVectorFactor());
13332 } else {
13333 VF = Entries.front()->getVectorFactor();
13334 }
13335 }
13336
13337 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13338   // Checks if the 2 PHIs are compatible, i.e. have a high likelihood of being
13339   // vectorized.
13340 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13341 auto *PHI = cast<PHINode>(V);
13342 auto *PHI1 = cast<PHINode>(V1);
13343 // Check that all incoming values are compatible/from same parent (if they
13344 // are instructions).
13345     // The incoming values are compatible if they are all constants, or
13346     // instructions with the same/alternate opcodes from the same basic block.
13347 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13348 Value *In = PHI->getIncomingValue(I);
13349 Value *In1 = PHI1->getIncomingValue(I);
13350 if (isConstant(In) && isConstant(In1))
13351 continue;
13352 if (!getSameOpcode({In, In1}, *TLI))
13353 return false;
13354 if (cast<Instruction>(In)->getParent() !=
13355 cast<Instruction>(In1)->getParent())
13356 return false;
13357 }
13358 return true;
13359 };
13360 // Check if the value can be ignored during analysis for shuffled gathers.
13361   // We suppose it is better to ignore instructions which do not form splats,
13362   // are not vectorized/not extractelements (these instructions will be handled
13363   // by extractelements processing) or may form a vector node in the future.
13364 auto MightBeIgnored = [=](Value *V) {
13365 auto *I = dyn_cast<Instruction>(V);
13366 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13368 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13369 };
13370 // Check that the neighbor instruction may form a full vector node with the
13371   // current instruction V. This is possible if they have the same/alternate
13372   // opcode and the same parent basic block.
13373 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13374 Value *V1 = VL[Idx];
13375 bool UsedInSameVTE = false;
13376 auto It = UsedValuesEntry.find(V1);
13377 if (It != UsedValuesEntry.end())
13378 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13379 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13380 getSameOpcode({V, V1}, *TLI) &&
13381 cast<Instruction>(V)->getParent() ==
13382 cast<Instruction>(V1)->getParent() &&
13383 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13384 };
13385 // Build a shuffle mask for better cost estimation and vector emission.
13386 SmallBitVector UsedIdxs(Entries.size());
13388 for (int I = 0, E = VL.size(); I < E; ++I) {
13389 Value *V = VL[I];
13390 auto It = UsedValuesEntry.find(V);
13391 if (It == UsedValuesEntry.end())
13392 continue;
13393     // Do not try to shuffle scalars if they are constants, or instructions
13394     // that can be vectorized as a result of the subsequent buildvector
13395     // vectorization.
13396 if (isConstant(V) || (MightBeIgnored(V) &&
13397 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13398 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13399 continue;
13400 unsigned Idx = It->second;
13401 EntryLanes.emplace_back(Idx, I);
13402 UsedIdxs.set(Idx);
13403 }
13404 // Iterate through all shuffled scalars and select entries, which can be used
13405 // for final shuffle.
13407 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13408 if (!UsedIdxs.test(I))
13409 continue;
13410 // Fix the entry number for the given scalar. If it is the first entry, set
13411     // Pair.first to 0, otherwise to 1 (we currently select at most 2 nodes).
13412     // These indices are used when calculating the final shuffle mask as the vector
13413 // offset.
13414 for (std::pair<unsigned, int> &Pair : EntryLanes)
13415 if (Pair.first == I)
13416 Pair.first = TempEntries.size();
13417 TempEntries.push_back(Entries[I]);
13418 }
13419 Entries.swap(TempEntries);
13420 if (EntryLanes.size() == Entries.size() &&
13421 !VL.equals(ArrayRef(TE->Scalars)
13422 .slice(Part * VL.size(),
13423 std::min<int>(VL.size(), TE->Scalars.size())))) {
13424     // We may have only 1 or 2 entries here. If the number of scalars is equal
13425     // to the number of entries, there is no need to do the analysis, it is not
13426     // very profitable. Since VL is not the same as TE->Scalars, it means we
13427     // already have some shuffles before. Cut off this unprofitable case.
13428 Entries.clear();
13429 return std::nullopt;
13430 }
13431 // Build the final mask, check for the identity shuffle, if possible.
13432 bool IsIdentity = Entries.size() == 1;
13433 // Pair.first is the offset to the vector, while Pair.second is the index of
13434 // scalar in the list.
13435 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13436 unsigned Idx = Part * VL.size() + Pair.second;
13437 Mask[Idx] =
13438 Pair.first * VF +
13439 (ForOrder ? std::distance(
13440 Entries[Pair.first]->Scalars.begin(),
13441 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13442 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13443 IsIdentity &= Mask[Idx] == Pair.second;
13444 }
13445 if (ForOrder || IsIdentity || Entries.empty()) {
13446 switch (Entries.size()) {
13447 case 1:
13448 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13450 break;
13451 case 2:
13452 if (EntryLanes.size() > 2 || VL.size() <= 2)
13454 break;
13455 default:
13456 break;
13457 }
13458 } else if (!isa<VectorType>(VL.front()->getType()) &&
13459 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13460     // Do the cost estimation if a shuffle is more beneficial than a buildvector.
13461 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13462 std::next(Mask.begin(), (Part + 1) * VL.size()));
13463 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13464 for (int Idx : SubMask) {
13465 if (Idx == PoisonMaskElem)
13466 continue;
13467 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13468 MinElement = Idx;
13469 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13470 MaxElement = Idx;
13471 }
13472 assert(MaxElement >= 0 && MinElement >= 0 &&
13473 MaxElement % VF >= MinElement % VF &&
13474 "Expected at least single element.");
13475 unsigned NewVF = std::max<unsigned>(
13476 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13477 (MaxElement % VF) -
13478 (MinElement % VF) + 1));
13479 if (NewVF < VF) {
13480 for_each(SubMask, [&](int &Idx) {
13481 if (Idx == PoisonMaskElem)
13482 return;
13483 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13484 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13485 });
13486 } else {
13487 NewVF = VF;
13488 }
13489
13490 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13491 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
13492 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13493 auto GetShuffleCost = [&,
13494 &TTI = *TTI](ArrayRef<int> Mask,
13495 ArrayRef<const TreeEntry *> Entries,
13496 VectorType *VecTy) -> InstructionCost {
13497 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13498 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13499 Mask, Entries.front()->getInterleaveFactor()))
13500 return TTI::TCC_Free;
13501 return ::getShuffleCost(TTI,
13502 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13503 : TTI::SK_PermuteSingleSrc,
13504 VecTy, Mask, CostKind);
13505 };
13506 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13507 InstructionCost FirstShuffleCost = 0;
13508 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13509 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13510 FirstShuffleCost = ShuffleCost;
13511 } else {
13512 // Transform mask to include only the first entry.
13513 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13514 bool IsIdentity = true;
13515 for (auto [I, Idx] : enumerate(FirstMask)) {
13516 if (Idx >= static_cast<int>(NewVF)) {
13517 Idx = PoisonMaskElem;
13518 } else {
13519 DemandedElts.clearBit(I);
13520 if (Idx != PoisonMaskElem)
13521 IsIdentity &= static_cast<int>(I) == Idx;
13522 }
13523 }
13524 if (!IsIdentity)
13525 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13526 FirstShuffleCost += TTI->getScalarizationOverhead(
13527 MaskVecTy, DemandedElts, /*Insert=*/true,
13528 /*Extract=*/false, CostKind);
13529 }
13530 InstructionCost SecondShuffleCost = 0;
13531 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13532 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13533 SecondShuffleCost = ShuffleCost;
13534 } else {
13535 // Transform mask to include only the second entry.
13536 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13537 bool IsIdentity = true;
13538 for (auto [I, Idx] : enumerate(SecondMask)) {
13539 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
13540 Idx = PoisonMaskElem;
13541 } else {
13542 DemandedElts.clearBit(I);
13543 if (Idx != PoisonMaskElem) {
13544 Idx -= NewVF;
13545 IsIdentity &= static_cast<int>(I) == Idx;
13546 }
13547 }
13548 }
13549 if (!IsIdentity)
13550 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13551 SecondShuffleCost += TTI->getScalarizationOverhead(
13552 MaskVecTy, DemandedElts, /*Insert=*/true,
13553 /*Extract=*/false, CostKind);
13554 }
13555 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13556 for (auto [I, Idx] : enumerate(SubMask))
13557 if (Idx == PoisonMaskElem)
13558 DemandedElts.clearBit(I);
13559 InstructionCost BuildVectorCost =
13560 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13561 /*Extract=*/false, CostKind);
13562 const TreeEntry *BestEntry = nullptr;
13563 if (FirstShuffleCost < ShuffleCost) {
13564 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13565 std::next(Mask.begin(), (Part + 1) * VL.size()),
13566 [&](int &Idx) {
13567 if (Idx >= static_cast<int>(VF))
13568 Idx = PoisonMaskElem;
13569 });
13570 BestEntry = Entries.front();
13571 ShuffleCost = FirstShuffleCost;
13572 }
13573 if (SecondShuffleCost < ShuffleCost) {
13574 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13575 std::next(Mask.begin(), (Part + 1) * VL.size()),
13576 [&](int &Idx) {
13577 if (Idx < static_cast<int>(VF))
13578 Idx = PoisonMaskElem;
13579 else
13580 Idx -= VF;
13581 });
13582 BestEntry = Entries[1];
13583 ShuffleCost = SecondShuffleCost;
13584 }
13585 if (BuildVectorCost >= ShuffleCost) {
13586 if (BestEntry) {
13587 Entries.clear();
13588 Entries.push_back(BestEntry);
13589 }
13590 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13591 : TargetTransformInfo::SK_PermuteSingleSrc;
13592 }
13593 }
13594 Entries.clear();
13595 // Clear the corresponding mask elements.
13596 std::fill(std::next(Mask.begin(), Part * VL.size()),
13597 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13598 return std::nullopt;
13599}
13600
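// Checks whether the gathered scalars in VL can be reused from already
// vectorized tree entries, processing the list register by register: VL is
// split into NumParts equal slices and each slice is matched independently by
// isGatherShuffledSingleRegisterEntry(). As a hypothetical illustration, 8
// scalars with NumParts == 2 are handled as two 4-element slices, and the
// result contains one optional shuffle kind per slice.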
13601 SmallVector<std::optional<TTI::ShuffleKind>>
13602 BoUpSLP::isGatherShuffledEntry(
13603 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13604 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13605 bool ForOrder) {
13606 assert(NumParts > 0 && NumParts < VL.size() &&
13607 "Expected positive number of registers.");
13608 Entries.clear();
13609 // No need to check for the topmost gather node.
13610 if (TE == VectorizableTree.front().get() &&
13611 (!GatheredLoadsEntriesFirst.has_value() ||
13612 none_of(ArrayRef(VectorizableTree).drop_front(),
13613 [](const std::unique_ptr<TreeEntry> &TE) {
13614 return !TE->isGather();
13615 })))
13616 return {};
13617 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13618 // implemented yet.
13619 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13620 return {};
13621 Mask.assign(VL.size(), PoisonMaskElem);
13622 assert((TE->UserTreeIndices.size() == 1 ||
13623 TE == VectorizableTree.front().get()) &&
13624 "Expected only single user of the gather node.");
13625 assert(VL.size() % NumParts == 0 &&
13626 "Number of scalars must be divisible by NumParts.");
13627 if (!TE->UserTreeIndices.empty() &&
13628 TE->UserTreeIndices.front().UserTE->isGather() &&
13629 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13630 assert(
13631 (TE->Idx == 0 ||
13632 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
13633 isSplat(TE->Scalars)) &&
13634 "Expected splat or extractelements only node.");
13635 return {};
13636 }
13637 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13638 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13639 for (unsigned Part : seq<unsigned>(NumParts)) {
13640 ArrayRef<Value *> SubVL =
13641 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13642 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13643 std::optional<TTI::ShuffleKind> SubRes =
13644 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13645 ForOrder);
13646 if (!SubRes)
13647 SubEntries.clear();
13648 Res.push_back(SubRes);
13649 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13650 SubEntries.front()->getVectorFactor() == VL.size() &&
13651 (SubEntries.front()->isSame(TE->Scalars) ||
13652 SubEntries.front()->isSame(VL))) {
13653 SmallVector<const TreeEntry *> LocalSubEntries;
13654 LocalSubEntries.swap(SubEntries);
13655 Entries.clear();
13656 Res.clear();
13657 std::iota(Mask.begin(), Mask.end(), 0);
13658 // Clear undef scalars.
13659 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13660 if (isa<PoisonValue>(VL[I]))
13661 Mask[I] = PoisonMaskElem;
13662 Entries.emplace_back(1, LocalSubEntries.front());
13663 Res.push_back(TTI::SK_PermuteSingleSrc);
13664 return Res;
13665 }
13666 }
13667 if (all_of(Res,
13668 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13669 Entries.clear();
13670 return {};
13671 }
13672 return Res;
13673}
13674
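// Estimates the cost of materializing VL as a vector: unique non-constant
// scalars are charged as element inserts (or as scalarization overhead when
// building from a poison source), and repeated scalars only add one final
// single-source permute. As a hypothetical illustration, gathering
// {%a, %b, %a, %b} charges the inserts for %a and %b plus one
// SK_PermuteSingleSrc that duplicates them into lanes 2 and 3.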
13675InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13676 Type *ScalarTy) const {
13677 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13678 bool DuplicateNonConst = false;
13679 // Find the cost of inserting/extracting values from the vector.
13680 // Check if the same elements are inserted several times and count them as
13681 // shuffle candidates.
13682 APInt ShuffledElements = APInt::getZero(VL.size());
13683 DenseMap<Value *, unsigned> UniqueElements;
13684 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13685 InstructionCost Cost;
13686 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13687 if (V->getType() != ScalarTy) {
13688 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13689 TTI::CastContextHint::None, CostKind);
13690 V = nullptr;
13691 }
13692 if (!ForPoisonSrc)
13693 Cost +=
13694 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13695 I, Constant::getNullValue(VecTy), V);
13696 };
13697 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13698 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13699 Value *V = VL[I];
13700 // No need to shuffle duplicates for constants.
13701 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13702 ShuffledElements.setBit(I);
13703 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13704 continue;
13705 }
13706
13707 auto Res = UniqueElements.try_emplace(V, I);
13708 if (Res.second) {
13709 EstimateInsertCost(I, V);
13710 ShuffleMask[I] = I;
13711 continue;
13712 }
13713
13714 DuplicateNonConst = true;
13715 ShuffledElements.setBit(I);
13716 ShuffleMask[I] = Res.first->second;
13717 }
13718 if (ForPoisonSrc) {
13719 if (isa<FixedVectorType>(ScalarTy)) {
13720 assert(SLPReVec && "Only supported by REVEC.");
13721 // We don't need to insert elements one by one. Instead, we can insert the
13722 // entire vector into the destination.
13723 Cost = 0;
13724 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13725 for (unsigned I : seq<unsigned>(VL.size()))
13726 if (!ShuffledElements[I])
13727 Cost += TTI->getShuffleCost(
13728 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13729 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13730 } else {
13731 Cost = TTI->getScalarizationOverhead(VecTy,
13732 /*DemandedElts*/ ~ShuffledElements,
13733 /*Insert*/ true,
13734 /*Extract*/ false, CostKind, VL);
13735 }
13736 }
13737 if (DuplicateNonConst)
13738 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13739 VecTy, ShuffleMask);
13740 return Cost;
13741}
13742
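// Returns the instruction after which the vectorized code for bundle E must be
// emitted. The common case walks the block's ScheduleData to the last bundle
// member; otherwise the scalars are ordered by dominator-tree DFS-in numbers
// across blocks (and by program order within a block) to find the latest or
// earliest one, depending on the kind of node.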
13743Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13744 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13745 if (Res)
13746 return *Res;
13747 // Get the basic block this bundle is in. All instructions in the bundle
13748 // should be in this block (except for extractelement-like instructions with
13749 // constant indices or gathered loads).
13750 auto *Front = E->getMainOp();
13751 auto *BB = Front->getParent();
13752 assert(((GatheredLoadsEntriesFirst.has_value() &&
13753 E->getOpcode() == Instruction::Load && E->isGather() &&
13754 E->Idx < *GatheredLoadsEntriesFirst) ||
13755 all_of(E->Scalars,
13756 [=](Value *V) -> bool {
13757 if (E->getOpcode() == Instruction::GetElementPtr &&
13758 !isa<GetElementPtrInst>(V))
13759 return true;
13760 auto *I = dyn_cast<Instruction>(V);
13761 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13762 isVectorLikeInstWithConstOps(I);
13763 })) &&
13764 "Expected gathered loads or GEPs or instructions from same basic "
13765 "block.");
13766
13767 auto FindLastInst = [&]() {
13768 Instruction *LastInst = Front;
13769 for (Value *V : E->Scalars) {
13770 auto *I = dyn_cast<Instruction>(V);
13771 if (!I)
13772 continue;
13773 if (LastInst->getParent() == I->getParent()) {
13774 if (LastInst->comesBefore(I))
13775 LastInst = I;
13776 continue;
13777 }
13778 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13779 !isa<GetElementPtrInst>(I)) ||
13780 (isVectorLikeInstWithConstOps(LastInst) &&
13781 isVectorLikeInstWithConstOps(I)) ||
13782 (GatheredLoadsEntriesFirst.has_value() &&
13783 E->getOpcode() == Instruction::Load && E->isGather() &&
13784 E->Idx < *GatheredLoadsEntriesFirst)) &&
13785 "Expected vector-like or non-GEP in GEP node insts only.");
13786 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13787 LastInst = I;
13788 continue;
13789 }
13790 if (!DT->isReachableFromEntry(I->getParent()))
13791 continue;
13792 auto *NodeA = DT->getNode(LastInst->getParent());
13793 auto *NodeB = DT->getNode(I->getParent());
13794 assert(NodeA && "Should only process reachable instructions");
13795 assert(NodeB && "Should only process reachable instructions");
13796 assert((NodeA == NodeB) ==
13797 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13798 "Different nodes should have different DFS numbers");
13799 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13800 LastInst = I;
13801 }
13802 BB = LastInst->getParent();
13803 return LastInst;
13804 };
13805
13806 auto FindFirstInst = [&]() {
13807 Instruction *FirstInst = Front;
13808 for (Value *V : E->Scalars) {
13809 auto *I = dyn_cast<Instruction>(V);
13810 if (!I)
13811 continue;
13812 if (FirstInst->getParent() == I->getParent()) {
13813 if (I->comesBefore(FirstInst))
13814 FirstInst = I;
13815 continue;
13816 }
13817 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13818 !isa<GetElementPtrInst>(I)) ||
13819 (isVectorLikeInstWithConstOps(FirstInst) &&
13820 isVectorLikeInstWithConstOps(I))) &&
13821 "Expected vector-like or non-GEP in GEP node insts only.");
13822 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13823 FirstInst = I;
13824 continue;
13825 }
13826 if (!DT->isReachableFromEntry(I->getParent()))
13827 continue;
13828 auto *NodeA = DT->getNode(FirstInst->getParent());
13829 auto *NodeB = DT->getNode(I->getParent());
13830 assert(NodeA && "Should only process reachable instructions");
13831 assert(NodeB && "Should only process reachable instructions");
13832 assert((NodeA == NodeB) ==
13833 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13834 "Different nodes should have different DFS numbers");
13835 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13836 FirstInst = I;
13837 }
13838 return FirstInst;
13839 };
13840
13841 // Set insertpoint for gathered loads to the very first load.
13842 if (GatheredLoadsEntriesFirst.has_value() &&
13843 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13844 E->getOpcode() == Instruction::Load) {
13845 Res = FindFirstInst();
13846 return *Res;
13847 }
13848
13849 // Set the insert point to the beginning of the basic block if the entry
13850 // should not be scheduled.
13851 if (doesNotNeedToSchedule(E->Scalars) ||
13852 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13853 if ((E->getOpcode() == Instruction::GetElementPtr &&
13854 any_of(E->Scalars,
13855 [](Value *V) {
13856 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13857 })) ||
13858 all_of(E->Scalars,
13859 [](Value *V) {
13860 return isa<PoisonValue>(V) ||
13861 (!isVectorLikeInstWithConstOps(V) &&
13862 isUsedOutsideBlock(V));
13863 }) ||
13864 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13865 return isa<ExtractElementInst, UndefValue>(V) ||
13866 areAllOperandsNonInsts(V);
13867 })))
13868 Res = FindLastInst();
13869 else
13870 Res = FindFirstInst();
13871 return *Res;
13872 }
13873
13874 // Find the last instruction. The common case should be that BB has been
13875 // scheduled, and the last instruction is VL.back(). So we start with
13876 // VL.back() and iterate over schedule data until we reach the end of the
13877 // bundle. The end of the bundle is marked by null ScheduleData.
13878 if (BlocksSchedules.count(BB) && !E->isGather()) {
13879 Value *V = E->isOneOf(E->Scalars.back());
13880 if (doesNotNeedToBeScheduled(V))
13881 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13882 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13883 if (Bundle && Bundle->isPartOfBundle())
13884 for (; Bundle; Bundle = Bundle->NextInBundle)
13885 Res = Bundle->Inst;
13886 }
13887
13888 // LastInst can still be null at this point if there's either not an entry
13889 // for BB in BlocksSchedules or there's no ScheduleData available for
13890 // VL.back(). This can be the case if buildTree_rec aborts for various
13891 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13892 // size is reached, etc.). ScheduleData is initialized in the scheduling
13893 // "dry-run".
13894 //
13895 // If this happens, we can still find the last instruction by brute force. We
13896 // iterate forwards from Front (inclusive) until we either see all
13897 // instructions in the bundle or reach the end of the block. If Front is the
13898 // last instruction in program order, LastInst will be set to Front, and we
13899 // will visit all the remaining instructions in the block.
13900 //
13901 // One of the reasons we exit early from buildTree_rec is to place an upper
13902 // bound on compile-time. Thus, taking an additional compile-time hit here is
13903 // not ideal. However, this should be exceedingly rare since it requires that
13904 // we both exit early from buildTree_rec and that the bundle be out-of-order
13905 // (causing us to iterate all the way to the end of the block).
13906 if (!Res)
13907 Res = FindLastInst();
13908 assert(Res && "Failed to find last instruction in bundle");
13909 return *Res;
13910}
13911
13912void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13913 auto *Front = E->getMainOp();
13914 Instruction *LastInst = &getLastInstructionInBundle(E);
13915 assert(LastInst && "Failed to find last instruction in bundle");
13916 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13917 // If the instruction is PHI, set the insert point after all the PHIs.
13918 bool IsPHI = isa<PHINode>(LastInst);
13919 if (IsPHI)
13920 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13921 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13922 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13923 } else {
13924 // Set the insertion point after the last instruction in the bundle. Set the
13925 // debug location to Front.
13926 Builder.SetInsertPoint(
13927 LastInst->getParent(),
13928 LastInst->getNextNonDebugInstruction()->getIterator());
13929 }
13930 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13931}
13932
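// Materializes the scalars in VL into the vector Root (or a fresh poison
// vector): constants are inserted first, then the remaining scalars, while
// instructions defined inside the current loop are postponed to the end of the
// sequence so that the loop-invariant part of the gather can still be hoisted.
// A hypothetical illustration of the emitted order:
// \code
// %v0 = insertelement <4 x i32> poison, i32 7, i32 2 ; constant first
// %v1 = insertelement <4 x i32> %v0, i32 %a, i32 0 ; non-constant scalar
// %v2 = insertelement <4 x i32> %v1, i32 %b, i32 1 ; %b defined in the loop,
// ; so it is inserted last
// \endcode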
13933Value *BoUpSLP::gather(
13934 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13935 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13936 // List of instructions/lanes from current block and/or the blocks which are
13937 // part of the current loop. These instructions will be inserted at the end to
13938 // make it possible to optimize loops and hoist invariant instructions out of
13939 // the loop's body with better chances for success.
13940 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
13941 SmallSet<int, 4> PostponedIndices;
13942 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13943 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13944 SmallPtrSet<BasicBlock *, 4> Visited;
13945 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13946 InsertBB = InsertBB->getSinglePredecessor();
13947 return InsertBB && InsertBB == InstBB;
13948 };
13949 for (int I = 0, E = VL.size(); I < E; ++I) {
13950 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13951 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13952 getTreeEntry(Inst) ||
13953 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13954 PostponedIndices.insert(I).second)
13955 PostponedInsts.emplace_back(Inst, I);
13956 }
13957
13958 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13959 Type *Ty) {
13960 Value *Scalar = V;
13961 if (Scalar->getType() != Ty) {
13962 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13963 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13964 Value *V = Scalar;
13965 if (auto *CI = dyn_cast<CastInst>(Scalar);
13966 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13967 Value *Op = CI->getOperand(0);
13968 if (auto *IOp = dyn_cast<Instruction>(Op);
13969 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13970 V = Op;
13971 }
13972 Scalar = Builder.CreateIntCast(
13973 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13974 }
13975
13976 Instruction *InsElt;
13977 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13978 assert(SLPReVec && "FixedVectorType is not expected.");
13979 Vec =
13980 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
13981 auto *II = dyn_cast<IntrinsicInst>(Vec);
13982 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13983 return Vec;
13984 InsElt = II;
13985 } else {
13986 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13987 InsElt = dyn_cast<InsertElementInst>(Vec);
13988 if (!InsElt)
13989 return Vec;
13990 }
13991 GatherShuffleExtractSeq.insert(InsElt);
13992 CSEBlocks.insert(InsElt->getParent());
13993 // Add to our 'need-to-extract' list.
13994 if (isa<Instruction>(V)) {
13995 if (TreeEntry *Entry = getTreeEntry(V)) {
13996 // Find which lane we need to extract.
13997 User *UserOp = nullptr;
13998 if (Scalar != V) {
13999 if (auto *SI = dyn_cast<Instruction>(Scalar))
14000 UserOp = SI;
14001 } else {
14002 UserOp = InsElt;
14003 }
14004 if (UserOp) {
14005 unsigned FoundLane = Entry->findLaneForValue(V);
14006 ExternalUses.emplace_back(V, UserOp, FoundLane);
14007 }
14008 }
14009 }
14010 return Vec;
14011 };
14012 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14013 Value *Vec = PoisonValue::get(VecTy);
14014 SmallVector<int> NonConsts;
14015 SmallVector<int> Mask(VL.size(), PoisonMaskElem);
14016 std::iota(Mask.begin(), Mask.end(), 0);
14017 Value *OriginalRoot = Root;
14018 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14019 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14020 SV->getOperand(0)->getType() == VecTy) {
14021 Root = SV->getOperand(0);
14022 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14023 }
14024 // Insert constant values at first.
14025 for (int I = 0, E = VL.size(); I < E; ++I) {
14026 if (PostponedIndices.contains(I))
14027 continue;
14028 if (!isConstant(VL[I])) {
14029 NonConsts.push_back(I);
14030 continue;
14031 }
14032 if (isa<PoisonValue>(VL[I]))
14033 continue;
14034 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14035 Mask[I] = I + E;
14036 }
14037 if (Root) {
14038 if (isa<PoisonValue>(Vec)) {
14039 Vec = OriginalRoot;
14040 } else {
14041 Vec = CreateShuffle(Root, Vec, Mask);
14042 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14043 OI && OI->hasNUses(0) &&
14044 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14045 return TE->VectorizedValue == OI;
14046 }))
14047 eraseInstruction(OI);
14048 }
14049 }
14050 // Insert non-constant values.
14051 for (int I : NonConsts)
14052 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14053 // Append instructions which are/may be part of the loop at the end, to make
14054 // it possible to hoist non-loop-based instructions.
14055 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14056 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14057
14058 return Vec;
14059}
14060
14061/// Merges shuffle masks and emits final shuffle instruction, if required. It
14062 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
14063 /// the actual shuffle instruction is generated only if it is really
14064 /// required. Otherwise, the shuffle instruction emission is delayed till the
14065/// end of the process, to reduce the number of emitted instructions and further
14066/// analysis/transformations.
14067 /// The class will also look through the previously emitted shuffle instructions
14068/// and properly mark indices in mask as undef.
14069/// For example, given the code
14070/// \code
14071/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14072/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14073/// \endcode
14074 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14075/// look through %s1 and %s2 and emit
14076/// \code
14077/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14078/// \endcode
14079/// instead.
14080/// If 2 operands are of different size, the smallest one will be resized and
14081/// the mask recalculated properly.
14082/// For example, given the code
14083/// \code
14084/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14085/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14086/// \endcode
14087 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14088/// look through %s1 and %s2 and emit
14089/// \code
14090/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14091/// \endcode
14092/// instead.
14093class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14094 bool IsFinalized = false;
14095 /// Combined mask for all applied operands and masks. It is built during
14096 /// analysis and actual emission of shuffle vector instructions.
14097 SmallVector<int> CommonMask;
14098 /// List of operands for the shuffle vector instruction. It holds at most 2
14099 /// operands; if a 3rd one is going to be added, the first 2 are combined into
14100 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
14101 /// resulting shuffle and the second operand is set to be the newly added
14102 /// operand. The \p CommonMask is transformed in the proper way after that.
14103 SmallVector<Value *, 2> InVectors;
14104 IRBuilderBase &Builder;
14105 BoUpSLP &R;
14106
14107 class ShuffleIRBuilder {
14108 IRBuilderBase &Builder;
14109 /// Holds all of the instructions that we gathered.
14110 SetVector<Instruction *> &GatherShuffleExtractSeq;
14111 /// A list of blocks that we are going to CSE.
14112 DenseSet<BasicBlock *> &CSEBlocks;
14113 /// Data layout.
14114 const DataLayout &DL;
14115
14116 public:
14117 ShuffleIRBuilder(IRBuilderBase &Builder,
14118 SetVector<Instruction *> &GatherShuffleExtractSeq,
14119 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14120 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14121 CSEBlocks(CSEBlocks), DL(DL) {}
14122 ~ShuffleIRBuilder() = default;
14123 /// Creates shufflevector for the 2 operands with the given mask.
14124 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14125 if (V1->getType() != V2->getType()) {
14126 assert(V2->getType()->isIntOrIntVectorTy() &&
14127 V1->getType()->isIntOrIntVectorTy() &&
14128 "Expected integer vector types only.");
14129 if (V1->getType() != V2->getType()) {
14130 if (cast<VectorType>(V2->getType())
14131 ->getElementType()
14132 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14133 ->getElementType()
14134 ->getIntegerBitWidth())
14135 V2 = Builder.CreateIntCast(
14136 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14137 else
14138 V1 = Builder.CreateIntCast(
14139 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14140 }
14141 }
14142 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14143 if (auto *I = dyn_cast<Instruction>(Vec)) {
14144 GatherShuffleExtractSeq.insert(I);
14145 CSEBlocks.insert(I->getParent());
14146 }
14147 return Vec;
14148 }
14149 /// Creates permutation of the single vector operand with the given mask, if
14150 /// it is not an identity mask.
14151 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14152 if (Mask.empty())
14153 return V1;
14154 unsigned VF = Mask.size();
14155 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14156 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14157 return V1;
14158 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14159 if (auto *I = dyn_cast<Instruction>(Vec)) {
14160 GatherShuffleExtractSeq.insert(I);
14161 CSEBlocks.insert(I->getParent());
14162 }
14163 return Vec;
14164 }
14165 Value *createIdentity(Value *V) { return V; }
14166 Value *createPoison(Type *Ty, unsigned VF) {
14167 return PoisonValue::get(getWidenedType(Ty, VF));
14168 }
14169 /// Resizes 2 input vectors to match their sizes, if they are not equal
14170 /// yet. The smallest vector is resized to the size of the larger vector.
14171 void resizeToMatch(Value *&V1, Value *&V2) {
14172 if (V1->getType() == V2->getType())
14173 return;
14174 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14175 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14176 int VF = std::max(V1VF, V2VF);
14177 int MinVF = std::min(V1VF, V2VF);
14178 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14179 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14180 0);
14181 Value *&Op = MinVF == V1VF ? V1 : V2;
14182 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14183 if (auto *I = dyn_cast<Instruction>(Op)) {
14184 GatherShuffleExtractSeq.insert(I);
14185 CSEBlocks.insert(I->getParent());
14186 }
14187 if (MinVF == V1VF)
14188 V1 = Op;
14189 else
14190 V2 = Op;
14191 }
14192 };
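// ShuffleIRBuilder is the IR-emitting callback object handed to
// BaseShuffleAnalysis::createShuffle() below; every shufflevector it creates
// is recorded in GatherShuffleExtractSeq and its parent block in CSEBlocks so
// that the later CSE over gather sequences can revisit them.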
14193
14194 /// Smart shuffle instruction emission, walks through shuffles trees and
14195 /// tries to find the best matching vector for the actual shuffle
14196 /// instruction.
14197 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14198 assert(V1 && "Expected at least one vector value.");
14199 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14200 R.CSEBlocks, *R.DL);
14201 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
14202 ShuffleBuilder);
14203 }
14204
14205 /// Cast value \p V to the vector type with the same number of elements, but
14206 /// the base type \p ScalarTy.
14207 Value *castToScalarTyElem(Value *V,
14208 std::optional<bool> IsSigned = std::nullopt) {
14209 auto *VecTy = cast<VectorType>(V->getType());
14210 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14211 if (VecTy->getElementType() == ScalarTy->getScalarType())
14212 return V;
14213 return Builder.CreateIntCast(
14214 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14215 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14216 }
14217
14218public:
14219 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14220 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14221
14222 /// Adjusts extractelements after reusing them.
14223 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14224 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14225 unsigned NumParts, bool &UseVecBaseAsInput) {
14226 UseVecBaseAsInput = false;
14227 SmallPtrSet<Value *, 4> UniqueBases;
14228 Value *VecBase = nullptr;
14229 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14230 if (!E->ReorderIndices.empty()) {
14231 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14232 E->ReorderIndices.end());
14233 reorderScalars(VL, ReorderMask);
14234 }
14235 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14236 int Idx = Mask[I];
14237 if (Idx == PoisonMaskElem)
14238 continue;
14239 auto *EI = cast<ExtractElementInst>(VL[I]);
14240 VecBase = EI->getVectorOperand();
14241 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14242 VecBase = TE->VectorizedValue;
14243 assert(VecBase && "Expected vectorized value.");
14244 UniqueBases.insert(VecBase);
14245 // If the only use is vectorized - we can delete the extractelement
14246 // itself.
14247 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14248 (NumParts != 1 && count(VL, EI) > 1) ||
14249 any_of(EI->users(), [&](User *U) {
14250 const TreeEntry *UTE = R.getTreeEntry(U);
14251 return !UTE || R.MultiNodeScalars.contains(U) ||
14252 (isa<GetElementPtrInst>(U) &&
14253 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14254 count_if(R.VectorizableTree,
14255 [&](const std::unique_ptr<TreeEntry> &TE) {
14256 return any_of(TE->UserTreeIndices,
14257 [&](const EdgeInfo &Edge) {
14258 return Edge.UserTE == UTE;
14259 }) &&
14260 is_contained(VL, EI);
14261 }) != 1;
14262 }))
14263 continue;
14264 R.eraseInstruction(EI);
14265 }
14266 if (NumParts == 1 || UniqueBases.size() == 1) {
14267 assert(VecBase && "Expected vectorized value.");
14268 return castToScalarTyElem(VecBase);
14269 }
14270 UseVecBaseAsInput = true;
14271 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14272 for (auto [I, Idx] : enumerate(Mask))
14273 if (Idx != PoisonMaskElem)
14274 Idx = I;
14275 };
14276 // Perform multi-register vector shuffle, joining them into a single virtual
14277 // long vector.
14278 // Need to shuffle each part independently and then insert all these parts
14279 // into a long virtual vector register, forming the original vector.
14280 Value *Vec = nullptr;
14281 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14282 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14283 for (unsigned Part : seq<unsigned>(NumParts)) {
14284 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14285 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14286 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14287 constexpr int MaxBases = 2;
14288 SmallVector<Value *, MaxBases> Bases(MaxBases);
14289 auto VLMask = zip(SubVL, SubMask);
14290 const unsigned VF = std::accumulate(
14291 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14292 if (std::get<1>(D) == PoisonMaskElem)
14293 return S;
14294 Value *VecOp =
14295 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14296 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14297 VecOp = TE->VectorizedValue;
14298 assert(VecOp && "Expected vectorized value.");
14299 const unsigned Size =
14300 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14301 return std::max(S, Size);
14302 });
14303 for (const auto [V, I] : VLMask) {
14304 if (I == PoisonMaskElem)
14305 continue;
14306 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14307 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14308 VecOp = TE->VectorizedValue;
14309 assert(VecOp && "Expected vectorized value.");
14310 VecOp = castToScalarTyElem(VecOp);
14311 Bases[I / VF] = VecOp;
14312 }
14313 if (!Bases.front())
14314 continue;
14315 Value *SubVec;
14316 if (Bases.back()) {
14317 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14318 TransformToIdentity(SubMask);
14319 } else {
14320 SubVec = Bases.front();
14321 }
14322 if (!Vec) {
14323 Vec = SubVec;
14324 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14325 [&](unsigned P) {
14326 ArrayRef<int> SubMask =
14327 Mask.slice(P * SliceSize,
14328 getNumElems(Mask.size(),
14329 SliceSize, P));
14330 return all_of(SubMask, [](int Idx) {
14331 return Idx == PoisonMaskElem;
14332 });
14333 })) &&
14334 "Expected first part or all previous parts masked.");
14335 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14336 } else {
14337 unsigned NewVF =
14338 cast<FixedVectorType>(Vec->getType())->getNumElements();
14339 if (Vec->getType() != SubVec->getType()) {
14340 unsigned SubVecVF =
14341 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14342 NewVF = std::max(NewVF, SubVecVF);
14343 }
14344 // Adjust SubMask.
14345 for (int &Idx : SubMask)
14346 if (Idx != PoisonMaskElem)
14347 Idx += NewVF;
14348 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14349 Vec = createShuffle(Vec, SubVec, VecMask);
14350 TransformToIdentity(VecMask);
14351 }
14352 }
14353 copy(VecMask, Mask.begin());
14354 return Vec;
14355 }
14356 /// Checks if the specified entry \p E needs to be delayed because of its
14357 /// dependency nodes.
14358 std::optional<Value *>
14359 needToDelay(const TreeEntry *E,
14360 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14361 // No need to delay emission if all deps are ready.
14362 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14363 return all_of(
14364 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14365 }))
14366 return std::nullopt;
14367 // Postpone gather emission, will be emitted after the end of the
14368 // process to keep correct order.
14369 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14370 return Builder.CreateAlignedLoad(
14371 ResVecTy,
14372 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14373 MaybeAlign());
14374 }
14375 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14376 /// shuffling.
14377 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14378 Value *V1 = E1.VectorizedValue;
14379 if (V1->getType()->isIntOrIntVectorTy())
14380 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14381 if (isa<PoisonValue>(V))
14382 return false;
14383 return !isKnownNonNegative(
14384 V, SimplifyQuery(*R.DL));
14385 }));
14386 Value *V2 = E2.VectorizedValue;
14387 if (V2->getType()->isIntOrIntVectorTy())
14388 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14389 if (isa<PoisonValue>(V))
14390 return false;
14391 return !isKnownNonNegative(
14392 V, SimplifyQuery(*R.DL));
14393 }));
14394 add(V1, V2, Mask);
14395 }
14396 /// Adds single input vector (in form of tree entry) and the mask for its
14397 /// shuffling.
14398 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14399 Value *V1 = E1.VectorizedValue;
14400 if (V1->getType()->isIntOrIntVectorTy())
14401 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14402 if (isa<PoisonValue>(V))
14403 return false;
14404 return !isKnownNonNegative(
14405 V, SimplifyQuery(*R.DL));
14406 }));
14407 add(V1, Mask);
14408 }
14409 /// Adds 2 input vectors and the mask for their shuffling.
14410 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14411 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14412 assert(isa<FixedVectorType>(V1->getType()) &&
14413 isa<FixedVectorType>(V2->getType()) &&
14414 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14415 V1 = castToScalarTyElem(V1);
14416 V2 = castToScalarTyElem(V2);
14417 if (InVectors.empty()) {
14418 InVectors.push_back(V1);
14419 InVectors.push_back(V2);
14420 CommonMask.assign(Mask.begin(), Mask.end());
14421 return;
14422 }
14423 Value *Vec = InVectors.front();
14424 if (InVectors.size() == 2) {
14425 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14426 transformMaskAfterShuffle(CommonMask, CommonMask);
14427 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14428 Mask.size()) {
14429 Vec = createShuffle(Vec, nullptr, CommonMask);
14430 transformMaskAfterShuffle(CommonMask, CommonMask);
14431 }
14432 V1 = createShuffle(V1, V2, Mask);
14433 unsigned VF = std::max(getVF(V1), getVF(Vec));
14434 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14435 if (Mask[Idx] != PoisonMaskElem)
14436 CommonMask[Idx] = Idx + VF;
14437 InVectors.front() = Vec;
14438 if (InVectors.size() == 2)
14439 InVectors.back() = V1;
14440 else
14441 InVectors.push_back(V1);
14442 }
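// Note for the add() overload above: once a third vector arrives, the two
// vectors accumulated so far are folded into a single value through
// CommonMask, and the lanes that should come from the newly added pair are
// re-based by VF. As a hypothetical illustration with VF == 4, such a lane I
// becomes CommonMask[I] = I + 4, i.e. it selects from the second operand of
// the next emitted shufflevector.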
14443 /// Adds one more input vector and the mask for its shuffling.
14444 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14445 assert(isa<FixedVectorType>(V1->getType()) &&
14446 "castToScalarTyElem expects V1 to be FixedVectorType");
14447 V1 = castToScalarTyElem(V1);
14448 if (InVectors.empty()) {
14449 InVectors.push_back(V1);
14450 CommonMask.assign(Mask.begin(), Mask.end());
14451 return;
14452 }
14453 const auto *It = find(InVectors, V1);
14454 if (It == InVectors.end()) {
14455 if (InVectors.size() == 2 ||
14456 InVectors.front()->getType() != V1->getType()) {
14457 Value *V = InVectors.front();
14458 if (InVectors.size() == 2) {
14459 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14460 transformMaskAfterShuffle(CommonMask, CommonMask);
14461 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14462 CommonMask.size()) {
14463 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14464 transformMaskAfterShuffle(CommonMask, CommonMask);
14465 }
14466 unsigned VF = std::max(CommonMask.size(), Mask.size());
14467 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14468 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14469 CommonMask[Idx] =
14470 V->getType() != V1->getType()
14471 ? Idx + VF
14472 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14473 ->getNumElements();
14474 if (V->getType() != V1->getType())
14475 V1 = createShuffle(V1, nullptr, Mask);
14476 InVectors.front() = V;
14477 if (InVectors.size() == 2)
14478 InVectors.back() = V1;
14479 else
14480 InVectors.push_back(V1);
14481 return;
14482 }
14483 // Check if second vector is required if the used elements are already
14484 // used from the first one.
14485 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14486 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14487 InVectors.push_back(V1);
14488 break;
14489 }
14490 }
14491 unsigned VF = 0;
14492 for (Value *V : InVectors)
14493 VF = std::max(VF, getVF(V));
14494 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14495 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14496 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14497 }
14498 /// Adds one more input vector and the mask for its shuffling.
14499 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14500 SmallVector<int> NewMask;
14501 inversePermutation(Order, NewMask);
14502 add(V1, NewMask);
14503 }
14504 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14505 Value *Root = nullptr) {
14506 return R.gather(VL, Root, ScalarTy,
14507 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14508 return createShuffle(V1, V2, Mask);
14509 });
14510 }
14511 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14512 /// Finalize emission of the shuffles.
14513 /// \param Action the action (if any) to be performed before the final
14514 /// application of the \p ExtMask mask.
14515 Value *
14516 finalize(ArrayRef<int> ExtMask,
14517 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14518 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14519 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14520 IsFinalized = true;
14521 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14522 SmallVector<int> NewExtMask(ExtMask);
14523 if (ScalarTyNumElements != 1) {
14524 assert(SLPReVec && "FixedVectorType is not expected.");
14525 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14526 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14527 ExtMask = NewExtMask;
14528 }
14529 if (Action) {
14530 Value *Vec = InVectors.front();
14531 if (InVectors.size() == 2) {
14532 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14533 InVectors.pop_back();
14534 } else {
14535 Vec = createShuffle(Vec, nullptr, CommonMask);
14536 }
14537 transformMaskAfterShuffle(CommonMask, CommonMask);
14538 assert(VF > 0 &&
14539 "Expected vector length for the final value before action.");
14540 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14541 if (VecVF < VF) {
14542 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14543 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14544 Vec = createShuffle(Vec, nullptr, ResizeMask);
14545 }
14546 Action(Vec, CommonMask);
14547 InVectors.front() = Vec;
14548 }
14549 if (!SubVectors.empty()) {
14550 Value *Vec = InVectors.front();
14551 if (InVectors.size() == 2) {
14552 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14553 InVectors.pop_back();
14554 } else {
14555 Vec = createShuffle(Vec, nullptr, CommonMask);
14556 }
14557 transformMaskAfterShuffle(CommonMask, CommonMask);
14558 auto CreateSubVectors = [&](Value *Vec,
14559 SmallVectorImpl<int> &CommonMask) {
14560 for (auto [E, Idx] : SubVectors) {
14561 Value *V = E->VectorizedValue;
14562 if (V->getType()->isIntOrIntVectorTy())
14563 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14564 if (isa<PoisonValue>(V))
14565 return false;
14566 return !isKnownNonNegative(
14567 V, SimplifyQuery(*R.DL));
14568 }));
14569 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14570 Vec = createInsertVector(
14571 Builder, Vec, V, InsertionIndex,
14572 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
14573 _3));
14574 if (!CommonMask.empty()) {
14575 std::iota(
14576 std::next(CommonMask.begin(), InsertionIndex),
14577 std::next(CommonMask.begin(),
14578 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14579 InsertionIndex);
14580 }
14581 }
14582 return Vec;
14583 };
14584 if (SubVectorsMask.empty()) {
14585 Vec = CreateSubVectors(Vec, CommonMask);
14586 } else {
14587 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14588 copy(SubVectorsMask, SVMask.begin());
14589 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14590 if (I2 != PoisonMaskElem) {
14591 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14592 I1 = I2 + CommonMask.size();
14593 }
14594 }
14595 Value *InsertVec =
14596 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14597 Vec = createShuffle(InsertVec, Vec, SVMask);
14598 transformMaskAfterShuffle(CommonMask, SVMask);
14599 }
14600 InVectors.front() = Vec;
14601 }
14602
14603 if (!ExtMask.empty()) {
14604 if (CommonMask.empty()) {
14605 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14606 } else {
14607 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14608 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14609 if (ExtMask[I] == PoisonMaskElem)
14610 continue;
14611 NewMask[I] = CommonMask[ExtMask[I]];
14612 }
14613 CommonMask.swap(NewMask);
14614 }
14615 }
14616 if (CommonMask.empty()) {
14617 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14618 return InVectors.front();
14619 }
14620 if (InVectors.size() == 2)
14621 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14622 return createShuffle(InVectors.front(), nullptr, CommonMask);
14623 }
14624
14625 ~ShuffleInstructionBuilder() {
14626 assert((IsFinalized || CommonMask.empty()) &&
14627 "Shuffle construction must be finalized.");
14628 }
14629};
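// A minimal usage sketch of ShuffleInstructionBuilder, mirroring the
// FinalShuffle helper in vectorizeOperand() below (names are those of the
// surrounding code):
// \code
// ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
// ShuffleBuilder.add(V, Mask);
// Value *Vec = ShuffleBuilder.finalize({}, SubVectors, {});
// \endcode
// Sources and masks are accumulated with add(); a single finalize() call then
// emits the minimal shufflevector sequence.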
14630
14631BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14632 unsigned NodeIdx) {
14633 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14634 InstructionsState S = getSameOpcode(VL, *TLI);
14635 // Special processing for GEPs bundle, which may include non-gep values.
14636 if (!S && VL.front()->getType()->isPointerTy()) {
14637 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14638 if (It != VL.end())
14639 S = getSameOpcode(*It, *TLI);
14640 }
14641 if (!S)
14642 return nullptr;
14643 auto CheckSameVE = [&](const TreeEntry *VE) {
14644 return VE->isSame(VL) &&
14645 (any_of(VE->UserTreeIndices,
14646 [E, NodeIdx](const EdgeInfo &EI) {
14647 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14648 }) ||
14649 any_of(VectorizableTree,
14650 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14651 return TE->isOperandGatherNode(
14652 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14653 VE->isSame(TE->Scalars);
14654 }));
14655 };
14656 TreeEntry *VE = getTreeEntry(S.getMainOp());
14657 if (VE && CheckSameVE(VE))
14658 return VE;
14659 auto It = MultiNodeScalars.find(S.getMainOp());
14660 if (It != MultiNodeScalars.end()) {
14661 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14662 return TE != VE && CheckSameVE(TE);
14663 });
14664 if (I != It->getSecond().end())
14665 return *I;
14666 }
14667 return nullptr;
14668}
14669
14670Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14671 bool PostponedPHIs) {
14672 ValueList &VL = E->getOperand(NodeIdx);
14673 const unsigned VF = VL.size();
14674 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14675 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14676 // V may be affected by MinBWs.
14677 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14678 // factor is the number of elements, not their type.
14679 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14680 unsigned NumElements = getNumElements(VL.front()->getType());
14681 ShuffleInstructionBuilder ShuffleBuilder(
14682 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14683 : ScalarTy,
14684 Builder, *this);
14685 ShuffleBuilder.add(V, Mask);
14686 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14687 E->CombinedEntriesWithIndices.size());
14688 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14689 [&](const auto &P) {
14690 return std::make_pair(VectorizableTree[P.first].get(),
14691 P.second);
14692 });
14693 assert((E->CombinedEntriesWithIndices.empty() ||
14694 E->ReorderIndices.empty()) &&
14695 "Expected either combined subnodes or reordering");
14696 return ShuffleBuilder.finalize({}, SubVectors, {});
14697 };
14698 Value *V = vectorizeTree(VE, PostponedPHIs);
14699 if (VF * getNumElements(VL[0]->getType()) !=
14700 cast<FixedVectorType>(V->getType())->getNumElements()) {
14701 if (!VE->ReuseShuffleIndices.empty()) {
14702 // Reshuffle to get only unique values.
14703 // If some of the scalars are duplicated in the vectorization
14704 // tree entry, we do not vectorize them but instead generate a
14705 // mask for the reuses. But if there are several users of the
14706 // same entry, they may have different vectorization factors.
14707 // This is especially important for PHI nodes. In this case, we
14708 // need to adapt the resulting instruction for the user
14709 // vectorization factor and have to reshuffle it again to take
14710 // only unique elements of the vector. Without this code the
14711 // function incorrectly returns reduced vector instruction with
14712 // the same elements, not with the unique ones.
14713
14714 // block:
14715 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14716 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14717 // ... (use %2)
14718 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14719 // br %block
14720 SmallVector<int> Mask(VF, PoisonMaskElem);
14721 for (auto [I, V] : enumerate(VL)) {
14722 if (isa<PoisonValue>(V))
14723 continue;
14724 Mask[I] = VE->findLaneForValue(V);
14725 }
14726 V = FinalShuffle(V, Mask);
14727 } else {
14728 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14729 "Expected vectorization factor less "
14730 "than original vector size.");
14731 SmallVector<int> UniformMask(VF, 0);
14732 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14733 V = FinalShuffle(V, UniformMask);
14734 }
14735 }
14736 // Need to update the operand gather node if the operand is actually not a
14737 // vectorized node but a buildvector/gather node that matches one of the
14738 // vectorized nodes.
14739 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14740 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14741 }) == VE->UserTreeIndices.end()) {
14742 auto *It =
14743 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14744 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14745 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14746 });
14747 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14748 (*It)->VectorizedValue = V;
14749 }
14750 return V;
14751 }
14752
14753 // Find the corresponding gather entry and vectorize it.
14754 // This allows us to be more accurate with tree/graph transformations; it
14755 // checks the correctness of the transformations in many cases.
14756 auto *I = find_if(VectorizableTree,
14757 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14758 return TE->isOperandGatherNode({E, NodeIdx});
14759 });
14760 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14761 assert(I->get()->UserTreeIndices.size() == 1 &&
14762 "Expected only single user for the gather node.");
14763 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14764 return vectorizeTree(I->get(), PostponedPHIs);
14765}
14766
14767template <typename BVTy, typename ResTy, typename... Args>
14768ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14769 Args &...Params) {
14770 assert(E->isGather() && "Expected gather node.");
14771 unsigned VF = E->getVectorFactor();
14772
14773 bool NeedFreeze = false;
14774 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14775 E->ReuseShuffleIndices.end());
14776 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14777 // Clear values, to be replaced by insertvector instructions.
14778 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14779 for_each(MutableArrayRef(GatheredScalars)
14780 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14781 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14782 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14783 E->CombinedEntriesWithIndices.size());
14784 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14785 [&](const auto &P) {
14786 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14787 });
14788 // Build a mask out of the reorder indices and reorder scalars per this
14789 // mask.
14790 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14791 E->ReorderIndices.end());
14792 if (!ReorderMask.empty())
14793 reorderScalars(GatheredScalars, ReorderMask);
14794 SmallVector<int> SubVectorsMask;
14795 inversePermutation(E->ReorderIndices, SubVectorsMask);
14796 // Transform non-clustered elements in the mask to poison (-1).
14797 // "Clustered" operations will be reordered using this mask later.
14798 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14799 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14800 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14801 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14802 } else {
14803 SubVectorsMask.clear();
14804 }
14805 SmallVector<Value *> StoredGS(GatheredScalars);
14806 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14807 unsigned I, unsigned SliceSize,
14808 bool IsNotPoisonous) {
14809 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14810 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14811 }))
14812 return false;
14813 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14814 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14815 if (UserTE->getNumOperands() != 2)
14816 return false;
14817 if (!IsNotPoisonous) {
14818 auto *It =
14819 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14820 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14821 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14822 }) != TE->UserTreeIndices.end();
14823 });
14824 if (It == VectorizableTree.end())
14825 return false;
14826 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14827 if (!(*It)->ReorderIndices.empty()) {
14828 inversePermutation((*It)->ReorderIndices, ReorderMask);
14829 reorderScalars(GS, ReorderMask);
14830 }
14831 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14832 Value *V0 = std::get<0>(P);
14833 Value *V1 = std::get<1>(P);
14834 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14835 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14836 is_contained(E->Scalars, V1));
14837 }))
14838 return false;
14839 }
14840 int Idx;
14841 if ((Mask.size() < InputVF &&
14842 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14843 Idx == 0) ||
14844 (Mask.size() == InputVF &&
14845 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14846 std::iota(
14847 std::next(Mask.begin(), I * SliceSize),
14848 std::next(Mask.begin(),
14849 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14850 0);
14851 } else {
14852 unsigned IVal =
14853 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14854 std::fill(
14855 std::next(Mask.begin(), I * SliceSize),
14856 std::next(Mask.begin(),
14857 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14858 IVal);
14859 }
14860 return true;
14861 };
14862 BVTy ShuffleBuilder(ScalarTy, Params...);
14863 ResTy Res = ResTy();
14864 SmallVector<int> Mask;
14865 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14866 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14867 Value *ExtractVecBase = nullptr;
14868 bool UseVecBaseAsInput = false;
14869 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14870 SmallVector<SmallVector<const TreeEntry *>> Entries;
14871 Type *OrigScalarTy = GatheredScalars.front()->getType();
14872 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14873 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14874 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14875 VecTy->getNumElements() % NumParts != 0 ||
14876 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14877 VecTy->getNumElements() / NumParts))
14878 NumParts = 1;
14879 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14880 // Check for gathered extracts.
14881 bool Resized = false;
14882 ExtractShuffles =
14883 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14884 if (!ExtractShuffles.empty()) {
14885 SmallVector<const TreeEntry *> ExtractEntries;
14886 for (auto [Idx, I] : enumerate(ExtractMask)) {
14887 if (I == PoisonMaskElem)
14888 continue;
14889 if (const auto *TE = getTreeEntry(
14890 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14891 ExtractEntries.push_back(TE);
14892 }
14893 if (std::optional<ResTy> Delayed =
14894 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14895 // Delay emission of gathers which are not ready yet.
14896 PostponedGathers.insert(E);
14897 // Postpone gather emission, will be emitted after the end of the
14898 // process to keep correct order.
14899 return *Delayed;
14900 }
14901 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14902 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14903 ExtractVecBase = VecBase;
14904 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14905 if (VF == VecBaseTy->getNumElements() &&
14906 GatheredScalars.size() != VF) {
14907 Resized = true;
14908 GatheredScalars.append(VF - GatheredScalars.size(),
14909 PoisonValue::get(OrigScalarTy));
14910 }
14911 }
14912 }
14913 // Gather extracts only after we check for fully matched gathers.
14914 if (!ExtractShuffles.empty() || !E->hasState() ||
14915 E->getOpcode() != Instruction::Load ||
14916 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14917 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14918 any_of(E->Scalars,
14919 [this](Value *V) {
14920 return isa<LoadInst>(V) && getTreeEntry(V);
14921 })) ||
14922 (E->hasState() && E->isAltShuffle()) ||
14923 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14924 isSplat(E->Scalars) ||
14925 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14926 GatherShuffles =
14927 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14928 }
14929 if (!GatherShuffles.empty()) {
14930 if (std::optional<ResTy> Delayed =
14931 ShuffleBuilder.needToDelay(E, Entries)) {
14932 // Delay emission of gathers which are not ready yet.
14933 PostponedGathers.insert(E);
14934 // Postpone gather emission; it will be emitted after the end of the
14935 // process to keep the correct order.
14936 return *Delayed;
14937 }
14938 if (GatherShuffles.size() == 1 &&
14939 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14940 Entries.front().front()->isSame(E->Scalars)) {
14941 // Perfect match in the graph, will reuse the previously vectorized
14942 // node. Cost is 0.
14943 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14944 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14945 // Restore the mask for previous partially matched values.
14946 Mask.resize(E->Scalars.size());
14947 const TreeEntry *FrontTE = Entries.front().front();
14948 if (FrontTE->ReorderIndices.empty() &&
14949 ((FrontTE->ReuseShuffleIndices.empty() &&
14950 E->Scalars.size() == FrontTE->Scalars.size()) ||
14951 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14952 std::iota(Mask.begin(), Mask.end(), 0);
14953 } else {
14954 for (auto [I, V] : enumerate(E->Scalars)) {
14955 if (isa<PoisonValue>(V)) {
14956 Mask[I] = PoisonMaskElem;
14957 continue;
14958 }
14959 Mask[I] = FrontTE->findLaneForValue(V);
14960 }
14961 }
14962 ShuffleBuilder.add(*FrontTE, Mask);
14963 // Full matched entry found, no need to insert subvectors.
14964 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14965 return Res;
14966 }
14967 if (!Resized) {
14968 if (GatheredScalars.size() != VF &&
14969 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14970 return any_of(TEs, [&](const TreeEntry *TE) {
14971 return TE->getVectorFactor() == VF;
14972 });
14973 }))
14974 GatheredScalars.append(VF - GatheredScalars.size(),
14975 PoisonValue::get(OrigScalarTy));
14976 }
14977 // Remove shuffled elements from list of gathers.
14978 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14979 if (Mask[I] != PoisonMaskElem)
14980 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14981 }
14982 }
14983 }
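// TryPackScalars packs the remaining scalars for a single gather: repeated
// and constant values are folded into a reuse mask (splats become a
// broadcast), and for splats with undef lanes it either reuses a
// known-non-poison scalar or requests a freeze of the final vector
// (NeedFreeze).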
14984 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14985 SmallVectorImpl<int> &ReuseMask,
14986 bool IsRootPoison) {
14987 // For splats we can emit broadcasts instead of gathers, so try to find
14988 // such sequences.
14989 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14990 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14991 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14992 SmallVector<int> UndefPos;
14993 DenseMap<Value *, unsigned> UniquePositions;
14994 // Gather unique non-const values and all constant values.
14995 // For repeated values, just shuffle them.
14996 int NumNonConsts = 0;
14997 int SinglePos = 0;
14998 for (auto [I, V] : enumerate(Scalars)) {
14999 if (isa<UndefValue>(V)) {
15000 if (!isa<PoisonValue>(V)) {
15001 ReuseMask[I] = I;
15002 UndefPos.push_back(I);
15003 }
15004 continue;
15005 }
15006 if (isConstant(V)) {
15007 ReuseMask[I] = I;
15008 continue;
15009 }
15010 ++NumNonConsts;
15011 SinglePos = I;
15012 Value *OrigV = V;
15013 Scalars[I] = PoisonValue::get(OrigScalarTy);
15014 if (IsSplat) {
15015 Scalars.front() = OrigV;
15016 ReuseMask[I] = 0;
15017 } else {
15018 const auto Res = UniquePositions.try_emplace(OrigV, I);
15019 Scalars[Res.first->second] = OrigV;
15020 ReuseMask[I] = Res.first->second;
15021 }
15022 }
15023 if (NumNonConsts == 1) {
15024 // Restore single insert element.
15025 if (IsSplat) {
15026 ReuseMask.assign(VF, PoisonMaskElem);
15027 std::swap(Scalars.front(), Scalars[SinglePos]);
15028 if (!UndefPos.empty() && UndefPos.front() == 0)
15029 Scalars.front() = UndefValue::get(OrigScalarTy);
15030 }
15031 ReuseMask[SinglePos] = SinglePos;
15032 } else if (!UndefPos.empty() && IsSplat) {
15033 // For undef values, try to replace them with the simple broadcast.
15034 // We can do it if the broadcasted value is guaranteed to be
15035 // non-poisonous, or by freezing the incoming scalar value first.
15036 auto *It = find_if(Scalars, [this, E](Value *V) {
15037 return !isa<UndefValue>(V) &&
15038 (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
15039 (E->UserTreeIndices.size() == 1 &&
15040 any_of(V->uses(), [E](const Use &U) {
15041 // Check if the value is already used in the same operation in
15042 // one of the nodes.
15043 return E->UserTreeIndices.front().EdgeIdx !=
15044 U.getOperandNo() &&
15045 is_contained(
15046 E->UserTreeIndices.front().UserTE->Scalars,
15047 U.getUser());
15048 })));
15049 });
15050 if (It != Scalars.end()) {
15051 // Replace undefs by the non-poisoned scalars and emit broadcast.
15052 int Pos = std::distance(Scalars.begin(), It);
15053 for (int I : UndefPos) {
15054 // Set the undef position to the non-poisoned scalar.
15055 ReuseMask[I] = Pos;
15056 // Replace the undef by the poison, in the mask it is replaced by
15057 // non-poisoned scalar already.
15058 if (I != Pos)
15059 Scalars[I] = PoisonValue::get(OrigScalarTy);
15060 }
15061 } else {
15062 // Replace undefs by the poisons, emit broadcast and then emit
15063 // freeze.
15064 for (int I : UndefPos) {
15065 ReuseMask[I] = PoisonMaskElem;
15066 if (isa<UndefValue>(Scalars[I]))
15067 Scalars[I] = PoisonValue::get(OrigScalarTy);
15068 }
15069 NeedFreeze = true;
15070 }
15071 }
15072 };
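// Combine the pieces: when extract- or gather-based shuffles were found,
// feed them to the shuffle builder and gather only the scalars that are
// still missing; otherwise fall back to a plain gather of the deduplicated
// scalars or of the constants.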
15073 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15074 bool IsNonPoisoned = true;
15075 bool IsUsedInExpr = true;
15076 Value *Vec1 = nullptr;
15077 if (!ExtractShuffles.empty()) {
15078 // Gather of extractelements can be represented as just a shuffle of
15079 // a single/two vectors the scalars are extracted from.
15080 // Find input vectors.
15081 Value *Vec2 = nullptr;
15082 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15083 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15084 ExtractMask[I] = PoisonMaskElem;
15085 }
15086 if (UseVecBaseAsInput) {
15087 Vec1 = ExtractVecBase;
15088 } else {
15089 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15090 if (ExtractMask[I] == PoisonMaskElem)
15091 continue;
15092 if (isa<UndefValue>(E->Scalars[I]))
15093 continue;
15094 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15095 Value *VecOp = EI->getVectorOperand();
15096 if (const auto *TE = getTreeEntry(VecOp))
15097 if (TE->VectorizedValue)
15098 VecOp = TE->VectorizedValue;
15099 if (!Vec1) {
15100 Vec1 = VecOp;
15101 } else if (Vec1 != VecOp) {
15102 assert((!Vec2 || Vec2 == VecOp) &&
15103 "Expected only 1 or 2 vectors shuffle.");
15104 Vec2 = VecOp;
15105 }
15106 }
15107 }
15108 if (Vec2) {
15109 IsUsedInExpr = false;
15110 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15111 isGuaranteedNotToBePoison(Vec2, AC);
15112 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15113 } else if (Vec1) {
15114 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15115 IsUsedInExpr &= FindReusedSplat(
15116 ExtractMask,
15117 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15118 ExtractMask.size(), IsNotPoisonedVec);
15119 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15120 IsNonPoisoned &= IsNotPoisonedVec;
15121 } else {
15122 IsUsedInExpr = false;
15123 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15124 /*ForExtracts=*/true);
15125 }
15126 }
15127 if (!GatherShuffles.empty()) {
15128 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15129 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15130 for (const auto [I, TEs] : enumerate(Entries)) {
15131 if (TEs.empty()) {
15132 assert(!GatherShuffles[I] &&
15133 "No shuffles with empty entries list expected.");
15134 continue;
15135 }
15136 assert((TEs.size() == 1 || TEs.size() == 2) &&
15137 "Expected shuffle of 1 or 2 entries.");
15138 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15139 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15140 VecMask.assign(VecMask.size(), PoisonMaskElem);
15141 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15142 if (TEs.size() == 1) {
15143 bool IsNotPoisonedVec =
15144 TEs.front()->VectorizedValue
15145 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15146 : true;
15147 IsUsedInExpr &=
15148 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15149 SliceSize, IsNotPoisonedVec);
15150 ShuffleBuilder.add(*TEs.front(), VecMask);
15151 IsNonPoisoned &= IsNotPoisonedVec;
15152 } else {
15153 IsUsedInExpr = false;
15154 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15155 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15156 IsNonPoisoned &=
15157 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15158 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15159 }
15160 }
15161 }
15162 // Try to figure out best way to combine values: build a shuffle and insert
15163 // elements or just build several shuffles.
15164 // Insert non-constant scalars.
15165 SmallVector<Value *> NonConstants(GatheredScalars);
15166 int EMSz = ExtractMask.size();
15167 int MSz = Mask.size();
15168 // Try to build a constant vector and shuffle with it only if currently we
15169 // have a single permutation and more than one scalar constant.
15170 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15171 bool IsIdentityShuffle =
15172 ((UseVecBaseAsInput ||
15173 all_of(ExtractShuffles,
15174 [](const std::optional<TTI::ShuffleKind> &SK) {
15175 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15176 TTI::SK_PermuteSingleSrc;
15177 })) &&
15178 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15179 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15180 (!GatherShuffles.empty() &&
15181 all_of(GatherShuffles,
15182 [](const std::optional<TTI::ShuffleKind> &SK) {
15183 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15184 TTI::SK_PermuteSingleSrc;
15185 }) &&
15186 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15187 ShuffleVectorInst::isIdentityMask(Mask, MSz));
15188 bool EnoughConstsForShuffle =
15189 IsSingleShuffle &&
15190 (none_of(GatheredScalars,
15191 [](Value *V) {
15192 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15193 }) ||
15194 any_of(GatheredScalars,
15195 [](Value *V) {
15196 return isa<Constant>(V) && !isa<UndefValue>(V);
15197 })) &&
15198 (!IsIdentityShuffle ||
15199 (GatheredScalars.size() == 2 &&
15200 any_of(GatheredScalars,
15201 [](Value *V) { return !isa<UndefValue>(V); })) ||
15202 count_if(GatheredScalars, [](Value *V) {
15203 return isa<Constant>(V) && !isa<PoisonValue>(V);
15204 }) > 1);
15205 // The NonConstants array contains just the non-constant values;
15206 // GatheredScalars contains only constants to build the final vector and then shuffle.
15207 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15208 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15209 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15210 else
15211 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15212 }
15213 // Generate constants for final shuffle and build a mask for them.
15214 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15215 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15216 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15217 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15218 ShuffleBuilder.add(BV, BVMask);
15219 }
15220 if (all_of(NonConstants, [=](Value *V) {
15221 return isa<PoisonValue>(V) ||
15222 (IsSingleShuffle && ((IsIdentityShuffle &&
15223 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15224 }))
15225 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15226 SubVectorsMask);
15227 else
15228 Res = ShuffleBuilder.finalize(
15229 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15230 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15231 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15232 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15233 });
15234 } else if (!allConstant(GatheredScalars)) {
15235 // Gather unique scalars and all constants.
15236 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15237 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15238 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15239 ShuffleBuilder.add(BV, ReuseMask);
15240 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15241 SubVectorsMask);
15242 } else {
15243 // Gather all constants.
15244 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15245 for (auto [I, V] : enumerate(GatheredScalars)) {
15246 if (!isa<PoisonValue>(V))
15247 Mask[I] = I;
15248 }
15249 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15250 ShuffleBuilder.add(BV, Mask);
15251 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15252 SubVectorsMask);
15253 }
15254
15255 if (NeedFreeze)
15256 Res = ShuffleBuilder.createFreeze(Res);
15257 return Res;
15258}
15259
15260Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15261 bool PostponedPHIs) {
15262 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15263 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15264 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15265 Builder, *this);
15266}
15267
15268/// \returns \p I after propagating metadata from \p VL only for instructions in
15269/// \p VL.
15270static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15271 SmallVector<Value *> Insts;
15272 for (Value *V : VL)
15273 if (isa<Instruction>(V))
15274 Insts.push_back(V);
15275 return llvm::propagateMetadata(Inst, Insts);
15276}
15277
15278Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15279 IRBuilderBase::InsertPointGuard Guard(Builder);
15280
15281 if (E->VectorizedValue &&
15282 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15283 E->isAltShuffle())) {
15284 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15285 return E->VectorizedValue;
15286 }
15287
15288 Value *V = E->Scalars.front();
15289 Type *ScalarTy = V->getType();
15290 if (!isa<CmpInst>(V))
15291 ScalarTy = getValueType(V);
15292 auto It = MinBWs.find(E);
15293 if (It != MinBWs.end()) {
15294 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15295 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15296 if (VecTy)
15297 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15298 }
15299 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15300 if (E->isGather()) {
15301 // Set insert point for non-reduction initial nodes.
15302 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15303 setInsertPointAfterBundle(E);
15304 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15305 E->VectorizedValue = Vec;
15306 return Vec;
15307 }
15308
15309 bool IsReverseOrder =
15310 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
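// FinalShuffle applies the entry's reorder/reuse shuffling to a freshly
// created vector and appends any combined subvector nodes before the value
// is recorded as the entry's vectorized result.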
15311 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15312 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15313 if (E->getOpcode() == Instruction::Store &&
15314 E->State == TreeEntry::Vectorize) {
15315 ArrayRef<int> Mask =
15316 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15317 E->ReorderIndices.size());
15318 ShuffleBuilder.add(V, Mask);
15319 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15320 ShuffleBuilder.addOrdered(V, {});
15321 } else {
15322 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15323 }
15324 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15325 E->CombinedEntriesWithIndices.size());
15326 transform(
15327 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15328 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15329 });
15330 assert(
15331 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15332 "Expected either combined subnodes or reordering");
15333 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15334 };
15335
15336 assert(!E->isGather() && "Unhandled state");
15337 unsigned ShuffleOrOp =
15338 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15339 Instruction *VL0 = E->getMainOp();
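// GetOperandSignedness decides whether operand Idx should be treated as
// signed when it has to be cast to the expected vector type (from MinBWs if
// available, otherwise by checking the scalars' known non-negativity).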
15340 auto GetOperandSignedness = [&](unsigned Idx) {
15341 const TreeEntry *OpE = getOperandEntry(E, Idx);
15342 bool IsSigned = false;
15343 auto It = MinBWs.find(OpE);
15344 if (It != MinBWs.end())
15345 IsSigned = It->second.second;
15346 else
15347 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15348 if (isa<PoisonValue>(V))
15349 return false;
15350 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15351 });
15352 return IsSigned;
15353 };
15354 switch (ShuffleOrOp) {
15355 case Instruction::PHI: {
15356 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15357 E != VectorizableTree.front().get() ||
15358 !E->UserTreeIndices.empty()) &&
15359 "PHI reordering is free.");
15360 if (PostponedPHIs && E->VectorizedValue)
15361 return E->VectorizedValue;
15362 auto *PH = cast<PHINode>(VL0);
15363 Builder.SetInsertPoint(PH->getParent(),
15364 PH->getParent()->getFirstNonPHIIt());
15365 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15366 if (PostponedPHIs || !E->VectorizedValue) {
15367 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15368 E->PHI = NewPhi;
15369 Value *V = NewPhi;
15370
15371 // Adjust insertion point once all PHI's have been generated.
15372 Builder.SetInsertPoint(PH->getParent(),
15373 PH->getParent()->getFirstInsertionPt());
15374 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15375
15376 V = FinalShuffle(V, E);
15377
15378 E->VectorizedValue = V;
15379 if (PostponedPHIs)
15380 return V;
15381 }
15382 PHINode *NewPhi = cast<PHINode>(E->PHI);
15383 // If phi node is fully emitted - exit.
15384 if (NewPhi->getNumIncomingValues() != 0)
15385 return NewPhi;
15386
15387 // PHINodes may have multiple entries from the same block. We want to
15388 // visit every block once.
15389 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15390
15391 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15393 BasicBlock *IBB = PH->getIncomingBlock(I);
15394
15395 // Stop emission if all incoming values are generated.
15396 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15397 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15398 return NewPhi;
15399 }
15400
15401 if (!VisitedBBs.insert(IBB).second) {
15402 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15403 continue;
15404 }
15405
15406 Builder.SetInsertPoint(IBB->getTerminator());
15407 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15408 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15409 if (VecTy != Vec->getType()) {
15410 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15411 MinBWs.contains(getOperandEntry(E, I))) &&
15412 "Expected item in MinBWs.");
15413 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15414 }
15415 NewPhi->addIncoming(Vec, IBB);
15416 }
15417
15418 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15419 "Invalid number of incoming values");
15420 assert(E->VectorizedValue && "Expected vectorized value.");
15421 return E->VectorizedValue;
15422 }
15423
15424 case Instruction::ExtractElement: {
15425 Value *V = E->getSingleOperand(0);
15426 if (const TreeEntry *TE = getTreeEntry(V))
15427 V = TE->VectorizedValue;
15428 setInsertPointAfterBundle(E);
15429 V = FinalShuffle(V, E);
15430 E->VectorizedValue = V;
15431 return V;
15432 }
15433 case Instruction::ExtractValue: {
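// ExtractValue from a wide load: reload the aggregate as a vector of the
// widened type directly from the original load's pointer operand.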
15434 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15435 Builder.SetInsertPoint(LI);
15436 Value *Ptr = LI->getPointerOperand();
15437 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15438 Value *NewV = ::propagateMetadata(V, E->Scalars);
15439 NewV = FinalShuffle(NewV, E);
15440 E->VectorizedValue = NewV;
15441 return NewV;
15442 }
15443 case Instruction::InsertElement: {
15444 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15445 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15446 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15447 ArrayRef<Value *> Op = E->getOperand(1);
15448 Type *ScalarTy = Op.front()->getType();
15449 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15450 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15451 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15452 assert(Res.first > 0 && "Expected item in MinBWs.");
15453 V = Builder.CreateIntCast(
15454 V,
15455 getWidenedType(
15456 ScalarTy,
15457 cast<FixedVectorType>(V->getType())->getNumElements()),
15458 Res.second);
15459 }
15460
15461 // Create InsertVector shuffle if necessary
15462 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15463 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15464 }));
15465 const unsigned NumElts =
15466 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15467 const unsigned NumScalars = E->Scalars.size();
15468
15469 unsigned Offset = *getElementIndex(VL0);
15470 assert(Offset < NumElts && "Failed to find vector index offset");
15471
15472 // Create shuffle to resize vector
15474 if (!E->ReorderIndices.empty()) {
15475 inversePermutation(E->ReorderIndices, Mask);
15476 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15477 } else {
15478 Mask.assign(NumElts, PoisonMaskElem);
15479 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15480 }
15481 // Create InsertVector shuffle if necessary
15482 bool IsIdentity = true;
15483 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15484 Mask.swap(PrevMask);
15485 for (unsigned I = 0; I < NumScalars; ++I) {
15486 Value *Scalar = E->Scalars[PrevMask[I]];
15487 unsigned InsertIdx = *getElementIndex(Scalar);
15488 IsIdentity &= InsertIdx - Offset == I;
15489 Mask[InsertIdx - Offset] = I;
15490 }
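// An explicit shuffle is needed if the inserted scalars are permuted or if
// the built vector has to be widened to the destination element count.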
15491 if (!IsIdentity || NumElts != NumScalars) {
15492 Value *V2 = nullptr;
15493 bool IsVNonPoisonous =
15494 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
15495 SmallVector<int> InsertMask(Mask);
15496 if (NumElts != NumScalars && Offset == 0) {
15497 // Follow all insert element instructions from the current buildvector
15498 // sequence.
15499 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15500 do {
15501 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15502 if (!InsertIdx)
15503 break;
15504 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15505 InsertMask[*InsertIdx] = *InsertIdx;
15506 if (!Ins->hasOneUse())
15507 break;
15508 Ins = dyn_cast_or_null<InsertElementInst>(
15509 Ins->getUniqueUndroppableUser());
15510 } while (Ins);
15511 SmallBitVector UseMask =
15512 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15513 SmallBitVector IsFirstPoison =
15514 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15515 SmallBitVector IsFirstUndef =
15516 isUndefVector(FirstInsert->getOperand(0), UseMask);
15517 if (!IsFirstPoison.all()) {
15518 unsigned Idx = 0;
15519 for (unsigned I = 0; I < NumElts; I++) {
15520 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15521 IsFirstUndef.test(I)) {
15522 if (IsVNonPoisonous) {
15523 InsertMask[I] = I < NumScalars ? I : 0;
15524 continue;
15525 }
15526 if (!V2)
15527 V2 = UndefValue::get(V->getType());
15528 if (Idx >= NumScalars)
15529 Idx = NumScalars - 1;
15530 InsertMask[I] = NumScalars + Idx;
15531 ++Idx;
15532 } else if (InsertMask[I] != PoisonMaskElem &&
15533 Mask[I] == PoisonMaskElem) {
15534 InsertMask[I] = PoisonMaskElem;
15535 }
15536 }
15537 } else {
15538 InsertMask = Mask;
15539 }
15540 }
15541 if (!V2)
15542 V2 = PoisonValue::get(V->getType());
15543 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15544 if (auto *I = dyn_cast<Instruction>(V)) {
15545 GatherShuffleExtractSeq.insert(I);
15546 CSEBlocks.insert(I->getParent());
15547 }
15548 }
15549
15550 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15551 for (unsigned I = 0; I < NumElts; I++) {
15552 if (Mask[I] != PoisonMaskElem)
15553 InsertMask[Offset + I] = I;
15554 }
15555 SmallBitVector UseMask =
15556 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15557 SmallBitVector IsFirstUndef =
15558 isUndefVector(FirstInsert->getOperand(0), UseMask);
15559 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15560 NumElts != NumScalars) {
15561 if (IsFirstUndef.all()) {
15562 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15563 SmallBitVector IsFirstPoison =
15564 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15565 if (!IsFirstPoison.all()) {
15566 for (unsigned I = 0; I < NumElts; I++) {
15567 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15568 InsertMask[I] = I + NumElts;
15569 }
15570 }
15571 V = Builder.CreateShuffleVector(
15572 V,
15573 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15574 : FirstInsert->getOperand(0),
15575 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15576 if (auto *I = dyn_cast<Instruction>(V)) {
15577 GatherShuffleExtractSeq.insert(I);
15578 CSEBlocks.insert(I->getParent());
15579 }
15580 }
15581 } else {
15582 SmallBitVector IsFirstPoison =
15583 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15584 for (unsigned I = 0; I < NumElts; I++) {
15585 if (InsertMask[I] == PoisonMaskElem)
15586 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15587 else
15588 InsertMask[I] += NumElts;
15589 }
15590 V = Builder.CreateShuffleVector(
15591 FirstInsert->getOperand(0), V, InsertMask,
15592 cast<Instruction>(E->Scalars.back())->getName());
15593 if (auto *I = dyn_cast<Instruction>(V)) {
15594 GatherShuffleExtractSeq.insert(I);
15595 CSEBlocks.insert(I->getParent());
15596 }
15597 }
15598 }
15599
15600 ++NumVectorInstructions;
15601 E->VectorizedValue = V;
15602 return V;
15603 }
15604 case Instruction::ZExt:
15605 case Instruction::SExt:
15606 case Instruction::FPToUI:
15607 case Instruction::FPToSI:
15608 case Instruction::FPExt:
15609 case Instruction::PtrToInt:
15610 case Instruction::IntToPtr:
15611 case Instruction::SIToFP:
15612 case Instruction::UIToFP:
15613 case Instruction::Trunc:
15614 case Instruction::FPTrunc:
15615 case Instruction::BitCast: {
15616 setInsertPointAfterBundle(E);
15617
15618 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15619 if (E->VectorizedValue) {
15620 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15621 return E->VectorizedValue;
15622 }
15623
15624 auto *CI = cast<CastInst>(VL0);
15625 Instruction::CastOps VecOpcode = CI->getOpcode();
15626 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15627 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
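// If either side of the cast was demoted to a narrower integer type, the
// original cast opcode may no longer be valid; recompute it from the actual
// bit widths (bitcast if equal, trunc if shrinking, sext/zext if widening).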
15628 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15629 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15630 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15631 // Check if the values are candidates to demote.
15632 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15633 if (SrcIt != MinBWs.end())
15634 SrcBWSz = SrcIt->second.first;
15635 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15636 if (BWSz == SrcBWSz) {
15637 VecOpcode = Instruction::BitCast;
15638 } else if (BWSz < SrcBWSz) {
15639 VecOpcode = Instruction::Trunc;
15640 } else if (It != MinBWs.end()) {
15641 assert(BWSz > SrcBWSz && "Invalid cast!");
15642 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15643 } else if (SrcIt != MinBWs.end()) {
15644 assert(BWSz > SrcBWSz && "Invalid cast!");
15645 VecOpcode =
15646 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15647 }
15648 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15649 !SrcIt->second.second) {
15650 VecOpcode = Instruction::UIToFP;
15651 }
15652 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15653 ? InVec
15654 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15655 V = FinalShuffle(V, E);
15656
15657 E->VectorizedValue = V;
15658 ++NumVectorInstructions;
15659 return V;
15660 }
15661 case Instruction::FCmp:
15662 case Instruction::ICmp: {
15663 setInsertPointAfterBundle(E);
15664
15665 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15666 if (E->VectorizedValue) {
15667 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15668 return E->VectorizedValue;
15669 }
15670 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15671 if (E->VectorizedValue) {
15672 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15673 return E->VectorizedValue;
15674 }
15675 if (L->getType() != R->getType()) {
15676 assert((getOperandEntry(E, 0)->isGather() ||
15677 getOperandEntry(E, 1)->isGather() ||
15678 MinBWs.contains(getOperandEntry(E, 0)) ||
15679 MinBWs.contains(getOperandEntry(E, 1))) &&
15680 "Expected item in MinBWs.");
15681 if (cast<VectorType>(L->getType())
15682 ->getElementType()
15683 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15684 ->getElementType()
15685 ->getIntegerBitWidth()) {
15686 Type *CastTy = R->getType();
15687 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15688 } else {
15689 Type *CastTy = L->getType();
15690 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15691 }
15692 }
15693
15694 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15695 Value *V = Builder.CreateCmp(P0, L, R);
15696 propagateIRFlags(V, E->Scalars, VL0);
15697 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15698 ICmp->setSameSign(/*B=*/false);
15699 // Do not cast for cmps.
15700 VecTy = cast<FixedVectorType>(V->getType());
15701 V = FinalShuffle(V, E);
15702
15703 E->VectorizedValue = V;
15704 ++NumVectorInstructions;
15705 return V;
15706 }
15707 case Instruction::Select: {
15708 setInsertPointAfterBundle(E);
15709
15710 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15711 if (E->VectorizedValue) {
15712 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15713 return E->VectorizedValue;
15714 }
15715 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15716 if (E->VectorizedValue) {
15717 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15718 return E->VectorizedValue;
15719 }
15720 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15721 if (E->VectorizedValue) {
15722 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15723 return E->VectorizedValue;
15724 }
15725 if (True->getType() != VecTy || False->getType() != VecTy) {
15726 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15727 getOperandEntry(E, 2)->isGather() ||
15728 MinBWs.contains(getOperandEntry(E, 1)) ||
15729 MinBWs.contains(getOperandEntry(E, 2))) &&
15730 "Expected item in MinBWs.");
15731 if (True->getType() != VecTy)
15732 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15733 if (False->getType() != VecTy)
15734 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15735 }
15736
15737 unsigned CondNumElements = getNumElements(Cond->getType());
15738 unsigned TrueNumElements = getNumElements(True->getType());
15739 assert(TrueNumElements >= CondNumElements &&
15740 TrueNumElements % CondNumElements == 0 &&
15741 "Cannot vectorize Instruction::Select");
15742 assert(TrueNumElements == getNumElements(False->getType()) &&
15743 "Cannot vectorize Instruction::Select");
15744 if (CondNumElements != TrueNumElements) {
15745 // When the return type is i1 but the source is fixed vector type, we
15746 // need to duplicate the condition value.
15747 Cond = Builder.CreateShuffleVector(
15748 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15749 CondNumElements));
15750 }
15751 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15752 "Cannot vectorize Instruction::Select");
15753 Value *V = Builder.CreateSelect(Cond, True, False);
15754 V = FinalShuffle(V, E);
15755
15756 E->VectorizedValue = V;
15757 ++NumVectorInstructions;
15758 return V;
15759 }
15760 case Instruction::FNeg: {
15761 setInsertPointAfterBundle(E);
15762
15763 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15764
15765 if (E->VectorizedValue) {
15766 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15767 return E->VectorizedValue;
15768 }
15769
15770 Value *V = Builder.CreateUnOp(
15771 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15772 propagateIRFlags(V, E->Scalars, VL0);
15773 if (auto *I = dyn_cast<Instruction>(V))
15774 V = ::propagateMetadata(I, E->Scalars);
15775
15776 V = FinalShuffle(V, E);
15777
15778 E->VectorizedValue = V;
15779 ++NumVectorInstructions;
15780
15781 return V;
15782 }
15783 case Instruction::Freeze: {
15784 setInsertPointAfterBundle(E);
15785
15786 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15787
15788 if (E->VectorizedValue) {
15789 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15790 return E->VectorizedValue;
15791 }
15792
15793 if (Op->getType() != VecTy) {
15794 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15795 MinBWs.contains(getOperandEntry(E, 0))) &&
15796 "Expected item in MinBWs.");
15797 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15798 }
15799 Value *V = Builder.CreateFreeze(Op);
15800 V = FinalShuffle(V, E);
15801
15802 E->VectorizedValue = V;
15803 ++NumVectorInstructions;
15804
15805 return V;
15806 }
15807 case Instruction::Add:
15808 case Instruction::FAdd:
15809 case Instruction::Sub:
15810 case Instruction::FSub:
15811 case Instruction::Mul:
15812 case Instruction::FMul:
15813 case Instruction::UDiv:
15814 case Instruction::SDiv:
15815 case Instruction::FDiv:
15816 case Instruction::URem:
15817 case Instruction::SRem:
15818 case Instruction::FRem:
15819 case Instruction::Shl:
15820 case Instruction::LShr:
15821 case Instruction::AShr:
15822 case Instruction::And:
15823 case Instruction::Or:
15824 case Instruction::Xor: {
15825 setInsertPointAfterBundle(E);
15826
15827 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15828 if (E->VectorizedValue) {
15829 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15830 return E->VectorizedValue;
15831 }
15832 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15833 if (E->VectorizedValue) {
15834 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15835 return E->VectorizedValue;
15836 }
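// Special case for 'and' with demoted operands: if one operand consists of
// constants whose low demoted bits are all ones, the mask is a no-op at the
// minimized width and the other operand can be reused directly.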
15837 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15838 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15839 ArrayRef<Value *> Ops = E->getOperand(I);
15840 if (all_of(Ops, [&](Value *Op) {
15841 auto *CI = dyn_cast<ConstantInt>(Op);
15842 return CI && CI->getValue().countr_one() >= It->second.first;
15843 })) {
15844 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15845 E->VectorizedValue = V;
15846 ++NumVectorInstructions;
15847 return V;
15848 }
15849 }
15850 }
15851 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15852 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15853 getOperandEntry(E, 1)->isGather() ||
15854 MinBWs.contains(getOperandEntry(E, 0)) ||
15855 MinBWs.contains(getOperandEntry(E, 1))) &&
15856 "Expected item in MinBWs.");
15857 if (LHS->getType() != VecTy)
15858 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15859 if (RHS->getType() != VecTy)
15860 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15861 }
15862
15863 Value *V = Builder.CreateBinOp(
15864 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15865 RHS);
15866 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15867 if (auto *I = dyn_cast<Instruction>(V)) {
15868 V = ::propagateMetadata(I, E->Scalars);
15869 // Drop nuw flags for abs(sub(commutative), true).
15870 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15871 any_of(E->Scalars, [](Value *V) {
15872 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15873 }))
15874 I->setHasNoUnsignedWrap(/*b=*/false);
15875 }
15876
15877 V = FinalShuffle(V, E);
15878
15879 E->VectorizedValue = V;
15880 ++NumVectorInstructions;
15881
15882 return V;
15883 }
15884 case Instruction::Load: {
15885 // Loads are inserted at the head of the tree because we don't want to
15886 // sink them all the way down past store instructions.
15887 setInsertPointAfterBundle(E);
15888
15889 LoadInst *LI = cast<LoadInst>(VL0);
15890 Instruction *NewLI;
15891 Value *PO = LI->getPointerOperand();
15892 if (E->State == TreeEntry::Vectorize) {
15893 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15894 } else if (E->State == TreeEntry::StridedVectorize) {
15895 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15896 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15897 PO = IsReverseOrder ? PtrN : Ptr0;
15898 std::optional<int> Diff = getPointersDiff(
15899 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15900 Type *StrideTy = DL->getIndexType(PO->getType());
15901 Value *StrideVal;
15902 if (Diff) {
15903 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15904 StrideVal =
15905 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15906 DL->getTypeAllocSize(ScalarTy));
15907 } else {
15908 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15909 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15910 return cast<LoadInst>(V)->getPointerOperand();
15911 });
15912 OrdersType Order;
15913 std::optional<Value *> Stride =
15914 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15915 &*Builder.GetInsertPoint());
15916 Value *NewStride =
15917 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15918 StrideVal = Builder.CreateMul(
15919 NewStride,
15920 ConstantInt::get(
15921 StrideTy,
15922 (IsReverseOrder ? -1 : 1) *
15923 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15924 }
15925 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
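// Illustrative only: for a bundle of four i32 loads this emits roughly
//   call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//       ptr align A %base, i64 S, <4 x i1> splat (i1 true), i32 4)
// where S is the element stride in bytes (negative when the order is
// reversed) and the common alignment computed above is attached to the
// pointer argument.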
15926 auto *Inst = Builder.CreateIntrinsic(
15927 Intrinsic::experimental_vp_strided_load,
15928 {VecTy, PO->getType(), StrideTy},
15929 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15930 Builder.getInt32(E->Scalars.size())});
15931 Inst->addParamAttr(
15932 /*ArgNo=*/0,
15933 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15934 NewLI = Inst;
15935 } else {
15936 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15937 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15938 if (E->VectorizedValue) {
15939 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15940 return E->VectorizedValue;
15941 }
15942 if (isa<FixedVectorType>(ScalarTy)) {
15943 assert(SLPReVec && "FixedVectorType is not expected.");
15944 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
15945 // to expand VecPtr if ScalarTy is a vector type.
15946 unsigned ScalarTyNumElements =
15947 cast<FixedVectorType>(ScalarTy)->getNumElements();
15948 unsigned VecTyNumElements =
15949 cast<FixedVectorType>(VecTy)->getNumElements();
15950 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15951 "Cannot expand getelementptr.");
15952 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15953 SmallVector<Constant *> Indices(VecTyNumElements);
15954 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15955 return Builder.getInt64(I % ScalarTyNumElements);
15956 });
15957 VecPtr = Builder.CreateGEP(
15958 VecTy->getElementType(),
15959 Builder.CreateShuffleVector(
15960 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15961 ConstantVector::get(Indices));
15962 }
15963 // Use the minimum alignment of the gathered loads.
15964 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15965 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15966 }
15967 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15968
15969 V = FinalShuffle(V, E);
15970 E->VectorizedValue = V;
15971 ++NumVectorInstructions;
15972 return V;
15973 }
15974 case Instruction::Store: {
15975 auto *SI = cast<StoreInst>(VL0);
15976
15977 setInsertPointAfterBundle(E);
15978
15979 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15980 if (VecValue->getType() != VecTy)
15981 VecValue =
15982 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15983 VecValue = FinalShuffle(VecValue, E);
15984
15985 Value *Ptr = SI->getPointerOperand();
15986 Instruction *ST;
15987 if (E->State == TreeEntry::Vectorize) {
15988 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15989 } else {
15990 assert(E->State == TreeEntry::StridedVectorize &&
15991 "Expected either strided or consecutive stores.");
15992 if (!E->ReorderIndices.empty()) {
15993 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15994 Ptr = SI->getPointerOperand();
15995 }
15996 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15997 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
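// Illustrative only: this emits a negative-stride VP store, roughly
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//       <4 x i32> %vec, ptr align A %base, i64 -S,
//       <4 x i1> splat (i1 true), i32 4)
// so lanes are written at decreasing addresses starting from the pointer of
// the first scalar in the reordered bundle.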
15998 auto *Inst = Builder.CreateIntrinsic(
15999 Intrinsic::experimental_vp_strided_store,
16000 {VecTy, Ptr->getType(), StrideTy},
16001 {VecValue, Ptr,
16002 ConstantInt::get(
16003 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
16004 Builder.getAllOnesMask(VecTy->getElementCount()),
16005 Builder.getInt32(E->Scalars.size())});
16006 Inst->addParamAttr(
16007 /*ArgNo=*/1,
16008 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
16009 ST = Inst;
16010 }
16011
16012 Value *V = ::propagateMetadata(ST, E->Scalars);
16013
16014 E->VectorizedValue = V;
16015 ++NumVectorInstructions;
16016 return V;
16017 }
16018 case Instruction::GetElementPtr: {
16019 auto *GEP0 = cast<GetElementPtrInst>(VL0);
16020 setInsertPointAfterBundle(E);
16021
16022 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
16023 if (E->VectorizedValue) {
16024 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16025 return E->VectorizedValue;
16026 }
16027
16028 SmallVector<Value *> OpVecs;
16029 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
16030 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16031 if (E->VectorizedValue) {
16032 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16033 return E->VectorizedValue;
16034 }
16035 OpVecs.push_back(OpVec);
16036 }
16037
16038 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16039 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16040 SmallVector<Value *> GEPs;
16041 for (Value *V : E->Scalars) {
16042 if (isa<GetElementPtrInst>(V))
16043 GEPs.push_back(V);
16044 }
16045 V = ::propagateMetadata(I, GEPs);
16046 }
16047
16048 V = FinalShuffle(V, E);
16049
16050 E->VectorizedValue = V;
16051 ++NumVectorInstructions;
16052
16053 return V;
16054 }
16055 case Instruction::Call: {
16056 CallInst *CI = cast<CallInst>(VL0);
16057 setInsertPointAfterBundle(E);
16058
16059 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16060
16061 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
16062 CI, ID, VecTy->getNumElements(),
16063 It != MinBWs.end() ? It->second.first : 0, TTI);
16064 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16065 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16066 VecCallCosts.first <= VecCallCosts.second;
16067
16068 Value *ScalarArg = nullptr;
16069 SmallVector<Value *> OpVecs;
16070 SmallVector<Type *, 2> TysForDecl;
16071 // Add return type if intrinsic is overloaded on it.
16072 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16073 TysForDecl.push_back(VecTy);
16074 auto *CEI = cast<CallInst>(VL0);
16075 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16076 ValueList OpVL;
16077 // Some intrinsics have scalar arguments. This argument should not be
16078 // vectorized.
16079 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16080 ScalarArg = CEI->getArgOperand(I);
16081 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16082 // argument must be set to false (do not return poison if the value is the signed minimum).
16083 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16084 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16085 ScalarArg = Builder.getFalse();
16086 OpVecs.push_back(ScalarArg);
16087 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16088 TysForDecl.push_back(ScalarArg->getType());
16089 continue;
16090 }
16091
16092 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16093 if (E->VectorizedValue) {
16094 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16095 return E->VectorizedValue;
16096 }
16097 ScalarArg = CEI->getArgOperand(I);
16098 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16099 ScalarArg->getType()->getScalarType() &&
16100 It == MinBWs.end()) {
16101 auto *CastTy =
16102 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16103 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16104 } else if (It != MinBWs.end()) {
16105 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16106 }
16107 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16108 OpVecs.push_back(OpVec);
16109 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16110 TysForDecl.push_back(OpVec->getType());
16111 }
16112
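// Materialize the callee: either the target's vector library function found
// via VFDatabase (when there is no vector intrinsic or the cost model
// preferred the library call) or the declaration of the vector intrinsic.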
16113 Function *CF;
16114 if (!UseIntrinsic) {
16115 VFShape Shape =
16116 VFShape::get(CI->getFunctionType(),
16117 ElementCount::getFixed(
16118 static_cast<unsigned>(VecTy->getNumElements())),
16119 false /*HasGlobalPred*/);
16120 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16121 } else {
16122 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16123 }
16124
16125 SmallVector<OperandBundleDef, 1> OpBundles;
16126 CI->getOperandBundlesAsDefs(OpBundles);
16127 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16128
16129 propagateIRFlags(V, E->Scalars, VL0);
16130 V = FinalShuffle(V, E);
16131
16132 E->VectorizedValue = V;
16133 ++NumVectorInstructions;
16134 return V;
16135 }
16136 case Instruction::ShuffleVector: {
16137 Value *V;
16138 if (SLPReVec && !E->isAltShuffle()) {
16139 setInsertPointAfterBundle(E);
16140 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16141 if (E->VectorizedValue) {
16142 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16143 return E->VectorizedValue;
16144 }
16145 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16146 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16147 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16148 "Not supported shufflevector usage.");
16149 SmallVector<int> NewMask(ThisMask.size());
16150 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16151 return SVSrc->getShuffleMask()[Mask];
16152 });
16153 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16154 } else {
16155 V = Builder.CreateShuffleVector(Src, ThisMask);
16156 }
16157 propagateIRFlags(V, E->Scalars, VL0);
16158 if (auto *I = dyn_cast<Instruction>(V))
16159 V = ::propagateMetadata(I, E->Scalars);
16160 V = FinalShuffle(V, E);
16161 } else {
16162 assert(E->isAltShuffle() &&
16163 ((Instruction::isBinaryOp(E->getOpcode()) &&
16164 Instruction::isBinaryOp(E->getAltOpcode())) ||
16165 (Instruction::isCast(E->getOpcode()) &&
16166 Instruction::isCast(E->getAltOpcode())) ||
16167 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16168 "Invalid Shuffle Vector Operand");
16169
16170 Value *LHS = nullptr, *RHS = nullptr;
16171 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16172 setInsertPointAfterBundle(E);
16173 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16174 if (E->VectorizedValue) {
16175 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16176 return E->VectorizedValue;
16177 }
16178 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16179 } else {
16180 setInsertPointAfterBundle(E);
16181 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16182 }
16183 if (E->VectorizedValue) {
16184 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16185 return E->VectorizedValue;
16186 }
16187 if (LHS && RHS &&
16188 ((Instruction::isBinaryOp(E->getOpcode()) &&
16189 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16190 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16191 assert((It != MinBWs.end() ||
16192 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16193 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16194 MinBWs.contains(getOperandEntry(E, 0)) ||
16195 MinBWs.contains(getOperandEntry(E, 1))) &&
16196 "Expected item in MinBWs.");
16197 Type *CastTy = VecTy;
16198 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16199 if (cast<VectorType>(LHS->getType())
16200 ->getElementType()
16201 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16202 ->getElementType()
16203 ->getIntegerBitWidth())
16204 CastTy = RHS->getType();
16205 else
16206 CastTy = LHS->getType();
16207 }
16208 if (LHS->getType() != CastTy)
16209 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16210 if (RHS->getType() != CastTy)
16211 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16212 }
16213
16214 Value *V0, *V1;
16215 if (Instruction::isBinaryOp(E->getOpcode())) {
16216 V0 = Builder.CreateBinOp(
16217 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16218 V1 = Builder.CreateBinOp(
16219 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16220 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16221 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16222 auto *AltCI = cast<CmpInst>(E->getAltOp());
16223 CmpInst::Predicate AltPred = AltCI->getPredicate();
16224 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16225 } else {
16226 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16227 unsigned SrcBWSz = DL->getTypeSizeInBits(
16228 cast<VectorType>(LHS->getType())->getElementType());
16229 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16230 if (BWSz <= SrcBWSz) {
16231 if (BWSz < SrcBWSz)
16232 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16233 assert(LHS->getType() == VecTy &&
16234 "Expected same type as operand.");
16235 if (auto *I = dyn_cast<Instruction>(LHS))
16236 LHS = ::propagateMetadata(I, E->Scalars);
16237 LHS = FinalShuffle(LHS, E);
16238 E->VectorizedValue = LHS;
16239 ++NumVectorInstructions;
16240 return LHS;
16241 }
16242 }
16243 V0 = Builder.CreateCast(
16244 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16245 V1 = Builder.CreateCast(
16246 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16247 }
16248 // Add V0 and V1 to later analysis to try to find and remove matching
16249 // instruction, if any.
16250 for (Value *V : {V0, V1}) {
16251 if (auto *I = dyn_cast<Instruction>(V)) {
16252 GatherShuffleExtractSeq.insert(I);
16253 CSEBlocks.insert(I->getParent());
16254 }
16255 }
16256
16257 // Create shuffle to take alternate operations from the vector.
16258 // Also, gather up main and alt scalar ops to propagate IR flags to
16259 // each vector operation.
16260 ValueList OpScalars, AltScalars;
16261 SmallVector<int> Mask;
16262 E->buildAltOpShuffleMask(
16263 [E, this](Instruction *I) {
16264 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16265 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16266 *TLI);
16267 },
16268 Mask, &OpScalars, &AltScalars);
16269
16270 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16271 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16272 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16273 // Drop nuw flags for abs(sub(commutative), true).
16274 if (auto *I = dyn_cast<Instruction>(Vec);
16275 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16276 any_of(E->Scalars, [](Value *V) {
16277 if (isa<PoisonValue>(V))
16278 return false;
16279 auto *IV = cast<Instruction>(V);
16280 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16281 }))
16282 I->setHasNoUnsignedWrap(/*b=*/false);
16283 };
16284 DropNuwFlag(V0, E->getOpcode());
16285 DropNuwFlag(V1, E->getAltOpcode());
16286
16287 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16288 assert(SLPReVec && "FixedVectorType is not expected.");
16289 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16290 }
16291 V = Builder.CreateShuffleVector(V0, V1, Mask);
16292 if (auto *I = dyn_cast<Instruction>(V)) {
16293 V = ::propagateMetadata(I, E->Scalars);
16294 GatherShuffleExtractSeq.insert(I);
16295 CSEBlocks.insert(I->getParent());
16296 }
16297 }
16298
16299 E->VectorizedValue = V;
16300 ++NumVectorInstructions;
16301
16302 return V;
16303 }
16304 default:
16305 llvm_unreachable("unknown inst");
16306 }
16307 return nullptr;
16308}
16309
16310Value *BoUpSLP::vectorizeTree() {
16311 ExtraValueToDebugLocsMap ExternallyUsedValues;
16312 return vectorizeTree(ExternallyUsedValues);
16313}
16314
16315Value *
16316BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16317 Instruction *ReductionRoot) {
16318 // All blocks must be scheduled before any instructions are inserted.
16319 for (auto &BSIter : BlocksSchedules) {
16320 scheduleBlock(BSIter.second.get());
16321 }
16322 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
16323 // need to rebuild it.
16324 EntryToLastInstruction.clear();
16325
16326 if (ReductionRoot)
16327 Builder.SetInsertPoint(ReductionRoot->getParent(),
16328 ReductionRoot->getIterator());
16329 else
16330 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16331
16332 // Emit gathered loads first to emit better code for the users of those
16333 // gathered loads.
16334 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16335 if (GatheredLoadsEntriesFirst.has_value() &&
16336 TE->Idx >= *GatheredLoadsEntriesFirst &&
16337 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16338 assert((!TE->UserTreeIndices.empty() ||
16339 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16340 "Expected gathered load node.");
16341 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16342 }
16343 }
16344 // Postpone emission of PHI operands to avoid cyclic dependency issues.
16345 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16346 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16347 if (TE->State == TreeEntry::Vectorize &&
16348 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16349 TE->VectorizedValue)
16350 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16351 // Run through the list of postponed gathers and emit them, replacing the temp
16352 // emitted allocas with actual vector instructions.
16353 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16354 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16355 for (const TreeEntry *E : PostponedNodes) {
16356 auto *TE = const_cast<TreeEntry *>(E);
16357 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16358 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16359 TE->UserTreeIndices.front().EdgeIdx)) &&
16360 VecTE->isSame(TE->Scalars))
16361 // Found gather node which is absolutely the same as one of the
16362 // vectorized nodes. It may happen after reordering.
16363 continue;
16364 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16365 TE->VectorizedValue = nullptr;
16366 auto *UserI =
16367 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16368 // If the user is a PHI node, its vector code has to be inserted right before
16369 // the block terminator. Since the node was delayed, there were some
16370 // unresolved dependencies at the moment the stub instruction was emitted. If
16371 // any of these dependencies turns out to be an operand of another PHI coming
16372 // from this same block, the position of the stub instruction becomes
16373 // invalid, because the source vector that is supposed to feed this gather
16374 // node was inserted at the end of the block [after the stub instruction]. So
16375 // we need to adjust the insertion point again to the end of the block.
16376 if (isa<PHINode>(UserI)) {
16377 // Insert before all users.
16378 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16379 for (User *U : PrevVec->users()) {
16380 if (U == UserI)
16381 continue;
16382 auto *UI = dyn_cast<Instruction>(U);
16383 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16384 continue;
16385 if (UI->comesBefore(InsertPt))
16386 InsertPt = UI;
16387 }
16388 Builder.SetInsertPoint(InsertPt);
16389 } else {
16390 Builder.SetInsertPoint(PrevVec);
16391 }
16392 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16393 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16394 if (auto *VecI = dyn_cast<Instruction>(Vec);
16395 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16396 Builder.GetInsertPoint()->comesBefore(VecI))
16397 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16398 Builder.GetInsertPoint());
16399 if (Vec->getType() != PrevVec->getType()) {
16400 assert(Vec->getType()->isIntOrIntVectorTy() &&
16401 PrevVec->getType()->isIntOrIntVectorTy() &&
16402 "Expected integer vector types only.");
16403 std::optional<bool> IsSigned;
16404 for (Value *V : TE->Scalars) {
16405 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16406 auto It = MinBWs.find(BaseTE);
16407 if (It != MinBWs.end()) {
16408 IsSigned = IsSigned.value_or(false) || It->second.second;
16409 if (*IsSigned)
16410 break;
16411 }
16412 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16413 auto It = MinBWs.find(MNTE);
16414 if (It != MinBWs.end()) {
16415 IsSigned = IsSigned.value_or(false) || It->second.second;
16416 if (*IsSigned)
16417 break;
16418 }
16419 }
16420 if (IsSigned.value_or(false))
16421 break;
16422 // Scan through gather nodes.
16423 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16424 auto It = MinBWs.find(BVE);
16425 if (It != MinBWs.end()) {
16426 IsSigned = IsSigned.value_or(false) || It->second.second;
16427 if (*IsSigned)
16428 break;
16429 }
16430 }
16431 if (IsSigned.value_or(false))
16432 break;
16433 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16434 IsSigned =
16435 IsSigned.value_or(false) ||
16436 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16437 continue;
16438 }
16439 if (IsSigned.value_or(false))
16440 break;
16441 }
16442 }
16443 if (IsSigned.value_or(false)) {
16444 // Final attempt - check user node.
16445 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16446 if (It != MinBWs.end())
16447 IsSigned = It->second.second;
16448 }
16449 assert(IsSigned &&
16450 "Expected user node or perfect diamond match in MinBWs.");
16451 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16452 }
16453 PrevVec->replaceAllUsesWith(Vec);
16454 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16455 // Replace the stub vector node, if it was used before for one of the
16456 // buildvector nodes already.
16457 auto It = PostponedValues.find(PrevVec);
16458 if (It != PostponedValues.end()) {
16459 for (TreeEntry *VTE : It->getSecond())
16460 VTE->VectorizedValue = Vec;
16461 }
16462 eraseInstruction(PrevVec);
16463 }
16464
16465 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16466 << " values .\n");
16467
16468 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16469 // Maps vector instruction to original insertelement instruction
16470 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16471 // Maps extract Scalar to the corresponding extractelement instruction in the
16472 // basic block. Only one extractelement per block should be emitted.
16473 DenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16474 ScalarToEEs;
16475 SmallDenseSet<Value *, 4> UsedInserts;
16476 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
16477 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16478 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16479 // Extract all of the elements with the external uses.
16480 for (const auto &ExternalUse : ExternalUses) {
16481 Value *Scalar = ExternalUse.Scalar;
16482 llvm::User *User = ExternalUse.User;
16483
16484 // Skip users that we already RAUW. This happens when one instruction
16485 // has multiple uses of the same value.
16486 if (User && !is_contained(Scalar->users(), User))
16487 continue;
16488 TreeEntry *E = getTreeEntry(Scalar);
16489 assert(E && "Invalid scalar");
16490 assert(!E->isGather() && "Extracting from a gather list");
16491 // Non-instruction pointers are not deleted, just skip them.
16492 if (E->getOpcode() == Instruction::GetElementPtr &&
16493 !isa<GetElementPtrInst>(Scalar))
16494 continue;
16495
16496 Value *Vec = E->VectorizedValue;
16497 assert(Vec && "Can't find vectorizable value");
16498
16499 Value *Lane = Builder.getInt32(ExternalUse.Lane);
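 // Emits (or reuses) an extractelement for Scalar from the vectorized value
 // and, if the scalar was demoted to a smaller integer type, a cast back to
 // the original scalar type. Returns the value to use in place of Scalar.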
16500 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16501 if (Scalar->getType() != Vec->getType()) {
16502 Value *Ex = nullptr;
16503 Value *ExV = nullptr;
16504 auto *Inst = dyn_cast<Instruction>(Scalar);
16505 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16506 auto It = ScalarToEEs.find(Scalar);
16507 if (It != ScalarToEEs.end()) {
16508 // No need to emit many extracts, just move the only one in the
16509 // current block.
16510 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16511 : Builder.GetInsertBlock());
16512 if (EEIt != It->second.end()) {
16513 Value *PrevV = EEIt->second.first;
16514 if (auto *I = dyn_cast<Instruction>(PrevV);
16515 I && !ReplaceInst &&
16516 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16517 Builder.GetInsertPoint()->comesBefore(I)) {
16518 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16519 Builder.GetInsertPoint());
16520 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16521 CI->moveAfter(I);
16522 }
16523 Ex = PrevV;
16524 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16525 }
16526 }
16527 if (!Ex) {
16528 // "Reuse" the existing extract to improve final codegen.
16529 if (ReplaceInst) {
16530 // Leave the instruction as is if extracting this way is cheaper and all
16531 // operands are scalar.
16532 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16533 IgnoredExtracts.insert(EE);
16534 Ex = EE;
16535 } else {
16536 auto *CloneInst = Inst->clone();
16537 CloneInst->insertBefore(Inst->getIterator());
16538 if (Inst->hasName())
16539 CloneInst->takeName(Inst);
16540 Ex = CloneInst;
16541 }
16542 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16543 ES && isa<Instruction>(Vec)) {
16544 Value *V = ES->getVectorOperand();
16545 auto *IVec = cast<Instruction>(Vec);
16546 if (const TreeEntry *ETE = getTreeEntry(V))
16547 V = ETE->VectorizedValue;
16548 if (auto *IV = dyn_cast<Instruction>(V);
16549 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16550 IV->comesBefore(IVec))
16551 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16552 else
16553 Ex = Builder.CreateExtractElement(Vec, Lane);
16554 } else if (auto *VecTy =
16555 dyn_cast<FixedVectorType>(Scalar->getType())) {
16556 assert(SLPReVec && "FixedVectorType is not expected.");
16557 unsigned VecTyNumElements = VecTy->getNumElements();
16558 // When REVEC is enabled, we need to extract a vector.
16559 // Note: The element size of Scalar may be different from the
16560 // element size of Vec.
16561 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
16562 ExternalUse.Lane * VecTyNumElements);
16563 } else {
16564 Ex = Builder.CreateExtractElement(Vec, Lane);
16565 }
16566 // If necessary, sign-extend or zero-extend the extracted value
16567 // back to the original (larger) scalar type.
16568 ExV = Ex;
16569 if (Scalar->getType() != Ex->getType())
16570 ExV = Builder.CreateIntCast(
16571 Ex, Scalar->getType(),
16572 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16573 auto *I = dyn_cast<Instruction>(Ex);
16574 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16575 : &F->getEntryBlock(),
16576 std::make_pair(Ex, ExV));
16577 }
16578 // The 'then' branch of the previous if may produce constants, since the
16579 // 0 operand might be a constant.
16580 if (auto *ExI = dyn_cast<Instruction>(Ex);
16581 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16582 GatherShuffleExtractSeq.insert(ExI);
16583 CSEBlocks.insert(ExI->getParent());
16584 }
16585 return ExV;
16586 }
16587 assert(isa<FixedVectorType>(Scalar->getType()) &&
16588 isa<InsertElementInst>(Scalar) &&
16589 "In-tree scalar of vector type is not insertelement?");
16590 auto *IE = cast<InsertElementInst>(Scalar);
16591 VectorToInsertElement.try_emplace(Vec, IE);
16592 return Vec;
16593 };
16594 // If User == nullptr, the Scalar remains as scalar in vectorized
16595 // instructions or is used as extra arg. Generate ExtractElement instruction
16596 // and update the record for this scalar in ExternallyUsedValues.
16597 if (!User) {
16598 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16599 continue;
16600 assert((ExternallyUsedValues.count(Scalar) ||
16601 Scalar->hasNUsesOrMore(UsesLimit) ||
16602 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16603 any_of(Scalar->users(),
16604 [&](llvm::User *U) {
16605 if (ExternalUsesAsOriginalScalar.contains(U))
16606 return true;
16607 TreeEntry *UseEntry = getTreeEntry(U);
16608 return UseEntry &&
16609 (UseEntry->State == TreeEntry::Vectorize ||
16610 UseEntry->State ==
16611 TreeEntry::StridedVectorize) &&
16612 (E->State == TreeEntry::Vectorize ||
16613 E->State == TreeEntry::StridedVectorize) &&
16614 doesInTreeUserNeedToExtract(
16615 Scalar, getRootEntryInstruction(*UseEntry),
16616 TLI, TTI);
16617 })) &&
16618 "Scalar with nullptr User must be registered in "
16619 "ExternallyUsedValues map or remain as scalar in vectorized "
16620 "instructions");
16621 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16622 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16623 if (PHI->getParent()->isLandingPad())
16624 Builder.SetInsertPoint(
16625 PHI->getParent(),
16626 std::next(
16627 PHI->getParent()->getLandingPadInst()->getIterator()));
16628 else
16629 Builder.SetInsertPoint(PHI->getParent(),
16630 PHI->getParent()->getFirstNonPHIIt());
16631 } else {
16632 Builder.SetInsertPoint(VecI->getParent(),
16633 std::next(VecI->getIterator()));
16634 }
16635 } else {
16636 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16637 }
16638 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16639 // Required to update internally referenced instructions.
16640 if (Scalar != NewInst) {
16641 assert((!isa<ExtractElementInst>(Scalar) ||
16642 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16643 "Extractelements should not be replaced.");
16644 Scalar->replaceAllUsesWith(NewInst);
16645 }
16646 continue;
16647 }
16648
16649 if (auto *VU = dyn_cast<InsertElementInst>(User);
16650 VU && VU->getOperand(1) == Scalar) {
16651 // Skip if the scalar is another vector op or Vec is not an instruction.
16652 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16653 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16654 if (!UsedInserts.insert(VU).second)
16655 continue;
16656 // Need to use original vector, if the root is truncated.
16657 auto BWIt = MinBWs.find(E);
16658 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16659 auto *ScalarTy = FTy->getElementType();
16660 auto Key = std::make_pair(Vec, ScalarTy);
16661 auto VecIt = VectorCasts.find(Key);
16662 if (VecIt == VectorCasts.end()) {
16663 IRBuilderBase::InsertPointGuard Guard(Builder);
16664 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16665 if (IVec->getParent()->isLandingPad())
16666 Builder.SetInsertPoint(IVec->getParent(),
16667 std::next(IVec->getParent()
16668 ->getLandingPadInst()
16669 ->getIterator()));
16670 else
16671 Builder.SetInsertPoint(
16672 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16673 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16674 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16675 }
16676 Vec = Builder.CreateIntCast(
16677 Vec,
16678 getWidenedType(
16679 ScalarTy,
16680 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16681 BWIt->second.second);
16682 VectorCasts.try_emplace(Key, Vec);
16683 } else {
16684 Vec = VecIt->second;
16685 }
16686 }
16687
16688 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16689 if (InsertIdx) {
16690 auto *It = find_if(
16691 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16692 // Checks if 2 insertelements are from the same buildvector.
16693 InsertElementInst *VecInsert = Data.InsertElements.front();
16694 return areTwoInsertFromSameBuildVector(
16695 VU, VecInsert,
16696 [](InsertElementInst *II) { return II->getOperand(0); });
16697 });
16698 unsigned Idx = *InsertIdx;
16699 if (It == ShuffledInserts.end()) {
16700 (void)ShuffledInserts.emplace_back();
16701 It = std::next(ShuffledInserts.begin(),
16702 ShuffledInserts.size() - 1);
16703 }
16704 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16705 if (Mask.empty())
16706 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16707 Mask[Idx] = ExternalUse.Lane;
16708 It->InsertElements.push_back(cast<InsertElementInst>(User));
16709 continue;
16710 }
16711 }
16712 }
16713 }
16714
16715 // Generate extracts for out-of-tree users.
16716 // Find the insertion point for the extractelement lane.
16717 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16718 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16719 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16720 if (PH->getIncomingValue(I) == Scalar) {
16721 Instruction *IncomingTerminator =
16722 PH->getIncomingBlock(I)->getTerminator();
16723 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16724 Builder.SetInsertPoint(VecI->getParent(),
16725 std::next(VecI->getIterator()));
16726 } else {
16727 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16728 }
16729 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16730 PH->setOperand(I, NewInst);
16731 }
16732 }
16733 } else {
16734 Builder.SetInsertPoint(cast<Instruction>(User));
16735 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16736 User->replaceUsesOfWith(Scalar, NewInst);
16737 }
16738 } else {
16739 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16740 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16741 User->replaceUsesOfWith(Scalar, NewInst);
16742 }
16743
16744 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16745 }
16746
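 // Creates a shuffle of V1/V2 for the given combined mask: indices below the
 // width of V1 select from V1, larger indices select from V2 (rebased by VF).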
16747 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16748 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16749 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16750 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16751 for (int I = 0, E = Mask.size(); I < E; ++I) {
16752 if (Mask[I] < VF)
16753 CombinedMask1[I] = Mask[I];
16754 else
16755 CombinedMask2[I] = Mask[I] - VF;
16756 }
16757 ShuffleInstructionBuilder ShuffleBuilder(
16758 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16759 ShuffleBuilder.add(V1, CombinedMask1);
16760 if (V2)
16761 ShuffleBuilder.add(V2, CombinedMask2);
16762 return ShuffleBuilder.finalize({}, {}, {});
16763 };
16764
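 // Brings Vec to the vector factor of Mask: if the mask indexes lanes at or
 // beyond the mask width, the whole mask is applied directly (and the second
 // member of the returned pair is true); otherwise the vector is merely
 // resized with an identity-like mask.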
16765 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16766 bool ForSingleMask) {
16767 unsigned VF = Mask.size();
16768 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16769 if (VF != VecVF) {
16770 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16771 Vec = CreateShuffle(Vec, nullptr, Mask);
16772 return std::make_pair(Vec, true);
16773 }
16774 if (!ForSingleMask) {
16775 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16776 for (unsigned I = 0; I < VF; ++I) {
16777 if (Mask[I] != PoisonMaskElem)
16778 ResizeMask[Mask[I]] = Mask[I];
16779 }
16780 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16781 }
16782 }
16783
16784 return std::make_pair(Vec, false);
16785 };
16786 // Perform shuffling of the vectorized tree entries for better handling of
16787 // external extracts.
16788 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16789 // Find the first and the last instruction in the list of insertelements.
16790 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16791 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16792 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16793 Builder.SetInsertPoint(LastInsert);
16794 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16795 Value *NewInst = performExtractsShuffleAction<Value>(
16796 MutableArrayRef(Vector.data(), Vector.size()),
16797 FirstInsert->getOperand(0),
16798 [](Value *Vec) {
16799 return cast<VectorType>(Vec->getType())
16800 ->getElementCount()
16801 .getKnownMinValue();
16802 },
16803 ResizeToVF,
16804 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16805 ArrayRef<Value *> Vals) {
16806 assert((Vals.size() == 1 || Vals.size() == 2) &&
16807 "Expected exactly 1 or 2 input values.");
16808 if (Vals.size() == 1) {
16809 // Do not create shuffle if the mask is a simple identity
16810 // non-resizing mask.
16811 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16812 ->getNumElements() ||
16813 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16814 return CreateShuffle(Vals.front(), nullptr, Mask);
16815 return Vals.front();
16816 }
16817 return CreateShuffle(Vals.front() ? Vals.front()
16818 : FirstInsert->getOperand(0),
16819 Vals.back(), Mask);
16820 });
16821 auto It = ShuffledInserts[I].InsertElements.rbegin();
16822 // Rebuild buildvector chain.
16823 InsertElementInst *II = nullptr;
16824 if (It != ShuffledInserts[I].InsertElements.rend())
16825 II = *It;
16826 SmallVector<Instruction *> Inserts;
16827 while (It != ShuffledInserts[I].InsertElements.rend()) {
16828 assert(II && "Must be an insertelement instruction.");
16829 if (*It == II)
16830 ++It;
16831 else
16832 Inserts.push_back(cast<Instruction>(II));
16833 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16834 }
16835 for (Instruction *II : reverse(Inserts)) {
16836 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16837 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16838 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16839 II->moveAfter(NewI);
16840 NewInst = II;
16841 }
16842 LastInsert->replaceAllUsesWith(NewInst);
16843 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16844 IE->replaceUsesOfWith(IE->getOperand(0),
16845 PoisonValue::get(IE->getOperand(0)->getType()));
16846 IE->replaceUsesOfWith(IE->getOperand(1),
16847 PoisonValue::get(IE->getOperand(1)->getType()));
16848 eraseInstruction(IE);
16849 }
16850 CSEBlocks.insert(LastInsert->getParent());
16851 }
16852
16853 SmallVector<Instruction *> RemovedInsts;
16854 // For each vectorized value:
16855 for (auto &TEPtr : VectorizableTree) {
16856 TreeEntry *Entry = TEPtr.get();
16857
16858 // No need to handle users of gathered values.
16859 if (Entry->isGather())
16860 continue;
16861
16862 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16863
16864 // For each lane:
16865 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16866 Value *Scalar = Entry->Scalars[Lane];
16867
16868 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16869 !isa<GetElementPtrInst>(Scalar))
16870 continue;
16871 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16872 EE && IgnoredExtracts.contains(EE))
16873 continue;
16874 if (isa<PoisonValue>(Scalar))
16875 continue;
16876#ifndef NDEBUG
16877 Type *Ty = Scalar->getType();
16878 if (!Ty->isVoidTy()) {
16879 for (User *U : Scalar->users()) {
16880 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16881
16882 // It is legal to delete users in the ignorelist.
16883 assert((getTreeEntry(U) ||
16884 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16885 (isa_and_nonnull<Instruction>(U) &&
16886 isDeleted(cast<Instruction>(U)))) &&
16887 "Deleting out-of-tree value");
16888 }
16889 }
16890#endif
16891 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16892 auto *I = cast<Instruction>(Scalar);
16893 RemovedInsts.push_back(I);
16894 }
16895 }
16896
16897 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16898 // new vector instruction.
16899 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16900 V->mergeDIAssignID(RemovedInsts);
16901
16902 // Clear up reduction references, if any.
16903 if (UserIgnoreList) {
16904 for (Instruction *I : RemovedInsts) {
16905 const TreeEntry *IE = getTreeEntry(I);
16906 if (IE->Idx != 0 &&
16907 !(VectorizableTree.front()->isGather() &&
16908 !IE->UserTreeIndices.empty() &&
16909 (ValueToGatherNodes.lookup(I).contains(
16910 VectorizableTree.front().get()) ||
16911 any_of(IE->UserTreeIndices,
16912 [&](const EdgeInfo &EI) {
16913 return EI.UserTE == VectorizableTree.front().get() &&
16914 EI.EdgeIdx == UINT_MAX;
16915 }))) &&
16916 !(GatheredLoadsEntriesFirst.has_value() &&
16917 IE->Idx >= *GatheredLoadsEntriesFirst &&
16918 VectorizableTree.front()->isGather() &&
16919 is_contained(VectorizableTree.front()->Scalars, I)))
16920 continue;
16921 SmallVector<SelectInst *> LogicalOpSelects;
16922 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16923 // Do not replace the condition of a logical op expressed in select <cond> form.
16924 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16925 (match(U.getUser(), m_LogicalAnd()) ||
16926 match(U.getUser(), m_LogicalOr())) &&
16927 U.getOperandNo() == 0;
16928 if (IsPoisoningLogicalOp) {
16929 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16930 return false;
16931 }
16932 return UserIgnoreList->contains(U.getUser());
16933 });
16934 // Replace conditions of the poisoning logical ops with the non-poison
16935 // constant value.
16936 for (SelectInst *SI : LogicalOpSelects)
16937 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16938 }
16939 }
16940 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16941 // cache correctness.
16942 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
16943 // - instructions are not deleted until later.
16944 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16945
16946 Builder.ClearInsertionPoint();
16947 InstrElementSize.clear();
16948
16949 const TreeEntry &RootTE = *VectorizableTree.front();
16950 Value *Vec = RootTE.VectorizedValue;
16951 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16952 It != MinBWs.end() &&
16953 ReductionBitWidth != It->second.first) {
16954 IRBuilder<>::InsertPointGuard Guard(Builder);
16955 Builder.SetInsertPoint(ReductionRoot->getParent(),
16956 ReductionRoot->getIterator());
16957 Vec = Builder.CreateIntCast(
16958 Vec,
16959 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16960 cast<VectorType>(Vec->getType())->getElementCount()),
16961 It->second.second);
16962 }
16963 return Vec;
16964}
16965
16966void BoUpSLP::optimizeGatherSequence() {
16967 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16968 << " gather sequence instructions.\n");
16969 // LICM InsertElementInst sequences.
16970 for (Instruction *I : GatherShuffleExtractSeq) {
16971 if (isDeleted(I))
16972 continue;
16973
16974 // Check if this block is inside a loop.
16975 Loop *L = LI->getLoopFor(I->getParent());
16976 if (!L)
16977 continue;
16978
16979 // Check if it has a preheader.
16980 BasicBlock *PreHeader = L->getLoopPreheader();
16981 if (!PreHeader)
16982 continue;
16983
16984 // If the vector or the element that we insert into it are
16985 // instructions that are defined in this basic block then we can't
16986 // hoist this instruction.
16987 if (any_of(I->operands(), [L](Value *V) {
16988 auto *OpI = dyn_cast<Instruction>(V);
16989 return OpI && L->contains(OpI);
16990 }))
16991 continue;
16992
16993 // We can hoist this instruction. Move it to the pre-header.
16994 I->moveBefore(PreHeader->getTerminator()->getIterator());
16995 CSEBlocks.insert(PreHeader);
16996 }
16997
16998 // Make a list of all reachable blocks in our CSE queue.
16999 SmallVector<const DomTreeNode *, 8> CSEWorkList;
17000 CSEWorkList.reserve(CSEBlocks.size());
17001 for (BasicBlock *BB : CSEBlocks)
17002 if (DomTreeNode *N = DT->getNode(BB)) {
17003 assert(DT->isReachableFromEntry(N));
17004 CSEWorkList.push_back(N);
17005 }
17006
17007 // Sort blocks by domination. This ensures we visit a block after all blocks
17008 // dominating it are visited.
17009 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
17010 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
17011 "Different nodes should have different DFS numbers");
17012 return A->getDFSNumIn() < B->getDFSNumIn();
17013 });
17014
17015 // Less defined shuffles can be replaced by the more defined copies.
17016 // Between two shuffles one is less defined if it has the same vector operands
17017 // and its mask indices are the same as in the first one or undefs. E.g.
17018 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
17019 // poison, <0, 0, 0, 0>.
17020 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
17021 Instruction *I2,
17022 SmallVectorImpl<int> &NewMask) {
17023 if (I1->getType() != I2->getType())
17024 return false;
17025 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17026 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17027 if (!SI1 || !SI2)
17028 return I1->isIdenticalTo(I2);
17029 if (SI1->isIdenticalTo(SI2))
17030 return true;
17031 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
17032 if (SI1->getOperand(I) != SI2->getOperand(I))
17033 return false;
17034 // Check if the second instruction is more defined than the first one.
17035 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17036 ArrayRef<int> SM1 = SI1->getShuffleMask();
17037 // Count trailing undefs in the mask to check the final number of used
17038 // registers.
17039 unsigned LastUndefsCnt = 0;
17040 for (int I = 0, E = NewMask.size(); I < E; ++I) {
17041 if (SM1[I] == PoisonMaskElem)
17042 ++LastUndefsCnt;
17043 else
17044 LastUndefsCnt = 0;
17045 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
17046 NewMask[I] != SM1[I])
17047 return false;
17048 if (NewMask[I] == PoisonMaskElem)
17049 NewMask[I] = SM1[I];
17050 }
17051 // Check if the last undefs actually change the final number of used vector
17052 // registers.
17053 return SM1.size() - LastUndefsCnt > 1 &&
17054 TTI->getNumberOfParts(SI1->getType()) ==
17055 TTI->getNumberOfParts(
17056 getWidenedType(SI1->getType()->getElementType(),
17057 SM1.size() - LastUndefsCnt));
17058 };
17059 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
17060 // instructions. TODO: We can further optimize this scan if we split the
17061 // instructions into different buckets based on the insert lane.
17062 SmallVector<Instruction *, 16> Visited;
17063 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
17064 assert(*I &&
17065 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17066 "Worklist not sorted properly!");
17067 BasicBlock *BB = (*I)->getBlock();
17068 // For all instructions in blocks containing gather sequences:
17069 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17070 if (isDeleted(&In))
17071 continue;
17072 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17073 !GatherShuffleExtractSeq.contains(&In))
17074 continue;
17075
17076 // Check if we can replace this instruction with any of the
17077 // visited instructions.
17078 bool Replaced = false;
17079 for (Instruction *&V : Visited) {
17080 SmallVector<int> NewMask;
17081 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17082 DT->dominates(V->getParent(), In.getParent())) {
17083 In.replaceAllUsesWith(V);
17084 eraseInstruction(&In);
17085 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17086 if (!NewMask.empty())
17087 SI->setShuffleMask(NewMask);
17088 Replaced = true;
17089 break;
17090 }
17091 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17092 GatherShuffleExtractSeq.contains(V) &&
17093 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17094 DT->dominates(In.getParent(), V->getParent())) {
17095 In.moveAfter(V);
17096 V->replaceAllUsesWith(&In);
17097 eraseInstruction(V);
17098 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17099 if (!NewMask.empty())
17100 SI->setShuffleMask(NewMask);
17101 V = &In;
17102 Replaced = true;
17103 break;
17104 }
17105 }
17106 if (!Replaced) {
17107 assert(!is_contained(Visited, &In));
17108 Visited.push_back(&In);
17109 }
17110 }
17111 }
17112 CSEBlocks.clear();
17113 GatherShuffleExtractSeq.clear();
17114}
17115
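// Links the ScheduleData of all schedulable values in VL into a single bundle;
// the first member becomes the scheduling entity representing the whole bundle.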
17116BoUpSLP::ScheduleData *
17117BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17118 ScheduleData *Bundle = nullptr;
17119 ScheduleData *PrevInBundle = nullptr;
17120 for (Value *V : VL) {
17121 if (doesNotNeedToBeScheduled(V))
17122 continue;
17123 ScheduleData *BundleMember = getScheduleData(V);
17124 assert(BundleMember &&
17125 "no ScheduleData for bundle member "
17126 "(maybe not in same basic block)");
17127 assert(BundleMember->isSchedulingEntity() &&
17128 "bundle member already part of other bundle");
17129 if (PrevInBundle) {
17130 PrevInBundle->NextInBundle = BundleMember;
17131 } else {
17132 Bundle = BundleMember;
17133 }
17134
17135 // Group the instructions to a bundle.
17136 BundleMember->FirstInBundle = Bundle;
17137 PrevInBundle = BundleMember;
17138 }
17139 assert(Bundle && "Failed to find schedule bundle");
17140 return Bundle;
17141}
17142
17143// Groups the instructions to a bundle (which is then a single scheduling entity)
17144// and schedules instructions until the bundle gets ready.
17145std::optional<BoUpSLP::ScheduleData *>
17146BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17147 const InstructionsState &S) {
17148 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17149 // instructions.
17150 if (isa<PHINode>(S.getMainOp()) ||
17151 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
17152 return nullptr;
17153
17154 // Initialize the instruction bundle.
17155 Instruction *OldScheduleEnd = ScheduleEnd;
17156 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17157
17158 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17159 ScheduleData *Bundle) {
17160 // The scheduling region got new instructions at the lower end (or it is a
17161 // new region for the first bundle). This makes it necessary to
17162 // recalculate all dependencies.
17163 // It is seldom that this needs to be done a second time after adding the
17164 // initial bundle to the region.
17165 if (ScheduleEnd != OldScheduleEnd) {
17166 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17167 if (ScheduleData *SD = getScheduleData(I))
17168 SD->clearDependencies();
17169 ReSchedule = true;
17170 }
17171 if (Bundle) {
17172 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17173 << " in block " << BB->getName() << "\n");
17174 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17175 }
17176
17177 if (ReSchedule) {
17178 resetSchedule();
17179 initialFillReadyList(ReadyInsts);
17180 }
17181
17182 // Now try to schedule the new bundle or (if no bundle) just calculate
17183 // dependencies. As soon as the bundle is "ready" it means that there are no
17184 // cyclic dependencies and we can schedule it. Note that it's important that we
17185 // don't "schedule" the bundle yet (see cancelScheduling).
17186 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17187 !ReadyInsts.empty()) {
17188 ScheduleData *Picked = ReadyInsts.pop_back_val();
17189 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17190 "must be ready to schedule");
17191 schedule(Picked, ReadyInsts);
17192 }
17193 };
17194
17195 // Make sure that the scheduling region contains all
17196 // instructions of the bundle.
17197 for (Value *V : VL) {
17198 if (doesNotNeedToBeScheduled(V))
17199 continue;
17200 if (!extendSchedulingRegion(V, S)) {
17201 // If the scheduling region got new instructions at the lower end (or it
17202 // is a new region for the first bundle), it is necessary to recalculate
17203 // all dependencies before giving up.
17204 // Otherwise the compiler may crash trying to incorrectly calculate
17205 // dependencies and emit instructions in the wrong order at the actual
17206 // scheduling.
17207 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17208 return std::nullopt;
17209 }
17210 }
17211
17212 bool ReSchedule = false;
17213 for (Value *V : VL) {
17214 if (doesNotNeedToBeScheduled(V))
17215 continue;
17216 ScheduleData *BundleMember = getScheduleData(V);
17217 assert(BundleMember &&
17218 "no ScheduleData for bundle member (maybe not in same basic block)");
17219
17220 // Make sure we don't leave the pieces of the bundle in the ready list when
17221 // whole bundle might not be ready.
17222 ReadyInsts.remove(BundleMember);
17223
17224 if (!BundleMember->IsScheduled)
17225 continue;
17226 // A bundle member was scheduled as a single instruction before and now
17227 // needs to be scheduled as part of the bundle. We just get rid of the
17228 // existing schedule.
17229 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17230 << " was already scheduled\n");
17231 ReSchedule = true;
17232 }
17233
17234 auto *Bundle = buildBundle(VL);
17235 TryScheduleBundleImpl(ReSchedule, Bundle);
17236 if (!Bundle->isReady()) {
17237 cancelScheduling(VL, S.getMainOp());
17238 return std::nullopt;
17239 }
17240 return Bundle;
17241}
17242
17243void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17244 Value *OpValue) {
17245 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17246 doesNotNeedToBeScheduled(OpValue))
17247 return;
17248
17249 if (doesNotNeedToBeScheduled(OpValue))
17250 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17251 ScheduleData *Bundle = getScheduleData(OpValue);
17252 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17253 assert(!Bundle->IsScheduled &&
17254 "Can't cancel bundle which is already scheduled");
17255 assert(Bundle->isSchedulingEntity() &&
17256 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17257 "tried to unbundle something which is not a bundle");
17258
17259 // Remove the bundle from the ready list.
17260 if (Bundle->isReady())
17261 ReadyInsts.remove(Bundle);
17262
17263 // Un-bundle: make single instructions out of the bundle.
17264 ScheduleData *BundleMember = Bundle;
17265 while (BundleMember) {
17266 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17267 BundleMember->FirstInBundle = BundleMember;
17268 ScheduleData *Next = BundleMember->NextInBundle;
17269 BundleMember->NextInBundle = nullptr;
17270 BundleMember->TE = nullptr;
17271 if (BundleMember->unscheduledDepsInBundle() == 0) {
17272 ReadyInsts.insert(BundleMember);
17273 }
17274 BundleMember = Next;
17275 }
17276}
17277
17278BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17279 // Allocate a new ScheduleData for the instruction.
17280 if (ChunkPos >= ChunkSize) {
17281 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17282 ChunkPos = 0;
17283 }
17284 return &(ScheduleDataChunks.back()[ChunkPos++]);
17285}
17286
17287bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17288 Value *V, const InstructionsState &S) {
17289 Instruction *I = dyn_cast<Instruction>(V);
17290 assert(I && "bundle member must be an instruction");
17291 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17293 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17294 "be scheduled");
17295 if (getScheduleData(I))
17296 return true;
17297 if (!ScheduleStart) {
17298 // It's the first instruction in the new region.
17299 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17300 ScheduleStart = I;
17301 ScheduleEnd = I->getNextNode();
17302 assert(ScheduleEnd && "tried to vectorize a terminator?");
17303 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17304 return true;
17305 }
17306 // Search up and down at the same time, because we don't know if the new
17307 // instruction is above or below the existing scheduling region.
17308 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
17309 // counted against the budget. Otherwise debug info could affect codegen.
17310 BasicBlock::reverse_iterator UpIter =
17311 ++ScheduleStart->getIterator().getReverse();
17312 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17313 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17314 BasicBlock::iterator LowerEnd = BB->end();
17315 auto IsAssumeLikeIntr = [](const Instruction &I) {
17316 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17317 return II->isAssumeLikeIntrinsic();
17318 return false;
17319 };
17320 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17321 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17322 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17323 &*DownIter != I) {
17324 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17325 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17326 return false;
17327 }
17328
17329 ++UpIter;
17330 ++DownIter;
17331
17332 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17333 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17334 }
17335 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17336 assert(I->getParent() == ScheduleStart->getParent() &&
17337 "Instruction is in wrong basic block.");
17338 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17339 ScheduleStart = I;
17340 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17341 << "\n");
17342 return true;
17343 }
17344 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17345 "Expected to reach top of the basic block or instruction down the "
17346 "lower end.");
17347 assert(I->getParent() == ScheduleEnd->getParent() &&
17348 "Instruction is in wrong basic block.");
17349 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17350 nullptr);
17351 ScheduleEnd = I->getNextNode();
17352 assert(ScheduleEnd && "tried to vectorize a terminator?");
17353 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17354 return true;
17355}
17356
17357void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17358 Instruction *ToI,
17359 ScheduleData *PrevLoadStore,
17360 ScheduleData *NextLoadStore) {
17361 ScheduleData *CurrentLoadStore = PrevLoadStore;
17362 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17363 // No need to allocate data for non-schedulable instructions.
17364 if (doesNotNeedToBeScheduled(I))
17365 continue;
17366 ScheduleData *SD = ScheduleDataMap.lookup(I);
17367 if (!SD) {
17368 SD = allocateScheduleDataChunks();
17369 ScheduleDataMap[I] = SD;
17370 }
17371 assert(!isInSchedulingRegion(SD) &&
17372 "new ScheduleData already in scheduling region");
17373 SD->init(SchedulingRegionID, I);
17374
17375 if (I->mayReadOrWriteMemory() &&
17376 (!isa<IntrinsicInst>(I) ||
17377 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17378 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17379 Intrinsic::pseudoprobe))) {
17380 // Update the linked list of memory accessing instructions.
17381 if (CurrentLoadStore) {
17382 CurrentLoadStore->NextLoadStore = SD;
17383 } else {
17384 FirstLoadStoreInRegion = SD;
17385 }
17386 CurrentLoadStore = SD;
17387 }
17388
17389 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17390 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17391 RegionHasStackSave = true;
17392 }
17393 if (NextLoadStore) {
17394 if (CurrentLoadStore)
17395 CurrentLoadStore->NextLoadStore = NextLoadStore;
17396 } else {
17397 LastLoadStoreInRegion = CurrentLoadStore;
17398 }
17399}
17400
17401void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17402 bool InsertInReadyList,
17403 BoUpSLP *SLP) {
17404 assert(SD->isSchedulingEntity());
17405
17406 SmallVector<ScheduleData *> WorkList;
17407 WorkList.push_back(SD);
17408
17409 while (!WorkList.empty()) {
17410 ScheduleData *SD = WorkList.pop_back_val();
17411 for (ScheduleData *BundleMember = SD; BundleMember;
17412 BundleMember = BundleMember->NextInBundle) {
17413 assert(isInSchedulingRegion(BundleMember));
17414 if (BundleMember->hasValidDependencies())
17415 continue;
17416
17417 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17418 << "\n");
17419 BundleMember->Dependencies = 0;
17420 BundleMember->resetUnscheduledDeps();
17421
17422 // Handle def-use chain dependencies.
17423 for (User *U : BundleMember->Inst->users()) {
17424 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17425 BundleMember->Dependencies++;
17426 ScheduleData *DestBundle = UseSD->FirstInBundle;
17427 if (!DestBundle->IsScheduled)
17428 BundleMember->incrementUnscheduledDeps(1);
17429 if (!DestBundle->hasValidDependencies())
17430 WorkList.push_back(DestBundle);
17431 }
17432 }
17433
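 // Makes BundleMember's scheduling depend on I: BundleMember gains one more
 // dependency that is only resolved once I's bundle has been scheduled, which
 // prevents the two from being reordered relative to each other.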
17434 auto MakeControlDependent = [&](Instruction *I) {
17435 auto *DepDest = getScheduleData(I);
17436 assert(DepDest && "must be in schedule window");
17437 DepDest->ControlDependencies.push_back(BundleMember);
17438 BundleMember->Dependencies++;
17439 ScheduleData *DestBundle = DepDest->FirstInBundle;
17440 if (!DestBundle->IsScheduled)
17441 BundleMember->incrementUnscheduledDeps(1);
17442 if (!DestBundle->hasValidDependencies())
17443 WorkList.push_back(DestBundle);
17444 };
17445
17446 // Any instruction which isn't safe to speculate at the beginning of the
17447 // block is control dependent on any early exit or non-willreturn call
17448 // which precedes it.
17449 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17450 for (Instruction *I = BundleMember->Inst->getNextNode();
17451 I != ScheduleEnd; I = I->getNextNode()) {
17452 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17453 continue;
17454
17455 // Add the dependency
17456 MakeControlDependent(I);
17457
17458 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17459 // Everything past here must be control dependent on I.
17460 break;
17461 }
17462 }
17463
17464 if (RegionHasStackSave) {
17465 // If we have an inalloca alloca instruction, it needs to be scheduled
17466 // after any preceding stacksave. We also need to prevent any alloca
17467 // from reordering above a preceding stackrestore.
17468 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17469 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17470 for (Instruction *I = BundleMember->Inst->getNextNode();
17471 I != ScheduleEnd; I = I->getNextNode()) {
17472 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17473 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17474 // Any allocas past here must be control dependent on I, and I
17475 // must be memory dependent on BundleMember->Inst.
17476 break;
17477
17478 if (!isa<AllocaInst>(I))
17479 continue;
17480
17481 // Add the dependency
17482 MakeControlDependent(I);
17483 }
17484 }
17485
17486 // In addition to the cases handled just above, we need to prevent
17487 // allocas and loads/stores from moving below a stacksave or a
17488 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17489 // thought to be conservative. Moving loads/stores below a stackrestore
17490 // can lead to incorrect code.
17491 if (isa<AllocaInst>(BundleMember->Inst) ||
17492 BundleMember->Inst->mayReadOrWriteMemory()) {
17493 for (Instruction *I = BundleMember->Inst->getNextNode();
17494 I != ScheduleEnd; I = I->getNextNode()) {
17495 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17496 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17497 continue;
17498
17499 // Add the dependency
17500 MakeControlDependent(I);
17501 break;
17502 }
17503 }
17504 }
17505
17506 // Handle the memory dependencies (if any).
17507 ScheduleData *DepDest = BundleMember->NextLoadStore;
17508 if (!DepDest)
17509 continue;
17510 Instruction *SrcInst = BundleMember->Inst;
17511 assert(SrcInst->mayReadOrWriteMemory() &&
17512 "NextLoadStore list for a non-memory-affecting bundle?");
17513 MemoryLocation SrcLoc = getLocation(SrcInst);
17514 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17515 unsigned NumAliased = 0;
17516 unsigned DistToSrc = 1;
17517
17518 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17519 assert(isInSchedulingRegion(DepDest));
17520
17521 // We have two limits to reduce the complexity:
17522 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17523 // SLP->isAliased (which is the expensive part in this loop).
17524 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17525 // the whole loop (even if the loop is fast, it's quadratic).
17526 // It's important for the loop break condition (see below) to
17527 // check this limit even between two read-only instructions.
17528 if (DistToSrc >= MaxMemDepDistance ||
17529 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17530 (NumAliased >= AliasedCheckLimit ||
17531 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17532
17533 // We increment the counter only if the locations are aliased
17534 // (instead of counting all alias checks). This gives a better
17535 // balance between reduced runtime and accurate dependencies.
17536 NumAliased++;
17537
17538 DepDest->MemoryDependencies.push_back(BundleMember);
17539 BundleMember->Dependencies++;
17540 ScheduleData *DestBundle = DepDest->FirstInBundle;
17541 if (!DestBundle->IsScheduled) {
17542 BundleMember->incrementUnscheduledDeps(1);
17543 }
17544 if (!DestBundle->hasValidDependencies()) {
17545 WorkList.push_back(DestBundle);
17546 }
17547 }
17548
17549 // Example, explaining the loop break condition: Let's assume our
17550 // starting instruction is i0 and MaxMemDepDistance = 3.
17551 //
17552 // +--------v--v--v
17553 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17554 // +--------^--^--^
17555 //
17556 // MaxMemDepDistance lets us stop alias-checking at i3 and we add
17557 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17558 // Previously we already added dependencies from i3 to i6,i7,i8
17559 // (because of MaxMemDepDistance). As we added a dependency from
17560 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17561 // and we can abort this loop at i6.
17562 if (DistToSrc >= 2 * MaxMemDepDistance)
17563 break;
17564 DistToSrc++;
17565 }
17566 }
17567 if (InsertInReadyList && SD->isReady()) {
17568 ReadyInsts.insert(SD);
17569 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17570 << "\n");
17571 }
17572 }
17573}
17574
17575void BoUpSLP::BlockScheduling::resetSchedule() {
17576 assert(ScheduleStart &&
17577 "tried to reset schedule on block which has not been scheduled");
17578 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17579 if (ScheduleData *SD = getScheduleData(I)) {
17580 assert(isInSchedulingRegion(SD) &&
17581 "ScheduleData not in scheduling region");
17582 SD->IsScheduled = false;
17583 SD->resetUnscheduledDeps();
17584 }
17585 }
17586 ReadyInsts.clear();
17587}
17588
17589void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17590 if (!BS->ScheduleStart)
17591 return;
17592
17593 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17594
17595 // A key point - if we got here, pre-scheduling was able to find a valid
17596 // scheduling of the sub-graph of the scheduling window which consists
17597 // of all vector bundles and their transitive users. As such, we do not
17598 // need to reschedule anything *outside of* that subgraph.
17599
17600 BS->resetSchedule();
17601
17602 // For the real scheduling we use a more sophisticated ready-list: it is
17603 // sorted by the original instruction location. This lets the final schedule
17604 // be as close as possible to the original instruction order.
17605 // WARNING: If changing this order causes a correctness issue, that means
17606 // there is some missing dependence edge in the schedule data graph.
17607 struct ScheduleDataCompare {
17608 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17609 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17610 }
17611 };
17612 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17613
17614 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17615 // and fill the ready-list with initial instructions.
17616 int Idx = 0;
17617 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17618 I = I->getNextNode()) {
17619 if (ScheduleData *SD = BS->getScheduleData(I)) {
17620 [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
17621 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17622 SD->isPartOfBundle() ==
17623 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17624 "scheduler and vectorizer bundle mismatch");
17625 SD->FirstInBundle->SchedulingPriority = Idx++;
17626
17627 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17628 BS->calculateDependencies(SD, false, this);
17629 }
17630 }
17631 BS->initialFillReadyList(ReadyInsts);
17632
17633 Instruction *LastScheduledInst = BS->ScheduleEnd;
17634
17635 // Do the "real" scheduling.
17636 while (!ReadyInsts.empty()) {
17637 ScheduleData *Picked = *ReadyInsts.begin();
17638 ReadyInsts.erase(ReadyInsts.begin());
17639
17640 // Move the scheduled instruction(s) to their dedicated places, if not
17641 // there yet.
17642 for (ScheduleData *BundleMember = Picked; BundleMember;
17643 BundleMember = BundleMember->NextInBundle) {
17644 Instruction *PickedInst = BundleMember->Inst;
17645 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17646 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17647 LastScheduledInst = PickedInst;
17648 }
17649
17650 BS->schedule(Picked, ReadyInsts);
17651 }
17652
17653 // Check that we didn't break any of our invariants.
17654#ifdef EXPENSIVE_CHECKS
17655 BS->verify();
17656#endif
17657
17658#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17659 // Check that all schedulable entities got scheduled
17660 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17661 ScheduleData *SD = BS->getScheduleData(I);
17662 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17663 assert(SD->IsScheduled && "must be scheduled at this point");
17664 }
17665#endif
17666
17667 // Avoid duplicate scheduling of the block.
17668 BS->ScheduleStart = nullptr;
17669}
17670
17671unsigned BoUpSLP::getVectorElementSize(Value *V) {
17672 // If V is a store, just return the width of the stored value (or value
17673 // truncated just before storing) without traversing the expression tree.
17674 // This is the common case.
17675 if (auto *Store = dyn_cast<StoreInst>(V))
17676 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17677
17678 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17679 return getVectorElementSize(IEI->getOperand(1));
17680
17681 auto E = InstrElementSize.find(V);
17682 if (E != InstrElementSize.end())
17683 return E->second;
17684
17685 // If V is not a store, we can traverse the expression tree to find loads
17686 // that feed it. The type of the loaded value may indicate a more suitable
17687 // width than V's type. We want to base the vector element size on the width
17688 // of memory operations where possible.
17689 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17690 SmallPtrSet<Instruction *, 16> Visited;
17691 if (auto *I = dyn_cast<Instruction>(V)) {
17692 Worklist.emplace_back(I, I->getParent(), 0);
17693 Visited.insert(I);
17694 }
17695
17696 // Traverse the expression tree in bottom-up order looking for loads. If we
17697 // encounter an instruction we don't yet handle, we give up.
17698 auto Width = 0u;
17699 Value *FirstNonBool = nullptr;
17700 while (!Worklist.empty()) {
17701 auto [I, Parent, Level] = Worklist.pop_back_val();
17702
17703 // We should only be looking at scalar instructions here. If the current
17704 // instruction has a vector type, skip.
17705 auto *Ty = I->getType();
17706 if (isa<VectorType>(Ty))
17707 continue;
17708 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17709 FirstNonBool = I;
17710 if (Level > RecursionMaxDepth)
17711 continue;
17712
17713 // If the current instruction is a load, update Width to reflect the
17714 // width of the loaded value.
17715 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17716 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17717
17718 // Otherwise, we need to visit the operands of the instruction. We only
17719 // handle the interesting cases from buildTree here. If an operand is an
17720 // instruction we haven't yet visited and from the same basic block as the
17721 // user or the use is a PHI node, we add it to the worklist.
17722 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
17723 BinaryOperator, UnaryOperator>(I)) {
17724 for (Use &U : I->operands()) {
17725 if (auto *J = dyn_cast<Instruction>(U.get()))
17726 if (Visited.insert(J).second &&
17727 (isa<PHINode>(I) || J->getParent() == Parent)) {
17728 Worklist.emplace_back(J, J->getParent(), Level + 1);
17729 continue;
17730 }
17731 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17732 FirstNonBool = U.get();
17733 }
17734 } else {
17735 break;
17736 }
17737 }
17738
17739 // If we didn't encounter a memory access in the expression tree, or if we
17740 // gave up for some reason, just return the width of V. Otherwise, return the
17741 // maximum width we found.
17742 if (!Width) {
17743 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17744 V = FirstNonBool;
17745 Width = DL->getTypeSizeInBits(V->getType());
17746 }
17747
17748 for (Instruction *I : Visited)
17749 InstrElementSize[I] = Width;
17750
17751 return Width;
17752}
17753
17754bool BoUpSLP::collectValuesToDemote(
17755 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17756 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
17757 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17758 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17759 // We can always demote constants.
17760 if (all_of(E.Scalars, IsaPred<Constant>))
17761 return true;
17762
17763 unsigned OrigBitWidth =
17764 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17765 if (OrigBitWidth == BitWidth) {
17766 MaxDepthLevel = 1;
17767 return true;
17768 }
17769
17770 // Check if the node was analyzed already and must keep its original bitwidth.
17771 if (NodesToKeepBWs.contains(E.Idx))
17772 return false;
17773
17774 // If the value is not a vectorized instruction in the expression and not used
17775 // by the insertelement instruction and not used in multiple vector nodes, it
17776 // cannot be demoted.
17777 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17778 if (isa<PoisonValue>(R))
17779 return false;
17780 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17781 });
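 // Checks whether V can be represented in BitWidth bits, widening BitWidth if
 // V needs more (based on known-zero, sign and demanded bits); succeeds only
 // if the resulting width is still at most half of the original bit width.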
17782 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17783 if (isa<PoisonValue>(V))
17784 return true;
17785 if (MultiNodeScalars.contains(V))
17786 return false;
17787 // For a shuffle of sext/zext with many uses, we need to check the extra bit
17788 // for unsigned values; otherwise we may have incorrect casting for reused
17789 // scalars.
17790 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17791 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17792 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17793 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17794 return true;
17795 }
17796 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17797 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17798 if (IsSignedNode)
17799 ++BitWidth1;
17800 if (auto *I = dyn_cast<Instruction>(V)) {
17801 APInt Mask = DB->getDemandedBits(I);
17802 unsigned BitWidth2 =
17803 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17804 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17805 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17806 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17807 break;
17808 BitWidth2 *= 2;
17809 }
17810 BitWidth1 = std::min(BitWidth1, BitWidth2);
17811 }
17812 BitWidth = std::max(BitWidth, BitWidth1);
17813 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17814 };
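 // Fallback analysis: succeeds only if demotion is already known to be
 // profitable and every scalar of the entry can potentially be truncated.
 // Qualifying gather entries (few extractelement bases or an unchanged
 // number of vector registers) are also recorded in ToDemote.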
17815 auto FinalAnalysis = [&, TTI = TTI]() {
17816 if (!IsProfitableToDemote)
17817 return false;
17818 bool Res = all_of(
17819 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17820 // Demote gathers.
17821 if (Res && E.isGather()) {
17822 // Check possible extractelement instructions bases and final vector
17823 // length.
17824 SmallPtrSet<Value *, 4> UniqueBases;
17825 for (Value *V : E.Scalars) {
17826 auto *EE = dyn_cast<ExtractElementInst>(V);
17827 if (!EE)
17828 continue;
17829 UniqueBases.insert(EE->getVectorOperand());
17830 }
17831 const unsigned VF = E.Scalars.size();
17832 Type *OrigScalarTy = E.Scalars.front()->getType();
17833 if (UniqueBases.size() <= 2 ||
17834 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17835 TTI->getNumberOfParts(getWidenedType(
17836 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17837 ToDemote.push_back(E.Idx);
17838 }
17839 return Res;
17840 };
17841 if (E.isGather() || !Visited.insert(&E).second ||
17842 any_of(E.Scalars, [&](Value *V) {
17843 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17844 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17845 });
17846 }))
17847 return FinalAnalysis();
17848
17849 if (any_of(E.Scalars, [&](Value *V) {
17850 return !all_of(V->users(), [=](User *U) {
17851 return getTreeEntry(U) ||
17852 (E.Idx == 0 && UserIgnoreList &&
17853 UserIgnoreList->contains(U)) ||
17854 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17855 !U->getType()->isScalableTy() &&
17856 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17857 }) && !IsPotentiallyTruncated(V, BitWidth);
17858 }))
17859 return false;
17860
17861 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17862 bool &NeedToExit) {
17863 NeedToExit = false;
17864 unsigned InitLevel = MaxDepthLevel;
17865 for (const TreeEntry *Op : Operands) {
17866 unsigned Level = InitLevel;
17867 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17868 ToDemote, Visited, NodesToKeepBWs, Level,
17869 IsProfitableToDemote, IsTruncRoot)) {
17870 if (!IsProfitableToDemote)
17871 return false;
17872 NeedToExit = true;
17873 if (!FinalAnalysis())
17874 return false;
17875 continue;
17876 }
17877 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17878 }
17879 return true;
17880 };
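 // Runs the opcode-specific checker for increasing bitwidths (doubling each
 // time) until it succeeds or the original width is reached; falls back to
 // the best width for which the final analysis succeeded.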
17881 auto AttemptCheckBitwidth =
17882 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17883 // Try all bitwidth < OrigBitWidth.
17884 NeedToExit = false;
17885 unsigned BestFailBitwidth = 0;
17886 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17887 if (Checker(BitWidth, OrigBitWidth))
17888 return true;
17889 if (BestFailBitwidth == 0 && FinalAnalysis())
17890 BestFailBitwidth = BitWidth;
17891 }
17892 if (BitWidth >= OrigBitWidth) {
17893 if (BestFailBitwidth == 0) {
17894 BitWidth = OrigBitWidth;
17895 return false;
17896 }
17897 MaxDepthLevel = 1;
17898 BitWidth = BestFailBitwidth;
17899 NeedToExit = true;
17900 return true;
17901 }
17902 return false;
17903 };
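 // Common driver for the opcode cases below: verifies that the scalars of
 // this entry (and, recursively, the given operand entries) can be truncated,
 // optionally running an opcode-specific bitwidth checker, and records the
 // entry in ToDemote on success.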
17904 auto TryProcessInstruction =
17905 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17906 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17907 if (Operands.empty()) {
17908 if (!IsTruncRoot)
17909 MaxDepthLevel = 1;
17910 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17911 std::ref(BitWidth)));
17912 } else {
17913 // Several vectorized uses? Check if we can truncate it, otherwise -
17914 // exit.
17915 if (E.UserTreeIndices.size() > 1 &&
17916 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17917 std::ref(BitWidth))))
17918 return false;
17919 bool NeedToExit = false;
17920 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17921 return false;
17922 if (NeedToExit)
17923 return true;
17924 if (!ProcessOperands(Operands, NeedToExit))
17925 return false;
17926 if (NeedToExit)
17927 return true;
17928 }
17929
17930 ++MaxDepthLevel;
17931 // Record the entry that we can demote.
17932 ToDemote.push_back(E.Idx);
17933 return IsProfitableToDemote;
17934 };
17935 switch (E.getOpcode()) {
17936
17937 // We can always demote truncations and extensions. Since truncations can
17938 // seed additional demotion, we save the truncated value.
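// For example (illustrative): in a chain like
//   store i16 (trunc i32 (add i32 (zext i8 %a to i32), (zext i8 %b to i32)))
// the trunc/zext nodes make it legal to perform the add in a narrower type
// such as i16, so the whole chain can be marked as demotable.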
17939 case Instruction::Trunc:
17940 if (IsProfitableToDemoteRoot)
17941 IsProfitableToDemote = true;
17942 return TryProcessInstruction(BitWidth);
17943 case Instruction::ZExt:
17944 case Instruction::SExt:
17945 IsProfitableToDemote = true;
17946 return TryProcessInstruction(BitWidth);
17947
17948 // We can demote certain binary operations if we can demote both of their
17949 // operands.
17950 case Instruction::Add:
17951 case Instruction::Sub:
17952 case Instruction::Mul:
17953 case Instruction::And:
17954 case Instruction::Or:
17955 case Instruction::Xor: {
17956 return TryProcessInstruction(
17957 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17958 }
17959 case Instruction::Freeze:
17960 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17961 case Instruction::Shl: {
17962 // If we are truncating the result of this SHL, and if it is a shift by an
17963 // in-range amount, we can always perform the SHL in a smaller type.
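// For example (illustrative): with a shift amount known to be less than 16,
//   trunc (shl i32 %x, %amt) to i16 == shl i16 (trunc i32 %x to i16), %amt
// because no bits above bit 15 of %x can affect the low 16 bits of the result.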
17964 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17965 return all_of(E.Scalars, [&](Value *V) {
17966 if (isa<PoisonValue>(V))
17967 return true;
17968 auto *I = cast<Instruction>(V);
17969 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17970 return AmtKnownBits.getMaxValue().ult(BitWidth);
17971 });
17972 };
17973 return TryProcessInstruction(
17974 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17975 }
17976 case Instruction::LShr: {
17977 // If this is a truncate of a logical shr, we can truncate it to a smaller
17978 // lshr iff we know that the bits we would otherwise be shifting in are
17979 // already zeros.
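// For example (illustrative): if the upper 16 bits of %x are known to be zero
// and the shift amount is known to be less than 16, then
//   trunc (lshr i32 %x, %amt) to i16 == lshr i16 (trunc i32 %x to i16), %amt
// since only zero bits would have been shifted into the low half.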
17980 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17981 return all_of(E.Scalars, [&](Value *V) {
17982 if (isa<PoisonValue>(V))
17983 return true;
17984 auto *I = cast<Instruction>(V);
17985 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17986 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17987 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17988 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17989 SimplifyQuery(*DL));
17990 });
17991 };
17992 return TryProcessInstruction(
17993 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17994 LShrChecker);
17995 }
17996 case Instruction::AShr: {
17997 // If this is a truncate of an arithmetic shr, we can truncate it to a
17998 // smaller ashr iff we know that all the bits from the sign bit of the
17999 // original type down to the sign bit of the truncated type are the same.
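// For example (illustrative): if %x has more than 16 sign bits (it fits in
// i16 as a signed value) and the shift amount is known to be less than 16,
//   trunc (ashr i32 %x, %amt) to i16 == ashr i16 (trunc i32 %x to i16), %amt
// because the bits shifted in are copies of a sign bit that survives the
// truncation.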
18000 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18001 return all_of(E.Scalars, [&](Value *V) {
18002 if (isa<PoisonValue>(V))
18003 return true;
18004 auto *I = cast<Instruction>(V);
18005 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
18006 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18007 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18008 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18009 nullptr, DT);
18010 });
18011 };
18012 return TryProcessInstruction(
18013 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18014 AShrChecker);
18015 }
18016 case Instruction::UDiv:
18017 case Instruction::URem: {
18018 // UDiv and URem can be truncated if all the truncated bits are zero.
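// For example (illustrative): if the upper 16 bits of both %a and %b are
// known to be zero, then
//   trunc (udiv i32 %a, %b) to i16 == udiv i16 (trunc %a to i16), (trunc %b to i16)
// because the quotient (or remainder) cannot have bits set above bit 15 either.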
18019 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18020 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18021 return all_of(E.Scalars, [&](Value *V) {
18022 auto *I = cast<Instruction>(V);
18023 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18024 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18025 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18026 });
18027 };
18028 return TryProcessInstruction(
18029 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18030 }
18031
18032 // We can demote selects if we can demote their true and false values.
18033 case Instruction::Select: {
18034 return TryProcessInstruction(
18035 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18036 }
18037
18038 // We can demote phis if we can demote all their incoming operands. Note that
18039 // we don't need to worry about cycles since we ensure single use above.
18040 case Instruction::PHI: {
18041 const unsigned NumOps = E.getNumOperands();
18042 SmallVector<const TreeEntry *> Ops(NumOps, nullptr);
18043 transform(seq<unsigned>(0, NumOps), Ops.begin(),
18044 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
18045
18046 return TryProcessInstruction(BitWidth, Ops);
18047 }
18048
18049 case Instruction::Call: {
18050 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18051 if (!IC)
18052 break;
18053 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
18054 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
18055 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
18056 break;
18057 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
18058 function_ref<bool(unsigned, unsigned)> CallChecker;
18059 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18060 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18061 return all_of(E.Scalars, [&](Value *V) {
18062 auto *I = cast<Instruction>(V);
18063 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18064 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18065 return MaskedValueIsZero(I->getOperand(0), Mask,
18066 SimplifyQuery(*DL)) &&
18067 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18068 }
18069 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18070 "Expected min/max intrinsics only.");
18071 unsigned SignBits = OrigBitWidth - BitWidth;
18072 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18073 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18074 nullptr, DT);
18075 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18076 nullptr, DT);
18077 return SignBits <= Op0SignBits &&
18078 ((SignBits != Op0SignBits &&
18079 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18080 MaskedValueIsZero(I->getOperand(0), Mask,
18081 SimplifyQuery(*DL))) &&
18082 SignBits <= Op1SignBits &&
18083 ((SignBits != Op1SignBits &&
18084 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18085 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18086 });
18087 };
18088 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18089 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18090 return all_of(E.Scalars, [&](Value *V) {
18091 auto *I = cast<Instruction>(V);
18092 unsigned SignBits = OrigBitWidth - BitWidth;
18093 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18094 unsigned Op0SignBits =
18095 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18096 return SignBits <= Op0SignBits &&
18097 ((SignBits != Op0SignBits &&
18098 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18099 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18100 });
18101 };
18102 if (ID != Intrinsic::abs) {
18103 Operands.push_back(getOperandEntry(&E, 1));
18104 CallChecker = CompChecker;
18105 } else {
18106 CallChecker = AbsChecker;
18107 }
18108 InstructionCost BestCost =
18109 std::numeric_limits<InstructionCost::CostType>::max();
18110 unsigned BestBitWidth = BitWidth;
18111 unsigned VF = E.Scalars.size();
18112 // Choose the best bitwidth based on cost estimations.
18113 auto Checker = [&](unsigned BitWidth, unsigned) {
18114 unsigned MinBW = PowerOf2Ceil(BitWidth);
18115 SmallVector<Type *> ArgTys =
18116 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18117 auto VecCallCosts = getVectorCallCosts(
18118 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18119 TTI, TLI, ArgTys);
18120 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18121 if (Cost < BestCost) {
18122 BestCost = Cost;
18123 BestBitWidth = BitWidth;
18124 }
18125 return false;
18126 };
18127 [[maybe_unused]] bool NeedToExit;
18128 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18129 BitWidth = BestBitWidth;
18130 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18131 }
18132
18133 // Otherwise, conservatively give up.
18134 default:
18135 break;
18136 }
18137 MaxDepthLevel = 1;
18138 return FinalAnalysis();
18139}
18140
18141static RecurKind getRdxKind(Value *V);
18142
18143 void BoUpSLP::computeMinimumValueSizes() {
18144 // We only attempt to truncate integer expressions.
18145 bool IsStoreOrInsertElt =
18146 VectorizableTree.front()->hasState() &&
18147 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18148 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18149 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18150 ExtraBitWidthNodes.size() <= 1 &&
18151 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18152 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18153 return;
18154
18155 unsigned NodeIdx = 0;
18156 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18157 NodeIdx = 1;
18158
18159 // Ensure the roots of the vectorizable tree don't form a cycle.
18160 if (VectorizableTree[NodeIdx]->isGather() ||
18161 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18162 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18163 [NodeIdx](const EdgeInfo &EI) {
18164 return EI.UserTE->Idx > NodeIdx;
18165 })))
18166 return;
18167
18168 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
18169 // resize to the final type.
18170 bool IsTruncRoot = false;
18171 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18172 SmallVector<unsigned> RootDemotes;
18173 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18174 if (NodeIdx != 0 &&
18175 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18176 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18177 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18178 IsTruncRoot = true;
18179 RootDemotes.push_back(NodeIdx);
18180 IsProfitableToDemoteRoot = true;
18181 ++NodeIdx;
18182 }
18183
18184 // The reduction was already analyzed and found not profitable - exit.
18185 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18186 return;
18187
18188 SmallVector<unsigned> ToDemote;
18189 auto ComputeMaxBitWidth =
18190 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
18191 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
18192 ToDemote.clear();
18193 // If the root is a trunc and the next node is a gather/buildvector, keep the
18194 // trunc in scalars, which is free in most cases.
18195 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18196 !NodesToKeepBWs.contains(E.Idx) &&
18197 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18198 all_of(E.Scalars, [&](Value *V) {
18199 return V->hasOneUse() || isa<Constant>(V) ||
18200 (!V->hasNUsesOrMore(UsesLimit) &&
18201 none_of(V->users(), [&](User *U) {
18202 const TreeEntry *TE = getTreeEntry(U);
18203 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18204 if (TE == UserTE || !TE)
18205 return false;
18206 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18207 SelectInst>(U) ||
18208 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18209 SelectInst>(UserTE->getMainOp()))
18210 return true;
18211 unsigned UserTESz = DL->getTypeSizeInBits(
18212 UserTE->Scalars.front()->getType());
18213 auto It = MinBWs.find(TE);
18214 if (It != MinBWs.end() && It->second.first > UserTESz)
18215 return true;
18216 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18217 }));
18218 })) {
18219 ToDemote.push_back(E.Idx);
18220 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18221 auto It = MinBWs.find(UserTE);
18222 if (It != MinBWs.end())
18223 return It->second.first;
18224 unsigned MaxBitWidth =
18225 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18226 MaxBitWidth = bit_ceil(MaxBitWidth);
18227 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18228 MaxBitWidth = 8;
18229 return MaxBitWidth;
18230 }
18231
18232 if (!E.hasState())
18233 return 0u;
18234
18235 unsigned VF = E.getVectorFactor();
18236 Type *ScalarTy = E.Scalars.front()->getType();
18237 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18238 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18239 if (!TreeRootIT)
18240 return 0u;
18241
18242 if (any_of(E.Scalars,
18243 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18244 return 0u;
18245
18246 unsigned NumParts = TTI->getNumberOfParts(
18247 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18248
18249 // The maximum bit width required to represent all the values that can be
18250 // demoted without loss of precision. It would be safe to truncate the roots
18251 // of the expression to this width.
18252 unsigned MaxBitWidth = 1u;
18253
18254 // True if the roots can be zero-extended back to their original type,
18255 // rather than sign-extended. We know that if the leading bits are not
18256 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18257 // True.
18258 // Determine if the sign bit of all the roots is known to be zero. If not,
18259 // IsKnownPositive is set to False.
18260 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18261 if (isa<PoisonValue>(R))
18262 return true;
18263 KnownBits Known = computeKnownBits(R, *DL);
18264 return Known.isNonNegative();
18265 });
18266
18267 // We first check if all the bits of the roots are demanded. If they're not,
18268 // we can truncate the roots to this narrower type.
18269 for (Value *Root : E.Scalars) {
18270 if (isa<PoisonValue>(Root))
18271 continue;
18272 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18273 TypeSize NumTypeBits =
18274 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18275 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18276 // If we can't prove that the sign bit is zero, we must add one to the
18277 // maximum bit width to account for the unknown sign bit. This preserves
18278 // the existing sign bit so we can safely sign-extend the root back to the
18279 // original type. Otherwise, if we know the sign bit is zero, we will
18280 // zero-extend the root instead.
18281 //
18282 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18283 // one to the maximum bit width will yield a larger-than-necessary
18284 // type. In general, we need to add an extra bit only if we can't
18285 // prove that the upper bit of the original type is equal to the
18286 // upper bit of the proposed smaller type. If these two bits are
18287 // the same (either zero or one) we know that sign-extending from
18288 // the smaller type will result in the same value. Here, since we
18289 // can't yet prove this, we are just making the proposed smaller
18290 // type larger to ensure correctness.
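// For example (illustrative): an i32 root with 26 known sign bits needs
// BitWidth1 = 32 - 26 = 6 bits; if it is not known to be non-negative, one
// extra bit is added for the sign, giving 7, which is later rounded up to 8.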
18291 if (!IsKnownPositive)
18292 ++BitWidth1;
18293
18294 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18295 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18296 MaxBitWidth =
18297 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18298 }
18299
18300 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18301 MaxBitWidth = 8;
18302
18303 // If the original type is large but the reduced type does not improve the
18304 // register usage - ignore it.
18305 if (NumParts > 1 &&
18306 NumParts ==
18307 TTI->getNumberOfParts(getWidenedType(
18308 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18309 return 0u;
18310
18311 unsigned Opcode = E.getOpcode();
18312 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18313 Opcode == Instruction::SExt ||
18314 Opcode == Instruction::ZExt || NumParts > 1;
18315 // Conservatively determine if we can actually truncate the roots of the
18316 // expression. Collect the values that can be demoted in ToDemote and
18317 // additional roots that require investigating in Roots.
18318 DenseSet<const TreeEntry *> Visited;
18319 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18320 bool NeedToDemote = IsProfitableToDemote;
18321
18322 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18323 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18324 NeedToDemote, IsTruncRoot) ||
18325 (MaxDepthLevel <= Limit &&
18326 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18327 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18328 DL->getTypeSizeInBits(TreeRootIT) /
18329 DL->getTypeSizeInBits(
18330 E.getMainOp()->getOperand(0)->getType()) >
18331 2)))))
18332 return 0u;
18333 // Round MaxBitWidth up to the next power-of-two.
18334 MaxBitWidth = bit_ceil(MaxBitWidth);
18335
18336 return MaxBitWidth;
18337 };
18338
18339 // If we can truncate the root, we must collect additional values that might
18340 // be demoted as a result. That is, those seeded by truncations we will
18341 // modify.
18342 // Add reduction ops sizes, if any.
18343 if (UserIgnoreList &&
18344 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18345 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18346 // x i1> to iN)).
18347 if (all_of(*UserIgnoreList,
18348 [](Value *V) {
18349 return isa<PoisonValue>(V) ||
18350 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18351 }) &&
18352 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18353 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18354 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18355 Builder.getInt1Ty()) {
18356 ReductionBitWidth = 1;
18357 } else {
18358 for (Value *V : *UserIgnoreList) {
18359 if (isa<PoisonValue>(V))
18360 continue;
18361 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18362 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18363 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18364 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18365 ++BitWidth1;
18366 unsigned BitWidth2 = BitWidth1;
18367 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
18368 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18369 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18370 }
18371 ReductionBitWidth =
18372 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18373 }
18374 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18375 ReductionBitWidth = 8;
18376
18377 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18378 }
18379 }
18380 bool IsTopRoot = NodeIdx == 0;
18381 while (NodeIdx < VectorizableTree.size() &&
18382 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18383 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18384 RootDemotes.push_back(NodeIdx);
18385 ++NodeIdx;
18386 IsTruncRoot = true;
18387 }
18388 bool IsSignedCmp = false;
18389 while (NodeIdx < VectorizableTree.size()) {
18390 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18391 unsigned Limit = 2;
18392 if (IsTopRoot &&
18393 ReductionBitWidth ==
18394 DL->getTypeSizeInBits(
18395 VectorizableTree.front()->Scalars.front()->getType()))
18396 Limit = 3;
18397 unsigned MaxBitWidth = ComputeMaxBitWidth(
18398 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18399 IsTruncRoot, IsSignedCmp);
18400 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18401 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18402 ReductionBitWidth = bit_ceil(MaxBitWidth);
18403 else if (MaxBitWidth == 0)
18404 ReductionBitWidth = 0;
18405 }
18406
18407 for (unsigned Idx : RootDemotes) {
18408 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18409 uint32_t OrigBitWidth =
18410 DL->getTypeSizeInBits(V->getType()->getScalarType());
18411 if (OrigBitWidth > MaxBitWidth) {
18412 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18413 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18414 }
18415 return false;
18416 }))
18417 ToDemote.push_back(Idx);
18418 }
18419 RootDemotes.clear();
18420 IsTopRoot = false;
18421 IsProfitableToDemoteRoot = true;
18422
18423 if (ExtraBitWidthNodes.empty()) {
18424 NodeIdx = VectorizableTree.size();
18425 } else {
18426 unsigned NewIdx = 0;
18427 do {
18428 NewIdx = *ExtraBitWidthNodes.begin();
18429 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18430 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18431 NodeIdx = NewIdx;
18432 IsTruncRoot =
18433 NodeIdx < VectorizableTree.size() &&
18434 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18435 [](const EdgeInfo &EI) {
18436 return EI.EdgeIdx == 0 &&
18437 EI.UserTE->getOpcode() == Instruction::Trunc &&
18438 !EI.UserTE->isAltShuffle();
18439 });
18440 IsSignedCmp =
18441 NodeIdx < VectorizableTree.size() &&
18442 any_of(
18443 VectorizableTree[NodeIdx]->UserTreeIndices,
18444 [&](const EdgeInfo &EI) {
18445 return (EI.UserTE->hasState() &&
18446 EI.UserTE->getOpcode() == Instruction::ICmp) &&
18447 any_of(EI.UserTE->Scalars, [&](Value *V) {
18448 auto *IC = dyn_cast<ICmpInst>(V);
18449 return IC &&
18450 (IC->isSigned() ||
18451 !isKnownNonNegative(IC->getOperand(0),
18452 SimplifyQuery(*DL)) ||
18453 !isKnownNonNegative(IC->getOperand(1),
18454 SimplifyQuery(*DL)));
18455 });
18456 });
18457 }
18458
18459 // If the maximum bit width we compute is less than the width of the roots'
18460 // type, we can proceed with the narrowing. Otherwise, do nothing.
18461 if (MaxBitWidth == 0 ||
18462 MaxBitWidth >=
18463 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18464 ->getBitWidth()) {
18465 if (UserIgnoreList)
18466 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18467 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18468 continue;
18469 }
18470
18471 // Finally, map the values we can demote to the maximum bit width we
18472 // computed.
18473 for (unsigned Idx : ToDemote) {
18474 TreeEntry *TE = VectorizableTree[Idx].get();
18475 if (MinBWs.contains(TE))
18476 continue;
18477 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18478 if (isa<PoisonValue>(R))
18479 return false;
18480 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18481 });
18482 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18483 }
18484 }
18485}
18486
18487 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18488 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18489 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18490 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18491 auto *AA = &AM.getResult<AAManager>(F);
18492 auto *LI = &AM.getResult<LoopAnalysis>(F);
18493 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18494 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18495 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18496 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18497
18498 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18499 if (!Changed)
18500 return PreservedAnalyses::all();
18501
18502 PreservedAnalyses PA;
18503 PA.preserveSet<CFGAnalyses>();
18504 return PA;
18505}
18506
18507 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18508 TargetTransformInfo *TTI_,
18509 TargetLibraryInfo *TLI_, AAResults *AA_,
18510 LoopInfo *LI_, DominatorTree *DT_,
18511 AssumptionCache *AC_, DemandedBits *DB_,
18512 OptimizationRemarkEmitter *ORE_) {
18513 if (!RunSLPVectorization)
18514 return false;
18515 SE = SE_;
18516 TTI = TTI_;
18517 TLI = TLI_;
18518 AA = AA_;
18519 LI = LI_;
18520 DT = DT_;
18521 AC = AC_;
18522 DB = DB_;
18523 DL = &F.getDataLayout();
18524
18525 Stores.clear();
18526 GEPs.clear();
18527 bool Changed = false;
18528
18529 // If the target claims to have no vector registers don't attempt
18530 // vectorization.
18531 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18532 LLVM_DEBUG(
18533 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18534 return false;
18535 }
18536
18537 // Don't vectorize when the attribute NoImplicitFloat is used.
18538 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18539 return false;
18540
18541 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18542
18543 // Use the bottom up slp vectorizer to construct chains that start with
18544 // store instructions.
18545 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18546
18547 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18548 // delete instructions.
18549
18550 // Update DFS numbers now so that we can use them for ordering.
18551 DT->updateDFSNumbers();
18552
18553 // Scan the blocks in the function in post order.
18554 for (auto *BB : post_order(&F.getEntryBlock())) {
18555 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18556 continue;
18557
18558 // Start new block - clear the list of reduction roots.
18559 R.clearReductionData();
18560 collectSeedInstructions(BB);
18561
18562 // Vectorize trees that end at stores.
18563 if (!Stores.empty()) {
18564 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18565 << " underlying objects.\n");
18566 Changed |= vectorizeStoreChains(R);
18567 }
18568
18569 // Vectorize trees that end at reductions.
18570 Changed |= vectorizeChainsInBlock(BB, R);
18571
18572 // Vectorize the index computations of getelementptr instructions. This
18573 // is primarily intended to catch gather-like idioms ending at
18574 // non-consecutive loads.
18575 if (!GEPs.empty()) {
18576 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18577 << " underlying objects.\n");
18578 Changed |= vectorizeGEPIndices(BB, R);
18579 }
18580 }
18581
18582 if (Changed) {
18583 R.optimizeGatherSequence();
18584 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18585 }
18586 return Changed;
18587}
18588
18589std::optional<bool>
18590SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18591 unsigned Idx, unsigned MinVF,
18592 unsigned &Size) {
18593 Size = 0;
18594 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18595 << "\n");
18596 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18597 unsigned VF = Chain.size();
18598
18599 if (!has_single_bit(Sz) ||
18600 !hasFullVectorsOrPowerOf2(
18601 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18602 VF) ||
18603 VF < 2 || VF < MinVF) {
18604 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18605 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18606 // all vector lanes are used.
18607 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18608 return false;
18609 }
18610
18611 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18612 << "\n");
18613
18614 SetVector<Value *> ValOps;
18615 for (Value *V : Chain)
18616 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18617 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18618 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18619 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18620 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18621 bool IsAllowedSize =
18622 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18623 ValOps.size()) ||
18624 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18625 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18626 (!S.getMainOp()->isSafeToRemove() ||
18627 any_of(ValOps.getArrayRef(),
18628 [&](Value *V) {
18629 return !isa<ExtractElementInst>(V) &&
18630 (V->getNumUses() > Chain.size() ||
18631 any_of(V->users(), [&](User *U) {
18632 return !Stores.contains(U);
18633 }));
18634 }))) ||
18635 (ValOps.size() > Chain.size() / 2 && !S)) {
18636 Size = (!IsAllowedSize && S) ? 1 : 2;
18637 return false;
18638 }
18639 }
18640 if (R.isLoadCombineCandidate(Chain))
18641 return true;
18642 R.buildTree(Chain);
18643 // Check if the tree is tiny and the store itself or its value is not vectorized.
18644 if (R.isTreeTinyAndNotFullyVectorizable()) {
18645 if (R.isGathered(Chain.front()) ||
18646 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18647 return std::nullopt;
18648 Size = R.getCanonicalGraphSize();
18649 return false;
18650 }
18651 R.reorderTopToBottom();
18652 R.reorderBottomToTop();
18653 R.transformNodes();
18654 R.buildExternalUses();
18655
18656 R.computeMinimumValueSizes();
18657
18658 Size = R.getCanonicalGraphSize();
18659 if (S && S.getOpcode() == Instruction::Load)
18660 Size = 2; // cut off masked gather small trees
18661 InstructionCost Cost = R.getTreeCost();
18662
18663 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18664 if (Cost < -SLPCostThreshold) {
18665 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18666
18667 using namespace ore;
18668
18669 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18670 cast<StoreInst>(Chain[0]))
18671 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18672 << " and with tree size "
18673 << NV("TreeSize", R.getTreeSize()));
18674
18675 R.vectorizeTree();
18676 return true;
18677 }
18678
18679 return false;
18680}
18681
18682/// Checks if the quadratic mean deviation is less than 90% of the mean size.
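/// For example (illustrative): for tree sizes {4, 4, 4, 4} the mean is 4 and
/// the accumulated squared deviation is 0, so the check succeeds; for sizes
/// {2, 8} the mean is 5, the squared-deviation term is 9, and
/// 9 * 81 / (5 * 5) != 0, so the check fails.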
18683static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18684 bool First) {
18685 unsigned Num = 0;
18686 uint64_t Sum = std::accumulate(
18687 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18688 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18689 unsigned Size = First ? Val.first : Val.second;
18690 if (Size == 1)
18691 return V;
18692 ++Num;
18693 return V + Size;
18694 });
18695 if (Num == 0)
18696 return true;
18697 uint64_t Mean = Sum / Num;
18698 if (Mean == 0)
18699 return true;
18700 uint64_t Dev = std::accumulate(
18701 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18702 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18703 unsigned P = First ? Val.first : Val.second;
18704 if (P == 1)
18705 return V;
18706 return V + (P - Mean) * (P - Mean);
18707 }) /
18708 Num;
18709 return Dev * 81 / (Mean * Mean) == 0;
18710}
18711
18712bool SLPVectorizerPass::vectorizeStores(
18713 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18714 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18715 &Visited) {
18716 // We may run into multiple chains that merge into a single chain. We mark the
18717 // stores that we vectorized so that we don't visit the same store twice.
18718 BoUpSLP::ValueSet VectorizedStores;
18719 bool Changed = false;
18720
18721 struct StoreDistCompare {
18722 bool operator()(const std::pair<unsigned, int> &Op1,
18723 const std::pair<unsigned, int> &Op2) const {
18724 return Op1.second < Op2.second;
18725 }
18726 };
18727 // A set of pairs (index of store in Stores array ref, Distance of the store
18728 // address relative to base store address in units).
18729 using StoreIndexToDistSet =
18730 std::set<std::pair<unsigned, int>, StoreDistCompare>;
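// For example (illustrative): stores at indices 0, 1 and 2 located at
// distances 0, 2 and 1 from the base address are kept ordered by distance as
// {{0, 0}, {2, 1}, {1, 2}}.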
18731 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18732 int PrevDist = -1;
18733 BoUpSLP::ValueList Operands;
18734 // Collect the chain into a list.
18735 for (auto [Idx, Data] : enumerate(Set)) {
18736 if (Operands.empty() || Data.second - PrevDist == 1) {
18737 Operands.push_back(Stores[Data.first]);
18738 PrevDist = Data.second;
18739 if (Idx != Set.size() - 1)
18740 continue;
18741 }
18742 auto E = make_scope_exit([&, &DataVar = Data]() {
18743 Operands.clear();
18744 Operands.push_back(Stores[DataVar.first]);
18745 PrevDist = DataVar.second;
18746 });
18747
18748 if (Operands.size() <= 1 ||
18749 !Visited
18750 .insert({Operands.front(),
18751 cast<StoreInst>(Operands.front())->getValueOperand(),
18752 Operands.back(),
18753 cast<StoreInst>(Operands.back())->getValueOperand(),
18754 Operands.size()})
18755 .second)
18756 continue;
18757
18758 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18759 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18760 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18761
18762 unsigned MaxVF =
18763 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18764 auto *Store = cast<StoreInst>(Operands[0]);
18765 Type *StoreTy = Store->getValueOperand()->getType();
18766 Type *ValueTy = StoreTy;
18767 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18768 ValueTy = Trunc->getSrcTy();
18769 unsigned MinVF = std::max<unsigned>(
18770 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18771 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18772 ValueTy)));
18773
18774 if (MaxVF < MinVF) {
18775 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18776 << ") < "
18777 << "MinVF (" << MinVF << ")\n");
18778 continue;
18779 }
18780
18781 unsigned NonPowerOf2VF = 0;
18782 if (VectorizeNonPowerOf2) {
18783 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18784 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18785 // lanes are used.
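// For example (illustrative): a chain of 7 candidate stores (assuming 7 is
// within [MinVF, MaxVF]) gives CandVF = 7, and since 7 + 1 = 8 is a
// power-of-2, VF 7 is also tried.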
18786 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18787 if (has_single_bit(CandVF + 1)) {
18788 NonPowerOf2VF = CandVF;
18789 assert(NonPowerOf2VF != MaxVF &&
18790 "Non-power-of-2 VF should not be equal to MaxVF");
18791 }
18792 }
18793
18794 unsigned MaxRegVF = MaxVF;
18795 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18796 if (MaxVF < MinVF) {
18797 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18798 << ") < "
18799 << "MinVF (" << MinVF << ")\n");
18800 continue;
18801 }
18802
18803 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18804 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18805 unsigned Size = MinVF;
18806 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18807 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18808 Size *= 2;
18809 });
18810 unsigned End = Operands.size();
18811 unsigned Repeat = 0;
18812 constexpr unsigned MaxAttempts = 4;
18813 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18814 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18815 P.first = P.second = 1;
18816 });
18818 auto IsNotVectorized = [](bool First,
18819 const std::pair<unsigned, unsigned> &P) {
18820 return First ? P.first > 0 : P.second > 0;
18821 };
18822 auto IsVectorized = [](bool First,
18823 const std::pair<unsigned, unsigned> &P) {
18824 return First ? P.first == 0 : P.second == 0;
18825 };
18826 auto VFIsProfitable = [](bool First, unsigned Size,
18827 const std::pair<unsigned, unsigned> &P) {
18828 return First ? Size >= P.first : Size >= P.second;
18829 };
18830 auto FirstSizeSame = [](unsigned Size,
18831 const std::pair<unsigned, unsigned> &P) {
18832 return Size == P.first;
18833 };
18834 while (true) {
18835 ++Repeat;
18836 bool RepeatChanged = false;
18837 bool AnyProfitableGraph = false;
18838 for (unsigned Size : CandidateVFs) {
18839 AnyProfitableGraph = false;
18840 unsigned StartIdx = std::distance(
18841 RangeSizes.begin(),
18842 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18843 std::placeholders::_1)));
18844 while (StartIdx < End) {
18845 unsigned EndIdx =
18846 std::distance(RangeSizes.begin(),
18847 find_if(RangeSizes.drop_front(StartIdx),
18848 std::bind(IsVectorized, Size >= MaxRegVF,
18849 std::placeholders::_1)));
18850 unsigned Sz = EndIdx >= End ? End : EndIdx;
18851 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18852 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18853 Size >= MaxRegVF)) {
18854 ++Cnt;
18855 continue;
18856 }
18857 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18858 assert(all_of(Slice,
18859 [&](Value *V) {
18860 return cast<StoreInst>(V)
18861 ->getValueOperand()
18862 ->getType() ==
18863 cast<StoreInst>(Slice.front())
18864 ->getValueOperand()
18865 ->getType();
18866 }) &&
18867 "Expected all operands of same type.");
18868 if (!NonSchedulable.empty()) {
18869 auto [NonSchedSizeMax, NonSchedSizeMin] =
18870 NonSchedulable.lookup(Slice.front());
18871 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18872 Cnt += NonSchedSizeMax;
18873 continue;
18874 }
18875 }
18876 unsigned TreeSize;
18877 std::optional<bool> Res =
18878 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18879 if (!Res) {
18880 NonSchedulable
18881 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18882 .first->getSecond()
18883 .second = Size;
18884 } else if (*Res) {
18885 // Mark the vectorized stores so that we don't vectorize them
18886 // again.
18887 VectorizedStores.insert(Slice.begin(), Slice.end());
18890 AnyProfitableGraph = RepeatChanged = Changed = true;
18891 // If we vectorized initial block, no need to try to vectorize
18892 // it again.
18893 for_each(RangeSizes.slice(Cnt, Size),
18894 [](std::pair<unsigned, unsigned> &P) {
18895 P.first = P.second = 0;
18896 });
18897 if (Cnt < StartIdx + MinVF) {
18898 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18899 [](std::pair<unsigned, unsigned> &P) {
18900 P.first = P.second = 0;
18901 });
18902 StartIdx = Cnt + Size;
18903 }
18904 if (Cnt > Sz - Size - MinVF) {
18905 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18906 [](std::pair<unsigned, unsigned> &P) {
18907 P.first = P.second = 0;
18908 });
18909 if (Sz == End)
18910 End = Cnt;
18911 Sz = Cnt;
18912 }
18913 Cnt += Size;
18914 continue;
18915 }
18916 if (Size > 2 && Res &&
18917 !all_of(RangeSizes.slice(Cnt, Size),
18918 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18919 std::placeholders::_1))) {
18920 Cnt += Size;
18921 continue;
18922 }
18923 // For very big VFs, check that we are not rebuilding the same trees, just
18924 // with a larger number of elements.
18925 if (Size > MaxRegVF && TreeSize > 1 &&
18926 all_of(RangeSizes.slice(Cnt, Size),
18927 std::bind(FirstSizeSame, TreeSize,
18928 std::placeholders::_1))) {
18929 Cnt += Size;
18930 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18931 ++Cnt;
18932 continue;
18933 }
18934 if (TreeSize > 1)
18935 for_each(RangeSizes.slice(Cnt, Size),
18936 [&](std::pair<unsigned, unsigned> &P) {
18937 if (Size >= MaxRegVF)
18938 P.second = std::max(P.second, TreeSize);
18939 else
18940 P.first = std::max(P.first, TreeSize);
18941 });
18942 ++Cnt;
18943 AnyProfitableGraph = true;
18944 }
18945 if (StartIdx >= End)
18946 break;
18947 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18948 AnyProfitableGraph = true;
18949 StartIdx = std::distance(
18950 RangeSizes.begin(),
18951 find_if(RangeSizes.drop_front(Sz),
18952 std::bind(IsNotVectorized, Size >= MaxRegVF,
18953 std::placeholders::_1)));
18954 }
18955 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18956 break;
18957 }
18958 // All values vectorized - exit.
18959 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18960 return P.first == 0 && P.second == 0;
18961 }))
18962 break;
18963 // Check if we have tried all attempts or there is no need for further attempts at all.
18964 if (Repeat >= MaxAttempts ||
18965 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18966 break;
18967 constexpr unsigned StoresLimit = 64;
18968 const unsigned MaxTotalNum = std::min<unsigned>(
18969 Operands.size(),
18970 static_cast<unsigned>(
18971 End -
18972 std::distance(
18973 RangeSizes.begin(),
18974 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18975 std::placeholders::_1))) +
18976 1));
18977 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18978 unsigned Limit =
18979 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18980 CandidateVFs.clear();
18981 if (bit_floor(Limit) == VF)
18982 CandidateVFs.push_back(Limit);
18983 if (VF > MaxTotalNum || VF >= StoresLimit)
18984 break;
18985 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18986 if (P.first != 0)
18987 P.first = std::max(P.second, P.first);
18988 });
18989 // Last attempt to vectorize max number of elements, if all previous
18990 // attempts were unsuccessful because of the cost issues.
18991 CandidateVFs.push_back(VF);
18992 }
18993 }
18994 };
18995
18996 // Stores pairs (first: index of the store in the Stores array ref whose
18997 // address is taken as the base; second: sorted set of pairs {index, dist},
18998 // which are the indices of stores in the set and their store location
18999 // distances relative to the base address).
19000
19001 // Need to store the index of the very first store separately, since the set
19002 // may be reordered after the insertion and the first store may be moved. This
19003 // container allows to reduce number of calls of getPointersDiff() function.
19004 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
19005 // Inserts the specified store SI with the given index Idx into the set of
19006 // stores. If a store with the same distance is already found - stop the
19007 // insertion and try to vectorize the already found stores. If some stores
19008 // from this sequence were not vectorized - try to vectorize them together
19009 // with the new store later. But this logic is applied only to the stores
19010 // that come before the previous store with the same distance.
19011 // Example:
19012 // 1. store x, %p
19013 // 2. store y, %p+1
19014 // 3. store z, %p+2
19015 // 4. store a, %p
19016 // 5. store b, %p+3
19017 // - Scan this from the last to first store. The very first bunch of stores is
19018 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
19019 // vector).
19020 // - The next store in the list - #1 - has the same distance from store #5 as
19021 // the store #4.
19022 // - Try to vectorize sequence of stores 4,2,3,5.
19023 // - If all these stores are vectorized - just drop them.
19024 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
19025 // - Start new stores sequence.
19026 // The new bunch of stores is {1, {1, 0}}.
19027 // - Add the stores from previous sequence, that were not vectorized.
19028 // Here we consider the stores in reverse order, rather than the order in
19029 // which they appear in the IR (Stores are reversed already, see the vectorizeStoreChains() function).
19030 // Store #3 can be added -> comes after store #4 with the same distance as
19031 // store #1.
19032 // Store #5 cannot be added - comes before store #4.
19033 // This logic helps to improve compile time: we assume that the stores after
19034 // the previous store with the same distance most likely have memory
19035 // dependencies, so there is no need to waste compile time trying to vectorize them.
19036 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19037 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19038 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19039 std::optional<int> Diff = getPointersDiff(
19040 Stores[Set.first]->getValueOperand()->getType(),
19041 Stores[Set.first]->getPointerOperand(),
19042 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
19043 /*StrictCheck=*/true);
19044 if (!Diff)
19045 continue;
19046 auto It = Set.second.find(std::make_pair(Idx, *Diff));
19047 if (It == Set.second.end()) {
19048 Set.second.emplace(Idx, *Diff);
19049 return;
19050 }
19051 // Try to vectorize the first found set to avoid duplicate analysis.
19052 TryToVectorize(Set.second);
19053 unsigned ItIdx = It->first;
19054 int ItDist = It->second;
19055 StoreIndexToDistSet PrevSet;
19056 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19057 [&](const std::pair<unsigned, int> &Pair) {
19058 return Pair.first > ItIdx;
19059 });
19060 Set.second.clear();
19061 Set.first = Idx;
19062 Set.second.emplace(Idx, 0);
19063 // Insert stores that followed previous match to try to vectorize them
19064 // with this store.
19065 unsigned StartIdx = ItIdx + 1;
19066 SmallBitVector UsedStores(Idx - StartIdx);
19067 // Distances to previously found dup store (or this store, since they
19068 // store to the same addresses).
19069 SmallVector<int> Dists(Idx - StartIdx, 0);
19070 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19071 // Do not try to vectorize sequences we have already tried.
19072 if (VectorizedStores.contains(Stores[Pair.first]))
19073 break;
19074 unsigned BI = Pair.first - StartIdx;
19075 UsedStores.set(BI);
19076 Dists[BI] = Pair.second - ItDist;
19077 }
19078 for (unsigned I = StartIdx; I < Idx; ++I) {
19079 unsigned BI = I - StartIdx;
19080 if (UsedStores.test(BI))
19081 Set.second.emplace(I, Dists[BI]);
19082 }
19083 return;
19084 }
19085 auto &Res = SortedStores.emplace_back();
19086 Res.first = Idx;
19087 Res.second.emplace(Idx, 0);
19088 };
19089 Type *PrevValTy = nullptr;
19090 for (auto [I, SI] : enumerate(Stores)) {
19091 if (R.isDeleted(SI))
19092 continue;
19093 if (!PrevValTy)
19094 PrevValTy = SI->getValueOperand()->getType();
19095 // Check that we do not try to vectorize stores of different types.
19096 if (PrevValTy != SI->getValueOperand()->getType()) {
19097 for (auto &Set : SortedStores)
19098 TryToVectorize(Set.second);
19099 SortedStores.clear();
19100 PrevValTy = SI->getValueOperand()->getType();
19101 }
19102 FillStoresSet(I, SI);
19103 }
19104
19105 // Final vectorization attempt.
19106 for (auto &Set : SortedStores)
19107 TryToVectorize(Set.second);
19108
19109 return Changed;
19110}
19111
19112void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19113 // Initialize the collections. We will make a single pass over the block.
19114 Stores.clear();
19115 GEPs.clear();
19116
19117 // Visit the store and getelementptr instructions in BB and organize them in
19118 // Stores and GEPs according to the underlying objects of their pointer
19119 // operands.
19120 for (Instruction &I : *BB) {
19121 // Ignore store instructions that are volatile or have a pointer operand
19122 // that doesn't point to a scalar type.
19123 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19124 if (!SI->isSimple())
19125 continue;
19126 if (!isValidElementType(SI->getValueOperand()->getType()))
19127 continue;
19128 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19129 }
19130
19131 // Ignore getelementptr instructions that have more than one index, a
19132 // constant index, or a pointer operand that doesn't point to a scalar
19133 // type.
19134 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19135 if (GEP->getNumIndices() != 1)
19136 continue;
19137 Value *Idx = GEP->idx_begin()->get();
19138 if (isa<Constant>(Idx))
19139 continue;
19140 if (!isValidElementType(Idx->getType()))
19141 continue;
19142 if (GEP->getType()->isVectorTy())
19143 continue;
19144 GEPs[GEP->getPointerOperand()].push_back(GEP);
19145 }
19146 }
19147}
19148
19149bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19150 bool MaxVFOnly) {
19151 if (VL.size() < 2)
19152 return false;
19153
19154 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19155 << VL.size() << ".\n");
19156
19157 // Check that all of the parts are instructions of the same type;
19158 // we permit an alternate opcode via InstructionsState.
19159 InstructionsState S = getSameOpcode(VL, *TLI);
19160 if (!S)
19161 return false;
19162
19163 Instruction *I0 = S.getMainOp();
19164 // Make sure invalid types (including vector type) are rejected before
19165 // determining vectorization factor for scalar instructions.
19166 for (Value *V : VL) {
19167 Type *Ty = V->getType();
19168 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19169 // NOTE: the following will give the user an internal LLVM type name, which
19170 // may not be useful.
19171 R.getORE()->emit([&]() {
19172 std::string TypeStr;
19173 llvm::raw_string_ostream rso(TypeStr);
19174 Ty->print(rso);
19175 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19176 << "Cannot SLP vectorize list: type "
19177 << TypeStr + " is unsupported by vectorizer";
19178 });
19179 return false;
19180 }
19181 }
19182
19183 Type *ScalarTy = getValueType(VL[0]);
19184 unsigned Sz = R.getVectorElementSize(I0);
19185 unsigned MinVF = R.getMinVF(Sz);
19186 unsigned MaxVF = std::max<unsigned>(
19187 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19188 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19189 if (MaxVF < 2) {
19190 R.getORE()->emit([&]() {
19191 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19192 << "Cannot SLP vectorize list: vectorization factor "
19193 << "less than 2 is not supported";
19194 });
19195 return false;
19196 }
19197
19198 bool Changed = false;
19199 bool CandidateFound = false;
19200 InstructionCost MinCost = SLPCostThreshold.getValue();
19201
19202 unsigned NextInst = 0, MaxInst = VL.size();
19203 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19204 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
19205 // No actual vectorization should happen if the number of parts is the same
19206 // as the provided vectorization factor (i.e. the scalar type is used for
19207 // vector code during codegen).
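// For example (illustrative): if legalization would split the chosen vector
// type into VF separate parts, the generated "vector" code is effectively
// scalar code, so this VF is skipped.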
19208 auto *VecTy = getWidenedType(ScalarTy, VF);
19209 if (TTI->getNumberOfParts(VecTy) == VF)
19210 continue;
19211 for (unsigned I = NextInst; I < MaxInst; ++I) {
19212 unsigned ActualVF = std::min(MaxInst - I, VF);
19213
19214 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19215 continue;
19216
19217 if (MaxVFOnly && ActualVF < MaxVF)
19218 break;
19219 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19220 break;
19221
19222 SmallVector<Value *> Ops(ActualVF, nullptr);
19223 unsigned Idx = 0;
19224 for (Value *V : VL.drop_front(I)) {
19225 // Check that a previous iteration of this loop did not delete the
19226 // Value.
19227 if (auto *Inst = dyn_cast<Instruction>(V);
19228 !Inst || !R.isDeleted(Inst)) {
19229 Ops[Idx] = V;
19230 ++Idx;
19231 if (Idx == ActualVF)
19232 break;
19233 }
19234 }
19235 // Not enough vectorizable instructions - exit.
19236 if (Idx != ActualVF)
19237 break;
19238
19239 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19240 << "\n");
19241
19242 R.buildTree(Ops);
19243 if (R.isTreeTinyAndNotFullyVectorizable())
19244 continue;
19245 R.reorderTopToBottom();
19246 R.reorderBottomToTop(
19247 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19248 !R.doesRootHaveInTreeUses());
19249 R.transformNodes();
19250 R.buildExternalUses();
19251
19252 R.computeMinimumValueSizes();
19253 InstructionCost Cost = R.getTreeCost();
19254 CandidateFound = true;
19255 MinCost = std::min(MinCost, Cost);
19256
19257 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19258 << " for VF=" << ActualVF << "\n");
19259 if (Cost < -SLPCostThreshold) {
19260 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19261 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19262 cast<Instruction>(Ops[0]))
19263 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19264 << " and with tree size "
19265 << ore::NV("TreeSize", R.getTreeSize()));
19266
19267 R.vectorizeTree();
19268 // Move to the next bundle.
19269 I += VF - 1;
19270 NextInst = I + 1;
19271 Changed = true;
19272 }
19273 }
19274 }
19275
19276 if (!Changed && CandidateFound) {
19277 R.getORE()->emit([&]() {
19278 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19279 << "List vectorization was possible but not beneficial with cost "
19280 << ore::NV("Cost", MinCost) << " >= "
19281 << ore::NV("Treshold", -SLPCostThreshold);
19282 });
19283 } else if (!Changed) {
19284 R.getORE()->emit([&]() {
19285 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19286 << "Cannot SLP vectorize list: vectorization was impossible"
19287 << " with available vectorization factors";
19288 });
19289 }
19290 return Changed;
19291}
19292
19293bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19294 if (!I)
19295 return false;
19296
19297 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19298 return false;
19299
19300 Value *P = I->getParent();
19301
19302 // Vectorize in current basic block only.
19303 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19304 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19305 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19306 R.isDeleted(Op0) || R.isDeleted(Op1))
19307 return false;
19308
19309 // First collect all possible candidates
19310 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19311 Candidates.emplace_back(Op0, Op1);
19312
19313 auto *A = dyn_cast<BinaryOperator>(Op0);
19314 auto *B = dyn_cast<BinaryOperator>(Op1);
19315 // Try to skip B.
19316 if (A && B && B->hasOneUse()) {
19317 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19318 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19319 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19320 Candidates.emplace_back(A, B0);
19321 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19322 Candidates.emplace_back(A, B1);
19323 }
19324 // Try to skip A.
19325 if (B && A && A->hasOneUse()) {
19326 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19327 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19328 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19329 Candidates.emplace_back(A0, B);
19330 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19331 Candidates.emplace_back(A1, B);
19332 }
19333
19334 if (Candidates.size() == 1)
19335 return tryToVectorizeList({Op0, Op1}, R);
19336
19337 // We have multiple options. Try to pick the single best.
19338 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19339 if (!BestCandidate)
19340 return false;
19341 return tryToVectorizeList(
19342 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19343}
19344
19345namespace {
19346
19347/// Model horizontal reductions.
19348///
19349/// A horizontal reduction is a tree of reduction instructions that has values
19350/// that can be put into a vector as its leaves. For example:
19351///
19352/// mul mul mul mul
19353/// \ / \ /
19354/// + +
19355/// \ /
19356/// +
19357/// This tree has "mul" as its leaf values and "+" as its reduction
19358/// instructions. A reduction can feed into a store or a binary operation
19359/// feeding a phi.
19360/// ...
19361/// \ /
19362/// +
19363/// |
19364/// phi +=
19365///
19366/// Or:
19367/// ...
19368/// \ /
19369/// +
19370/// |
19371/// *p =
19372///
19373class HorizontalReduction {
19374 using ReductionOpsType = SmallVector<Value *, 16>;
19375 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19376 ReductionOpsListType ReductionOps;
19377 /// List of possibly reduced values.
19378 SmallVector<SmallVector<Value *>> ReducedVals;
19379 /// Maps reduced value to the corresponding reduction operation.
19380 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
19381 WeakTrackingVH ReductionRoot;
19382 /// The type of reduction operation.
19383 RecurKind RdxKind;
19384 /// Checks if the optimization of original scalar identity operations on
19385 /// matched horizontal reductions is enabled and allowed.
19386 bool IsSupportedHorRdxIdentityOp = false;
19387
19388 static bool isCmpSelMinMax(Instruction *I) {
19389 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19390 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
19391 }
19392
19393 // And/or are potentially poison-safe logical patterns like:
19394 // select x, y, false
19395 // select x, true, y
19396 static bool isBoolLogicOp(Instruction *I) {
19397 return isa<SelectInst>(I) &&
19398 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19399 }
19400
19401 /// Checks if instruction is associative and can be vectorized.
19402 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19403 if (Kind == RecurKind::None)
19404 return false;
19405
19406 // Integer ops that map to select instructions or intrinsics are fine.
19407 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19408 isBoolLogicOp(I))
19409 return true;
19410
19411 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19412 // FP min/max are associative except for NaN and -0.0. We do not
19413 // have to rule out -0.0 here because the intrinsic semantics do not
19414 // specify a fixed result for it.
19415 return I->getFastMathFlags().noNaNs();
19416 }
19417
19418 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19419 return true;
19420
19421 return I->isAssociative();
19422 }
19423
19424 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19425 // Poison-safe 'or' takes the form: select X, true, Y
19426 // To make that work with the normal operand processing, we skip the
19427 // true value operand.
19428 // TODO: Change the code and data structures to handle this without a hack.
19429 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19430 return I->getOperand(2);
19431 return I->getOperand(Index);
19432 }
19433
19434 /// Creates reduction operation with the current opcode.
19435 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19436 Value *RHS, const Twine &Name, bool UseSelect) {
19437 switch (Kind) {
19438 case RecurKind::Or: {
19439 if (UseSelect &&
19440        LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19441      return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19442 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19443 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19444 Name);
19445 }
19446 case RecurKind::And: {
19447 if (UseSelect &&
19448        LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19449      return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19450 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19451 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19452 Name);
19453 }
19454 case RecurKind::Add:
19455 case RecurKind::Mul:
19456 case RecurKind::Xor:
19457 case RecurKind::FAdd:
19458 case RecurKind::FMul: {
19459 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19460 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19461 Name);
19462 }
19463 case RecurKind::SMax:
19464 case RecurKind::SMin:
19465 case RecurKind::UMax:
19466 case RecurKind::UMin:
19467 if (UseSelect) {
19468      CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
19469      Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
19470 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19471 }
19472 [[fallthrough]];
19473 case RecurKind::FMax:
19474 case RecurKind::FMin:
19475 case RecurKind::FMaximum:
19476 case RecurKind::FMinimum: {
19477      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
19478      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
19479 }
19480 default:
19481 llvm_unreachable("Unknown reduction operation.");
19482 }
19483 }
19484
19485 /// Creates reduction operation with the current opcode with the IR flags
19486 /// from \p ReductionOps, dropping nuw/nsw flags.
19487 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19488 Value *RHS, const Twine &Name,
19489 const ReductionOpsListType &ReductionOps) {
19490 bool UseSelect = ReductionOps.size() == 2 ||
19491 // Logical or/and.
19492 (ReductionOps.size() == 1 &&
19493 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19494 assert((!UseSelect || ReductionOps.size() != 2 ||
19495 isa<SelectInst>(ReductionOps[1][0])) &&
19496 "Expected cmp + select pairs for reduction");
19497 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19498    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19499      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19500 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19501 /*IncludeWrapFlags=*/false);
19502 propagateIRFlags(Op, ReductionOps[1], nullptr,
19503 /*IncludeWrapFlags=*/false);
19504 return Op;
19505 }
19506 }
19507 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19508 return Op;
19509 }
19510
19511public:
19512 static RecurKind getRdxKind(Value *V) {
19513 auto *I = dyn_cast<Instruction>(V);
19514 if (!I)
19515 return RecurKind::None;
19516 if (match(I, m_Add(m_Value(), m_Value())))
19517 return RecurKind::Add;
19518 if (match(I, m_Mul(m_Value(), m_Value())))
19519 return RecurKind::Mul;
19520 if (match(I, m_And(m_Value(), m_Value())) ||
19521        match(I, m_LogicalAnd(m_Value(), m_Value())))
19522      return RecurKind::And;
19523 if (match(I, m_Or(m_Value(), m_Value())) ||
19524        match(I, m_LogicalOr(m_Value(), m_Value())))
19525      return RecurKind::Or;
19526 if (match(I, m_Xor(m_Value(), m_Value())))
19527 return RecurKind::Xor;
19528 if (match(I, m_FAdd(m_Value(), m_Value())))
19529 return RecurKind::FAdd;
19530 if (match(I, m_FMul(m_Value(), m_Value())))
19531 return RecurKind::FMul;
19532
19533 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19534 return RecurKind::FMax;
19535 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19536 return RecurKind::FMin;
19537
19538 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19539 return RecurKind::FMaximum;
19540 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19541 return RecurKind::FMinimum;
19542 // This matches either cmp+select or intrinsics. SLP is expected to handle
19543 // either form.
19544 // TODO: If we are canonicalizing to intrinsics, we can remove several
19545 // special-case paths that deal with selects.
19546 if (match(I, m_SMax(m_Value(), m_Value())))
19547 return RecurKind::SMax;
19548 if (match(I, m_SMin(m_Value(), m_Value())))
19549 return RecurKind::SMin;
19550 if (match(I, m_UMax(m_Value(), m_Value())))
19551 return RecurKind::UMax;
19552 if (match(I, m_UMin(m_Value(), m_Value())))
19553 return RecurKind::UMin;
19554
19555 if (auto *Select = dyn_cast<SelectInst>(I)) {
19556 // Try harder: look for min/max pattern based on instructions producing
19557 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19558 // During the intermediate stages of SLP, it's very common to have
19559 // pattern like this (since optimizeGatherSequence is run only once
19560 // at the end):
19561 // %1 = extractelement <2 x i32> %a, i32 0
19562 // %2 = extractelement <2 x i32> %a, i32 1
19563 // %cond = icmp sgt i32 %1, %2
19564 // %3 = extractelement <2 x i32> %a, i32 0
19565 // %4 = extractelement <2 x i32> %a, i32 1
19566 // %select = select i1 %cond, i32 %3, i32 %4
19567 CmpPredicate Pred;
19568 Instruction *L1;
19569 Instruction *L2;
19570
19571 Value *LHS = Select->getTrueValue();
19572 Value *RHS = Select->getFalseValue();
19573 Value *Cond = Select->getCondition();
19574
19575 // TODO: Support inverse predicates.
19576 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19577 if (!isa<ExtractElementInst>(RHS) ||
19578 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19579 return RecurKind::None;
19580 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19581 if (!isa<ExtractElementInst>(LHS) ||
19582 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19583 return RecurKind::None;
19584 } else {
19585 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19586 return RecurKind::None;
19587 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19588 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19589 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19590 return RecurKind::None;
19591 }
19592
19593 switch (Pred) {
19594 default:
19595 return RecurKind::None;
19596 case CmpInst::ICMP_SGT:
19597 case CmpInst::ICMP_SGE:
19598 return RecurKind::SMax;
19599 case CmpInst::ICMP_SLT:
19600 case CmpInst::ICMP_SLE:
19601 return RecurKind::SMin;
19602 case CmpInst::ICMP_UGT:
19603 case CmpInst::ICMP_UGE:
19604 return RecurKind::UMax;
19605 case CmpInst::ICMP_ULT:
19606 case CmpInst::ICMP_ULE:
19607 return RecurKind::UMin;
19608 }
19609 }
19610 return RecurKind::None;
19611 }
19612
19613 /// Get the index of the first operand.
19614 static unsigned getFirstOperandIndex(Instruction *I) {
19615 return isCmpSelMinMax(I) ? 1 : 0;
19616 }
19617
19618private:
19619 /// Total number of operands in the reduction operation.
19620 static unsigned getNumberOfOperands(Instruction *I) {
19621 return isCmpSelMinMax(I) ? 3 : 2;
19622 }
19623
19624 /// Checks if the instruction is in basic block \p BB.
19625 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19626 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19627 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19628 auto *Sel = cast<SelectInst>(I);
19629 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19630 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19631 }
19632 return I->getParent() == BB;
19633 }
19634
19635 /// Expected number of uses for reduction operations/reduced values.
19636 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19637 if (IsCmpSelMinMax) {
19638 // SelectInst must be used twice while the condition op must have single
19639 // use only.
19640 if (auto *Sel = dyn_cast<SelectInst>(I))
19641 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19642 return I->hasNUses(2);
19643 }
19644
19645 // Arithmetic reduction operation must be used once only.
19646 return I->hasOneUse();
19647 }
19648
19649 /// Initializes the list of reduction operations.
19650 void initReductionOps(Instruction *I) {
19651 if (isCmpSelMinMax(I))
19652 ReductionOps.assign(2, ReductionOpsType());
19653 else
19654 ReductionOps.assign(1, ReductionOpsType());
19655 }
19656
19657 /// Add all reduction operations for the reduction instruction \p I.
19658 void addReductionOps(Instruction *I) {
19659 if (isCmpSelMinMax(I)) {
19660 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19661 ReductionOps[1].emplace_back(I);
19662 } else {
19663 ReductionOps[0].emplace_back(I);
19664 }
19665 }
19666
19667 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19668 int Sz = Data.size();
19669 auto *I = dyn_cast<Instruction>(Data.front());
19670 return Sz > 1 || isConstant(Data.front()) ||
19671 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19672 }
19673
19674public:
19675 HorizontalReduction() = default;
19676
19677 /// Try to find a reduction tree.
19678 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19679 ScalarEvolution &SE, const DataLayout &DL,
19680 const TargetLibraryInfo &TLI) {
19681 RdxKind = HorizontalReduction::getRdxKind(Root);
19682 if (!isVectorizable(RdxKind, Root))
19683 return false;
19684
19685 // Analyze "regular" integer/FP types for reductions - no target-specific
19686 // types or pointers.
19687 Type *Ty = Root->getType();
19688 if (!isValidElementType(Ty) || Ty->isPointerTy())
19689 return false;
19690
19691 // Though the ultimate reduction may have multiple uses, its condition must
19692 // have only single use.
19693 if (auto *Sel = dyn_cast<SelectInst>(Root))
19694 if (!Sel->getCondition()->hasOneUse())
19695 return false;
19696
19697 ReductionRoot = Root;
19698
19699 // Iterate through all the operands of the possible reduction tree and
19700 // gather all the reduced values, sorting them by their value id.
19701 BasicBlock *BB = Root->getParent();
19702 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19703    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19704        1, std::make_pair(Root, 0));
19705 // Checks if the operands of the \p TreeN instruction are also reduction
19706 // operations or should be treated as reduced values or an extra argument,
19707 // which is not part of the reduction.
19708 auto CheckOperands = [&](Instruction *TreeN,
19709 SmallVectorImpl<Value *> &PossibleReducedVals,
19710 SmallVectorImpl<Instruction *> &ReductionOps,
19711 unsigned Level) {
19712 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19713 getNumberOfOperands(TreeN)))) {
19714 Value *EdgeVal = getRdxOperand(TreeN, I);
19715 ReducedValsToOps[EdgeVal].push_back(TreeN);
19716 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19717 // If the edge is not an instruction, or it is different from the main
19718 // reduction opcode or has too many uses - possible reduced value.
19719 // Also, do not try to reduce const values, if the operation is not
19720 // foldable.
19721 if (!EdgeInst || Level > RecursionMaxDepth ||
19722 getRdxKind(EdgeInst) != RdxKind ||
19723 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19724 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19725 !isVectorizable(RdxKind, EdgeInst) ||
19726 (R.isAnalyzedReductionRoot(EdgeInst) &&
19727 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19728 PossibleReducedVals.push_back(EdgeVal);
19729 continue;
19730 }
19731 ReductionOps.push_back(EdgeInst);
19732 }
19733 };
19734 // Try to regroup reduced values so that it gets more profitable to try to
19735 // reduce them. Values are grouped by their value ids, instructions - by
19736 // instruction op id and/or alternate op id, plus do extra analysis for
19737    // loads (grouping them by the distance between pointers) and cmp
19738 // instructions (grouping them by the predicate).
19739    SmallMapVector<
19740        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19741        8>
19742 PossibleReducedVals;
19743 initReductionOps(Root);
19744    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
19745    SmallSet<size_t, 2> LoadKeyUsed;
19746
19747 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19748 Key = hash_combine(hash_value(LI->getParent()), Key);
19749 Value *Ptr =
19750          getUnderlyingObject(LI->getPointerOperand());
19751      if (!LoadKeyUsed.insert(Key).second) {
19752 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19753 if (LIt != LoadsMap.end()) {
19754 for (LoadInst *RLI : LIt->second) {
19755 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19756 LI->getType(), LI->getPointerOperand(), DL, SE,
19757 /*StrictCheck=*/true))
19758 return hash_value(RLI->getPointerOperand());
19759 }
19760 for (LoadInst *RLI : LIt->second) {
19761            if (arePointersCompatible(RLI->getPointerOperand(),
19762                                      LI->getPointerOperand(), TLI)) {
19763 hash_code SubKey = hash_value(RLI->getPointerOperand());
19764 return SubKey;
19765 }
19766 }
19767 if (LIt->second.size() > 2) {
19768 hash_code SubKey =
19769 hash_value(LIt->second.back()->getPointerOperand());
19770 return SubKey;
19771 }
19772 }
19773 }
19774 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19775 .first->second.push_back(LI);
19776 return hash_value(LI->getPointerOperand());
19777 };
19778
19779 while (!Worklist.empty()) {
19780 auto [TreeN, Level] = Worklist.pop_back_val();
19781 SmallVector<Value *> PossibleRedVals;
19782 SmallVector<Instruction *> PossibleReductionOps;
19783 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19784 addReductionOps(TreeN);
19785 // Add reduction values. The values are sorted for better vectorization
19786 // results.
19787 for (Value *V : PossibleRedVals) {
19788 size_t Key, Idx;
19789 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19790 /*AllowAlternate=*/false);
19791 ++PossibleReducedVals[Key][Idx]
19792 .insert(std::make_pair(V, 0))
19793 .first->second;
19794 }
19795 for (Instruction *I : reverse(PossibleReductionOps))
19796 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19797 }
19798 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19799      // Sort values by the total number of value kinds to start the reduction
19800      // from the longest possible sequences of reduced values.
19801 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19802 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19803 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19804 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19805 It != E; ++It) {
19806 PossibleRedValsVect.emplace_back();
19807 auto RedValsVect = It->second.takeVector();
19808 stable_sort(RedValsVect, llvm::less_second());
19809 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19810 PossibleRedValsVect.back().append(Data.second, Data.first);
19811 }
19812 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19813 return P1.size() > P2.size();
19814 });
19815 int NewIdx = -1;
19816 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19817 if (NewIdx < 0 ||
19818 (!isGoodForReduction(Data) &&
19819 (!isa<LoadInst>(Data.front()) ||
19820 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19821             getUnderlyingObject(
19822                 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19823                 getUnderlyingObject(
19824                     cast<LoadInst>(ReducedVals[NewIdx].front())
19825 ->getPointerOperand())))) {
19826 NewIdx = ReducedVals.size();
19827 ReducedVals.emplace_back();
19828 }
19829 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19830 }
19831 }
19832 // Sort the reduced values by number of same/alternate opcode and/or pointer
19833 // operand.
19834 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19835 return P1.size() > P2.size();
19836 });
19837 return true;
19838 }
19839
19840 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19841 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19842 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19843 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19844 constexpr unsigned RegMaxNumber = 4;
19845 constexpr unsigned RedValsMaxNumber = 128;
19846 // If there are a sufficient number of reduction values, reduce
19847 // to a nearby power-of-2. We can safely generate oversized
19848 // vectors and rely on the backend to split them to legal sizes.
19849 if (unsigned NumReducedVals = std::accumulate(
19850 ReducedVals.begin(), ReducedVals.end(), 0,
19851 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19852 if (!isGoodForReduction(Vals))
19853 return Num;
19854 return Num + Vals.size();
19855 });
19856 NumReducedVals < ReductionLimit &&
19857 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19858 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19859 })) {
19860 for (ReductionOpsType &RdxOps : ReductionOps)
19861 for (Value *RdxOp : RdxOps)
19862 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19863 return nullptr;
19864 }
19865
19866 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19867 TargetFolder(DL));
19868 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19869
19870    // Track the reduced values in case they are replaced by extractelement
19871 // because of the vectorization.
19872 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19873 ReducedVals.front().size());
19874
19875 // The compare instruction of a min/max is the insertion point for new
19876 // instructions and may be replaced with a new compare instruction.
19877 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19878 assert(isa<SelectInst>(RdxRootInst) &&
19879 "Expected min/max reduction to have select root instruction");
19880 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19881 assert(isa<Instruction>(ScalarCond) &&
19882 "Expected min/max reduction to have compare condition");
19883 return cast<Instruction>(ScalarCond);
19884 };
19885
19886 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19887 return isBoolLogicOp(cast<Instruction>(V));
19888 });
19889 // Return new VectorizedTree, based on previous value.
19890 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19891 if (VectorizedTree) {
19892 // Update the final value in the reduction.
19893        Builder.SetCurrentDebugLocation(
19894            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19895 if (AnyBoolLogicOp) {
19896 auto It = ReducedValsToOps.find(VectorizedTree);
19897 auto It1 = ReducedValsToOps.find(Res);
19898 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19899 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19900 (It != ReducedValsToOps.end() &&
19901 any_of(It->getSecond(), [&](Instruction *I) {
19902 return isBoolLogicOp(I) &&
19903 getRdxOperand(I, 0) == VectorizedTree;
19904 }))) {
19905 ;
19906 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19907 (It1 != ReducedValsToOps.end() &&
19908 any_of(It1->getSecond(), [&](Instruction *I) {
19909 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19910 }))) {
19911 std::swap(VectorizedTree, Res);
19912 } else {
19913 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19914 }
19915 }
19916
19917 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19918 ReductionOps);
19919 }
19920 // Initialize the final value in the reduction.
19921 return Res;
19922 };
19923 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19924 ReductionOps.front().size());
19925 for (ReductionOpsType &RdxOps : ReductionOps)
19926 for (Value *RdxOp : RdxOps) {
19927 if (!RdxOp)
19928 continue;
19929 IgnoreList.insert(RdxOp);
19930 }
19931 // Intersect the fast-math-flags from all reduction operations.
19932 FastMathFlags RdxFMF;
19933 RdxFMF.set();
19934 for (Value *U : IgnoreList)
19935 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19936 RdxFMF &= FPMO->getFastMathFlags();
19937 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19938
19939 // Need to track reduced vals, they may be changed during vectorization of
19940 // subvectors.
19941 for (ArrayRef<Value *> Candidates : ReducedVals)
19942 for (Value *V : Candidates)
19943 TrackedVals.try_emplace(V, V);
19944
19945    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19946                 Value *V) -> unsigned & {
19947 auto *It = MV.find(V);
19948 assert(It != MV.end() && "Unable to find given key.");
19949 return It->second;
19950 };
19951
19952 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19953 // List of the values that were reduced in other trees as part of gather
19954 // nodes and thus requiring extract if fully vectorized in other trees.
19955 SmallPtrSet<Value *, 4> RequiredExtract;
19956 WeakTrackingVH VectorizedTree = nullptr;
19957 bool CheckForReusedReductionOps = false;
19958 // Try to vectorize elements based on their type.
19959    SmallVector<InstructionsState> States;
19960    for (ArrayRef<Value *> RV : ReducedVals)
19961 States.push_back(getSameOpcode(RV, TLI));
19962 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19963 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19964 InstructionsState S = States[I];
19965 SmallVector<Value *> Candidates;
19966 Candidates.reserve(2 * OrigReducedVals.size());
19967 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19968 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19969 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19970        // Check if the reduction value was not overridden by the extractelement
19971 // instruction because of the vectorization and exclude it, if it is not
19972 // compatible with other values.
19973 // Also check if the instruction was folded to constant/other value.
19974 auto *Inst = dyn_cast<Instruction>(RdxVal);
19975 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19976 (!S || !S.isOpcodeOrAlt(Inst))) ||
19977 (S && !Inst))
19978 continue;
19979 Candidates.push_back(RdxVal);
19980 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19981 }
19982 bool ShuffledExtracts = false;
19983 // Try to handle shuffled extractelements.
19984 if (S && S.getOpcode() == Instruction::ExtractElement &&
19985 !S.isAltShuffle() && I + 1 < E) {
19986 SmallVector<Value *> CommonCandidates(Candidates);
19987 for (Value *RV : ReducedVals[I + 1]) {
19988 Value *RdxVal = TrackedVals.at(RV);
19989          // Check if the reduction value was not overridden by the
19990 // extractelement instruction because of the vectorization and
19991 // exclude it, if it is not compatible with other values.
19992 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19993 if (!Inst)
19994 continue;
19995 CommonCandidates.push_back(RdxVal);
19996 TrackedToOrig.try_emplace(RdxVal, RV);
19997 }
19998        SmallVector<int> Mask;
19999        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
20000 ++I;
20001 Candidates.swap(CommonCandidates);
20002 ShuffledExtracts = true;
20003 }
20004 }
20005
20006 // Emit code for constant values.
20007 if (Candidates.size() > 1 && allConstant(Candidates)) {
20008 Value *Res = Candidates.front();
20009 Value *OrigV = TrackedToOrig.at(Candidates.front());
20010 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20011 for (Value *VC : ArrayRef(Candidates).drop_front()) {
20012 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
20013 Value *OrigV = TrackedToOrig.at(VC);
20014 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20015 if (auto *ResI = dyn_cast<Instruction>(Res))
20016 V.analyzedReductionRoot(ResI);
20017 }
20018 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20019 continue;
20020 }
20021
20022 unsigned NumReducedVals = Candidates.size();
20023 if (NumReducedVals < ReductionLimit &&
20024 (NumReducedVals < 2 || !isSplat(Candidates)))
20025 continue;
20026
20027 // Check if we support repeated scalar values processing (optimization of
20028 // original scalar identity operations on matched horizontal reductions).
20029 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20030 RdxKind != RecurKind::FMul &&
20031 RdxKind != RecurKind::FMulAdd;
20032 // Gather same values.
20033 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20034 if (IsSupportedHorRdxIdentityOp)
20035 for (Value *V : Candidates) {
20036 Value *OrigV = TrackedToOrig.at(V);
20037 ++SameValuesCounter.try_emplace(OrigV).first->second;
20038 }
20039      // Used to check if the reduced values are used the same number of times. In this
20040 // case the compiler may produce better code. E.g. if reduced values are
20041 // aabbccdd (8 x values), then the first node of the tree will have a node
20042 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
20043 // Plus, the final reduction will be performed on <8 x aabbccdd>.
20044 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
20045 // x abcd) * 2.
20046 // Currently it only handles add/fadd/xor. and/or/min/max do not require
20047 // this analysis, other operations may require an extra estimation of
20048 // the profitability.
20049 bool SameScaleFactor = false;
20050 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20051 SameValuesCounter.size() != Candidates.size();
20052 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20053 if (OptReusedScalars) {
20054 SameScaleFactor =
20055 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20056 RdxKind == RecurKind::Xor) &&
20057 all_of(drop_begin(SameValuesCounter),
20058 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20059 return P.second == SameValuesCounter.front().second;
20060 });
20061 Candidates.resize(SameValuesCounter.size());
20062 transform(SameValuesCounter, Candidates.begin(),
20063 [&](const auto &P) { return TrackedVals.at(P.first); });
20064 NumReducedVals = Candidates.size();
20065 // Have a reduction of the same element.
20066 if (NumReducedVals == 1) {
20067 Value *OrigV = TrackedToOrig.at(Candidates.front());
20068 unsigned Cnt = At(SameValuesCounter, OrigV);
20069 Value *RedVal =
20070 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20071 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20072 VectorizedVals.try_emplace(OrigV, Cnt);
20073 ExternallyUsedValues.insert(OrigV);
20074 continue;
20075 }
20076 }
20077
20078 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20079 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20080 const unsigned MaxElts = std::clamp<unsigned>(
20081 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20082 RegMaxNumber * RedValsMaxNumber);
20083
20084 unsigned ReduxWidth = NumReducedVals;
20085 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20086 unsigned NumParts, NumRegs;
20087 Type *ScalarTy = Candidates.front()->getType();
20088 ReduxWidth =
20089 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20090 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20091 NumParts = TTI.getNumberOfParts(Tp);
20092 NumRegs =
20093          TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20094      while (NumParts > NumRegs) {
20095 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20096 ReduxWidth = bit_floor(ReduxWidth - 1);
20097 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20098 NumParts = TTI.getNumberOfParts(Tp);
20099 NumRegs =
20100            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20101      }
20102 if (NumParts > NumRegs / 2)
20103 ReduxWidth = bit_floor(ReduxWidth);
20104 return ReduxWidth;
20105 };
20106 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20107 ReduxWidth = GetVectorFactor(ReduxWidth);
20108 ReduxWidth = std::min(ReduxWidth, MaxElts);
20109
20110 unsigned Start = 0;
20111 unsigned Pos = Start;
20112 // Restarts vectorization attempt with lower vector factor.
20113 unsigned PrevReduxWidth = ReduxWidth;
20114 bool CheckForReusedReductionOpsLocal = false;
20115 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20116 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20117 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20118          // Check if any of the reduction ops are gathered. If so, it is worth
20119          // trying again with a smaller number of reduction ops.
20120 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20121 }
20122 ++Pos;
20123 if (Pos < NumReducedVals - ReduxWidth + 1)
20124 return IsAnyRedOpGathered;
20125 Pos = Start;
20126 --ReduxWidth;
20127 if (ReduxWidth > 1)
20128 ReduxWidth = GetVectorFactor(ReduxWidth);
20129 return IsAnyRedOpGathered;
20130 };
20131 bool AnyVectorized = false;
20132 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20133 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20134 ReduxWidth >= ReductionLimit) {
20135 // Dependency in tree of the reduction ops - drop this attempt, try
20136 // later.
20137 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20138 Start == 0) {
20139 CheckForReusedReductionOps = true;
20140 break;
20141 }
20142 PrevReduxWidth = ReduxWidth;
20143 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20144 // Been analyzed already - skip.
20145 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20146 (!has_single_bit(ReduxWidth) &&
20147 (IgnoredCandidates.contains(
20148 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20149 IgnoredCandidates.contains(
20150 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20151 bit_floor(ReduxWidth))))) ||
20152 V.areAnalyzedReductionVals(VL)) {
20153 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20154 continue;
20155 }
20156 // Early exit if any of the reduction values were deleted during
20157 // previous vectorization attempts.
20158 if (any_of(VL, [&V](Value *RedVal) {
20159 auto *RedValI = dyn_cast<Instruction>(RedVal);
20160 if (!RedValI)
20161 return false;
20162 return V.isDeleted(RedValI);
20163 }))
20164 break;
20165 V.buildTree(VL, IgnoreList);
20166 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20167 if (!AdjustReducedVals())
20168 V.analyzedReductionVals(VL);
20169 continue;
20170 }
20171 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20172 if (!AdjustReducedVals())
20173 V.analyzedReductionVals(VL);
20174 continue;
20175 }
20176 V.reorderTopToBottom();
20177 // No need to reorder the root node at all.
20178 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20179 // Keep extracted other reduction values, if they are used in the
20180 // vectorization trees.
20181 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20182 ExternallyUsedValues);
20183 // The reduction root is used as the insertion point for new
20184 // instructions, so set it as externally used to prevent it from being
20185 // deleted.
20186 LocalExternallyUsedValues.insert(ReductionRoot);
20187 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20188 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20189 continue;
20190 for (Value *V : ReducedVals[Cnt])
20191 if (isa<Instruction>(V))
20192 LocalExternallyUsedValues.insert(TrackedVals[V]);
20193 }
20194 if (!IsSupportedHorRdxIdentityOp) {
20195 // Number of uses of the candidates in the vector of values.
20196 assert(SameValuesCounter.empty() &&
20197 "Reused values counter map is not empty");
20198 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20199 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20200 continue;
20201 Value *V = Candidates[Cnt];
20202 Value *OrigV = TrackedToOrig.at(V);
20203 ++SameValuesCounter.try_emplace(OrigV).first->second;
20204 }
20205 }
20206 V.transformNodes();
20207 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20208 // Gather externally used values.
20209        SmallPtrSet<Value *, 4> Visited;
20210        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20211 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20212 continue;
20213 Value *RdxVal = Candidates[Cnt];
20214 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20215 RdxVal = It->second;
20216 if (!Visited.insert(RdxVal).second)
20217 continue;
20218 // Check if the scalar was vectorized as part of the vectorization
20219 // tree but not the top node.
20220 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20221 LocalExternallyUsedValues.insert(RdxVal);
20222 continue;
20223 }
20224 Value *OrigV = TrackedToOrig.at(RdxVal);
20225 unsigned NumOps =
20226 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20227 if (NumOps != ReducedValsToOps.at(OrigV).size())
20228 LocalExternallyUsedValues.insert(RdxVal);
20229 }
20230 // Do not need the list of reused scalars in regular mode anymore.
20231 if (!IsSupportedHorRdxIdentityOp)
20232 SameValuesCounter.clear();
20233 for (Value *RdxVal : VL)
20234 if (RequiredExtract.contains(RdxVal))
20235 LocalExternallyUsedValues.insert(RdxVal);
20236 V.buildExternalUses(LocalExternallyUsedValues);
20237
20238 V.computeMinimumValueSizes();
20239
20240 // Estimate cost.
20241 InstructionCost TreeCost = V.getTreeCost(VL);
20242 InstructionCost ReductionCost =
20243 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20244 InstructionCost Cost = TreeCost + ReductionCost;
20245 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20246 << " for reduction\n");
20247 if (!Cost.isValid())
20248 break;
20249 if (Cost >= -SLPCostThreshold) {
20250 V.getORE()->emit([&]() {
20251 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20252 ReducedValsToOps.at(VL[0]).front())
20253 << "Vectorizing horizontal reduction is possible "
20254 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20255 << " and threshold "
20256 << ore::NV("Threshold", -SLPCostThreshold);
20257 });
20258 if (!AdjustReducedVals()) {
20259 V.analyzedReductionVals(VL);
20260 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20261 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20262 // Add subvectors of VL to the list of the analyzed values.
20263 for (unsigned VF = getFloorFullVectorNumberOfElements(
20264 *TTI, VL.front()->getType(), ReduxWidth - 1);
20265 VF >= ReductionLimit;
20266                     VF = getFloorFullVectorNumberOfElements(
20267                         *TTI, VL.front()->getType(), VF - 1)) {
20268 if (has_single_bit(VF) &&
20269 V.getCanonicalGraphSize() != V.getTreeSize())
20270 continue;
20271 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20272 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20273 }
20274 }
20275 }
20276 continue;
20277 }
20278
20279 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20280 << Cost << ". (HorRdx)\n");
20281 V.getORE()->emit([&]() {
20282 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20283 ReducedValsToOps.at(VL[0]).front())
20284 << "Vectorized horizontal reduction with cost "
20285 << ore::NV("Cost", Cost) << " and with tree size "
20286 << ore::NV("TreeSize", V.getTreeSize());
20287 });
20288
20289 Builder.setFastMathFlags(RdxFMF);
20290
20291 // Emit a reduction. If the root is a select (min/max idiom), the insert
20292 // point is the compare condition of that select.
20293 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20294 Instruction *InsertPt = RdxRootInst;
20295 if (IsCmpSelMinMax)
20296 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20297
20298 // Vectorize a tree.
20299 Value *VectorizedRoot =
20300 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20301 // Update TrackedToOrig mapping, since the tracked values might be
20302 // updated.
20303 for (Value *RdxVal : Candidates) {
20304 Value *OrigVal = TrackedToOrig.at(RdxVal);
20305 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20306 if (TransformedRdxVal != RdxVal)
20307 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20308 }
20309
20310 Builder.SetInsertPoint(InsertPt);
20311
20312 // To prevent poison from leaking across what used to be sequential,
20313 // safe, scalar boolean logic operations, the reduction operand must be
20314 // frozen.
20315 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20316 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20317
20318 // Emit code to correctly handle reused reduced values, if required.
20319 if (OptReusedScalars && !SameScaleFactor) {
20320 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20321 SameValuesCounter, TrackedToOrig);
20322 }
20323
20324 Value *ReducedSubTree;
20325 Type *ScalarTy = VL.front()->getType();
20326 if (isa<FixedVectorType>(ScalarTy)) {
20327 assert(SLPReVec && "FixedVectorType is not expected.");
20328 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20329 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20330 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20331 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20332 // Do reduction for each lane.
20333 // e.g., do reduce add for
20334 // VL[0] = <4 x Ty> <a, b, c, d>
20335 // VL[1] = <4 x Ty> <e, f, g, h>
20336 // Lane[0] = <2 x Ty> <a, e>
20337 // Lane[1] = <2 x Ty> <b, f>
20338 // Lane[2] = <2 x Ty> <c, g>
20339 // Lane[3] = <2 x Ty> <d, h>
20340 // result[0] = reduce add Lane[0]
20341 // result[1] = reduce add Lane[1]
20342 // result[2] = reduce add Lane[2]
20343 // result[3] = reduce add Lane[3]
20344            SmallVector<int, 16> Mask =
20345                createStrideMask(I, ScalarTyNumElements, VL.size());
20346 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20347 ReducedSubTree = Builder.CreateInsertElement(
20348 ReducedSubTree,
20349 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20350 }
20351 } else {
20352 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20353 RdxRootInst->getType());
20354 }
20355 if (ReducedSubTree->getType() != VL.front()->getType()) {
20356 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20357 "Expected different reduction type.");
20358 ReducedSubTree =
20359 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20360 V.isSignedMinBitwidthRootNode());
20361 }
20362
20363 // Improved analysis for add/fadd/xor reductions with same scale factor
20364 // for all operands of reductions. We can emit scalar ops for them
20365 // instead.
20366 if (OptReusedScalars && SameScaleFactor)
20367 ReducedSubTree = emitScaleForReusedOps(
20368 ReducedSubTree, Builder, SameValuesCounter.front().second);
20369
20370 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20371 // Count vectorized reduced values to exclude them from final reduction.
20372 for (Value *RdxVal : VL) {
20373 Value *OrigV = TrackedToOrig.at(RdxVal);
20374 if (IsSupportedHorRdxIdentityOp) {
20375 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20376 continue;
20377 }
20378 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20379 if (!V.isVectorized(RdxVal))
20380 RequiredExtract.insert(RdxVal);
20381 }
20382 Pos += ReduxWidth;
20383 Start = Pos;
20384 ReduxWidth = NumReducedVals - Pos;
20385 if (ReduxWidth > 1)
20386 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20387 AnyVectorized = true;
20388 }
20389 if (OptReusedScalars && !AnyVectorized) {
20390 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20391 Value *RdxVal = TrackedVals.at(P.first);
20392 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20393 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20394 VectorizedVals.try_emplace(P.first, P.second);
20395 }
20396 continue;
20397 }
20398 }
20399 if (VectorizedTree) {
20400 // Reorder operands of bool logical op in the natural order to avoid
20401 // possible problem with poison propagation. If not possible to reorder
20402 // (both operands are originally RHS), emit an extra freeze instruction
20403 // for the LHS operand.
20404 // I.e., if we have original code like this:
20405 // RedOp1 = select i1 ?, i1 LHS, i1 false
20406 // RedOp2 = select i1 RHS, i1 ?, i1 false
20407
20408 // Then, we swap LHS/RHS to create a new op that matches the poison
20409 // semantics of the original code.
20410
20411 // If we have original code like this and both values could be poison:
20412 // RedOp1 = select i1 ?, i1 LHS, i1 false
20413 // RedOp2 = select i1 ?, i1 RHS, i1 false
20414
20415 // Then, we must freeze LHS in the new op.
20416 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20417 Instruction *RedOp1,
20418 Instruction *RedOp2,
20419 bool InitStep) {
20420 if (!AnyBoolLogicOp)
20421 return;
20422 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20423 getRdxOperand(RedOp1, 0) == LHS ||
20424                                      isGuaranteedNotToBePoison(LHS, AC)))
20425          return;
20426 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20427 getRdxOperand(RedOp2, 0) == RHS ||
20428                                      isGuaranteedNotToBePoison(RHS, AC))) {
20429          std::swap(LHS, RHS);
20430 return;
20431 }
20432 if (LHS != VectorizedTree)
20433 LHS = Builder.CreateFreeze(LHS);
20434 };
20435 // Finish the reduction.
20436 // Need to add extra arguments and not vectorized possible reduction
20437 // values.
20438 // Try to avoid dependencies between the scalar remainders after
20439 // reductions.
20440 auto FinalGen =
20441          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20442              bool InitStep) {
20443            unsigned Sz = InstVals.size();
20444            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20445                                                                     Sz % 2);
20446 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20447 Instruction *RedOp = InstVals[I + 1].first;
20448 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20449 Value *RdxVal1 = InstVals[I].second;
20450 Value *StableRdxVal1 = RdxVal1;
20451 auto It1 = TrackedVals.find(RdxVal1);
20452 if (It1 != TrackedVals.end())
20453 StableRdxVal1 = It1->second;
20454 Value *RdxVal2 = InstVals[I + 1].second;
20455 Value *StableRdxVal2 = RdxVal2;
20456 auto It2 = TrackedVals.find(RdxVal2);
20457 if (It2 != TrackedVals.end())
20458 StableRdxVal2 = It2->second;
20459 // To prevent poison from leaking across what used to be
20460 // sequential, safe, scalar boolean logic operations, the
20461 // reduction operand must be frozen.
20462 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20463 RedOp, InitStep);
20464 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20465 StableRdxVal2, "op.rdx", ReductionOps);
20466 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20467 }
20468 if (Sz % 2 == 1)
20469 ExtraReds[Sz / 2] = InstVals.back();
20470 return ExtraReds;
20471 };
20472      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20473      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20474                                   VectorizedTree);
20475      SmallPtrSet<Value *, 8> Visited;
20476      for (ArrayRef<Value *> Candidates : ReducedVals) {
20477 for (Value *RdxVal : Candidates) {
20478 if (!Visited.insert(RdxVal).second)
20479 continue;
20480 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20481 for (Instruction *RedOp :
20482 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20483 ExtraReductions.emplace_back(RedOp, RdxVal);
20484 }
20485 }
20486 // Iterate through all not-vectorized reduction values/extra arguments.
20487 bool InitStep = true;
20488 while (ExtraReductions.size() > 1) {
20489        SmallVector<std::pair<Instruction *, Value *>> NewReds =
20490            FinalGen(ExtraReductions, InitStep);
20491 ExtraReductions.swap(NewReds);
20492 InitStep = false;
20493 }
20494 VectorizedTree = ExtraReductions.front().second;
20495
20496 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20497
20498 // The original scalar reduction is expected to have no remaining
20499 // uses outside the reduction tree itself. Assert that we got this
20500      // correct, replace internal uses with poison, and mark for eventual
20501 // deletion.
20502#ifndef NDEBUG
20503 SmallSet<Value *, 4> IgnoreSet;
20504 for (ArrayRef<Value *> RdxOps : ReductionOps)
20505 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20506#endif
20507 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20508 for (Value *Ignore : RdxOps) {
20509 if (!Ignore)
20510 continue;
20511#ifndef NDEBUG
20512 for (auto *U : Ignore->users()) {
20513 assert(IgnoreSet.count(U) &&
20514                 "All users must be in the reduction ops list.");
20515 }
20516#endif
20517 if (!Ignore->use_empty()) {
20518 Value *P = PoisonValue::get(Ignore->getType());
20519 Ignore->replaceAllUsesWith(P);
20520 }
20521 }
20522 V.removeInstructionsAndOperands(RdxOps);
20523 }
20524 } else if (!CheckForReusedReductionOps) {
20525 for (ReductionOpsType &RdxOps : ReductionOps)
20526 for (Value *RdxOp : RdxOps)
20527 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20528 }
20529 return VectorizedTree;
20530 }
20531
20532private:
20533 /// Calculate the cost of a reduction.
20534 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20535 ArrayRef<Value *> ReducedVals,
20536 bool IsCmpSelMinMax, FastMathFlags FMF,
20537 const BoUpSLP &R) {
20538    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20539    Type *ScalarTy = ReducedVals.front()->getType();
20540 unsigned ReduxWidth = ReducedVals.size();
20541 FixedVectorType *VectorTy = R.getReductionType();
20542 InstructionCost VectorCost = 0, ScalarCost;
20543 // If all of the reduced values are constant, the vector cost is 0, since
20544    // the reduction value can be calculated at compile time.
20545 bool AllConsts = allConstant(ReducedVals);
20546 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20547      InstructionCost Cost = 0;
20548      // Scalar cost is repeated for N-1 elements.
20549 int Cnt = ReducedVals.size();
20550 for (Value *RdxVal : ReducedVals) {
20551 if (Cnt == 1)
20552 break;
20553 --Cnt;
20554 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20555 Cost += GenCostFn();
20556 continue;
20557 }
20558 InstructionCost ScalarCost = 0;
20559 for (User *U : RdxVal->users()) {
20560 auto *RdxOp = cast<Instruction>(U);
20561 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20562 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20563 continue;
20564 }
20565 ScalarCost = InstructionCost::getInvalid();
20566 break;
20567 }
20568 if (ScalarCost.isValid())
20569 Cost += ScalarCost;
20570 else
20571 Cost += GenCostFn();
20572 }
20573 return Cost;
20574 };
20575 switch (RdxKind) {
20576 case RecurKind::Add:
20577 case RecurKind::Mul:
20578 case RecurKind::Or:
20579 case RecurKind::And:
20580 case RecurKind::Xor:
20581 case RecurKind::FAdd:
20582 case RecurKind::FMul: {
20583 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20584 if (!AllConsts) {
20585 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20586 assert(SLPReVec && "FixedVectorType is not expected.");
20587 unsigned ScalarTyNumElements = VecTy->getNumElements();
20588 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20589 VectorCost += TTI->getShuffleCost(
20590 TTI::SK_PermuteSingleSrc, VectorTy,
20591 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20592 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20593 CostKind);
20594 }
20595 VectorCost += TTI->getScalarizationOverhead(
20596 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20597 /*Extract*/ false, TTI::TCK_RecipThroughput);
20598 } else {
20599 Type *RedTy = VectorTy->getElementType();
20600 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20601 std::make_pair(RedTy, true));
20602 if (RType == RedTy) {
20603 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20604 FMF, CostKind);
20605 } else {
20606 VectorCost = TTI->getExtendedReductionCost(
20607 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20608 FMF, CostKind);
20609 }
20610 }
20611 }
20612 ScalarCost = EvaluateScalarCost([&]() {
20613 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20614 });
20615 break;
20616 }
20617 case RecurKind::FMax:
20618 case RecurKind::FMin:
20619 case RecurKind::FMaximum:
20620 case RecurKind::FMinimum:
20621 case RecurKind::SMax:
20622 case RecurKind::SMin:
20623 case RecurKind::UMax:
20624 case RecurKind::UMin: {
20625      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
20626      if (!AllConsts)
20627 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20628 ScalarCost = EvaluateScalarCost([&]() {
20629 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20630 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20631 });
20632 break;
20633 }
20634 default:
20635 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20636 }
20637
20638 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20639 << " for reduction of " << shortBundleName(ReducedVals)
20640 << " (It is a splitting reduction)\n");
20641 return VectorCost - ScalarCost;
20642 }
20643
20644 /// Emit a horizontal reduction of the vectorized value.
20645 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20646 const TargetTransformInfo *TTI, Type *DestTy) {
20647 assert(VectorizedValue && "Need to have a vectorized tree node");
20648 assert(RdxKind != RecurKind::FMulAdd &&
20649 "A call to the llvm.fmuladd intrinsic is not handled yet");
20650
20651 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20652 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20653 RdxKind == RecurKind::Add &&
20654 DestTy->getScalarType() != FTy->getScalarType()) {
20655 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20656      // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
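      // Illustrative IR for this rewrite, assuming an <8 x i1> input (the
      // final zext/trunc to DestTy is added afterwards by the caller's cast):
      //   %bc  = bitcast <8 x i1> %v to i8
      //   %cnt = call i8 @llvm.ctpop.i8(i8 %bc)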
20657 Value *V = Builder.CreateBitCast(
20658 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20659 ++NumVectorInstructions;
20660 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20661 }
20662 ++NumVectorInstructions;
20663 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20664 }
20665
20666 /// Emits optimized code for unique scalar value reused \p Cnt times.
20667 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20668 unsigned Cnt) {
20669 assert(IsSupportedHorRdxIdentityOp &&
20670 "The optimization of matched scalar identity horizontal reductions "
20671 "must be supported.");
20672 if (Cnt == 1)
20673 return VectorizedValue;
20674 switch (RdxKind) {
20675 case RecurKind::Add: {
20676 // res = mul vv, n
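      // E.g., a value x that occurs Cnt == 4 times in the scalar reduction
      // (x + x + x + x) is emitted as the single multiply x * 4.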
20677 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20678      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
20679 << VectorizedValue << ". (HorRdx)\n");
20680 return Builder.CreateMul(VectorizedValue, Scale);
20681 }
20682 case RecurKind::Xor: {
20683 // res = n % 2 ? 0 : vv
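      // E.g., x ^ x ^ x ^ x folds to 0 (even count), while x ^ x ^ x folds
      // to x (odd count).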
20684      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
20685 << ". (HorRdx)\n");
20686 if (Cnt % 2 == 0)
20687 return Constant::getNullValue(VectorizedValue->getType());
20688 return VectorizedValue;
20689 }
20690 case RecurKind::FAdd: {
20691 // res = fmul v, n
20692 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20693      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
20694 << VectorizedValue << ". (HorRdx)\n");
20695 return Builder.CreateFMul(VectorizedValue, Scale);
20696 }
20697 case RecurKind::And:
20698 case RecurKind::Or:
20699 case RecurKind::SMax:
20700 case RecurKind::SMin:
20701 case RecurKind::UMax:
20702 case RecurKind::UMin:
20703 case RecurKind::FMax:
20704 case RecurKind::FMin:
20705 case RecurKind::FMaximum:
20706 case RecurKind::FMinimum:
20707 // res = vv
20708 return VectorizedValue;
20709 case RecurKind::Mul:
20710 case RecurKind::FMul:
20711 case RecurKind::FMulAdd:
20712 case RecurKind::IAnyOf:
20713 case RecurKind::FAnyOf:
20714 case RecurKind::IFindLastIV:
20715 case RecurKind::FFindLastIV:
20716 case RecurKind::None:
20717 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20718 }
20719 return nullptr;
20720 }
20721
20722 /// Emits actual operation for the scalar identity values, found during
20723 /// horizontal reduction analysis.
20724 Value *
20725 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20726 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20727 const DenseMap<Value *, Value *> &TrackedToOrig) {
20728 assert(IsSupportedHorRdxIdentityOp &&
20729 "The optimization of matched scalar identity horizontal reductions "
20730 "must be supported.");
20731 ArrayRef<Value *> VL = R.getRootNodeScalars();
20732 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20733 if (VTy->getElementType() != VL.front()->getType()) {
20734 VectorizedValue = Builder.CreateIntCast(
20735 VectorizedValue,
20736 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20737 R.isSignedMinBitwidthRootNode());
20738 }
20739 switch (RdxKind) {
20740 case RecurKind::Add: {
20741 // root = mul prev_root, <1, 1, n, 1>
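      // E.g., if the root scalars are <a, b, c, d> and "b" was reduced three
      // times while the others appeared once, the scale vector built below is
      // <1, 3, 1, 1> and a single vector multiply rescales the whole tree.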
20742      SmallVector<Constant *> Vals;
20743      for (Value *V : VL) {
20744 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20745 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20746 }
20747 auto *Scale = ConstantVector::get(Vals);
20748      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
20749 << VectorizedValue << ". (HorRdx)\n");
20750 return Builder.CreateMul(VectorizedValue, Scale);
20751 }
20752 case RecurKind::And:
20753 case RecurKind::Or:
20754 // No need for multiple or/and(s).
20755 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20756 << ". (HorRdx)\n");
20757 return VectorizedValue;
20758 case RecurKind::SMax:
20759 case RecurKind::SMin:
20760 case RecurKind::UMax:
20761 case RecurKind::UMin:
20762 case RecurKind::FMax:
20763 case RecurKind::FMin:
20764 case RecurKind::FMaximum:
20765 case RecurKind::FMinimum:
20766 // No need for multiple min/max(s) of the same value.
20767 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20768 << ". (HorRdx)\n");
20769 return VectorizedValue;
20770 case RecurKind::Xor: {
20771 // Replace values with even number of repeats with 0, since
20772 // x xor x = 0.
20773      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20774      // 7>, if the 4th and 6th elements have an even number of repeats.
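      // E.g., for root scalars <a, b, c, d> where "b" occurs twice and the
      // others occur once, lane 1 is redirected to the zero vector, yielding
      // <a, 0, c, d> before the final xor reduction.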
20775      SmallVector<int> Mask(
20776          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20777          PoisonMaskElem);
20778      std::iota(Mask.begin(), Mask.end(), 0);
20779 bool NeedShuffle = false;
20780 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20781 Value *V = VL[I];
20782 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20783 if (Cnt % 2 == 0) {
20784 Mask[I] = VF;
20785 NeedShuffle = true;
20786 }
20787 }
20788 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20789 : Mask) dbgs()
20790 << I << " ";
20791 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20792 if (NeedShuffle)
20793 VectorizedValue = Builder.CreateShuffleVector(
20794 VectorizedValue,
20795 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20796 return VectorizedValue;
20797 }
20798 case RecurKind::FAdd: {
20799 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20800      SmallVector<Constant *> Vals;
20801      for (Value *V : VL) {
20802 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20803 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20804 }
20805 auto *Scale = ConstantVector::get(Vals);
20806 return Builder.CreateFMul(VectorizedValue, Scale);
20807 }
20808 case RecurKind::Mul:
20809 case RecurKind::FMul:
20810 case RecurKind::FMulAdd:
20811 case RecurKind::IAnyOf:
20812 case RecurKind::FAnyOf:
20813 case RecurKind::IFindLastIV:
20814 case RecurKind::FFindLastIV:
20815 case RecurKind::None:
20816 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20817 }
20818 return nullptr;
20819 }
20820};
20821} // end anonymous namespace
20822
20823/// Gets recurrence kind from the specified value.
20824static RecurKind getRdxKind(Value *V) {
20825  return HorizontalReduction::getRdxKind(V);
20826}
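/// Returns the total number of scalar elements in the homogeneous aggregate
/// built by \p InsertInst, e.g. {<2 x float>, <2 x float>} -> 4 and
/// [2 x {float, float}] -> 4, or std::nullopt if the aggregate is not
/// homogeneous or is otherwise unsupported.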
20827static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20828 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20829 return cast<FixedVectorType>(IE->getType())->getNumElements();
20830
20831 unsigned AggregateSize = 1;
20832 auto *IV = cast<InsertValueInst>(InsertInst);
20833 Type *CurrentType = IV->getType();
20834 do {
20835 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20836 for (auto *Elt : ST->elements())
20837 if (Elt != ST->getElementType(0)) // check homogeneity
20838 return std::nullopt;
20839 AggregateSize *= ST->getNumElements();
20840 CurrentType = ST->getElementType(0);
20841 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20842 AggregateSize *= AT->getNumElements();
20843 CurrentType = AT->getElementType();
20844 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20845 AggregateSize *= VT->getNumElements();
20846 return AggregateSize;
20847 } else if (CurrentType->isSingleValueType()) {
20848 return AggregateSize;
20849 } else {
20850 return std::nullopt;
20851 }
20852 } while (true);
20853}
20854
20855static void findBuildAggregate_rec(Instruction *LastInsertInst,
20856                                   TargetTransformInfo *TTI,
20857                                   SmallVectorImpl<Value *> &BuildVectorOpds,
20858 SmallVectorImpl<Value *> &InsertElts,
20859 unsigned OperandOffset, const BoUpSLP &R) {
20860 do {
20861 Value *InsertedOperand = LastInsertInst->getOperand(1);
20862 std::optional<unsigned> OperandIndex =
20863 getElementIndex(LastInsertInst, OperandOffset);
20864 if (!OperandIndex || R.isDeleted(LastInsertInst))
20865 return;
20866 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20867 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20868 BuildVectorOpds, InsertElts, *OperandIndex, R);
20869
20870 } else {
20871 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20872 InsertElts[*OperandIndex] = LastInsertInst;
20873 }
20874 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20875 } while (LastInsertInst != nullptr &&
20876 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20877 LastInsertInst->hasOneUse());
20878}
20879
20880/// Recognize construction of vectors like
20881/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20882/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20883/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20884/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20885/// starting from the last insertelement or insertvalue instruction.
20886///
20887/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20888/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20889/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20890///
20891/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20892///
20893/// \return true if it matches.
20894static bool findBuildAggregate(Instruction *LastInsertInst,
20895 TargetTransformInfo *TTI,
20896 SmallVectorImpl<Value *> &BuildVectorOpds,
20897 SmallVectorImpl<Value *> &InsertElts,
20898 const BoUpSLP &R) {
20899
20900 assert((isa<InsertElementInst>(LastInsertInst) ||
20901 isa<InsertValueInst>(LastInsertInst)) &&
20902 "Expected insertelement or insertvalue instruction!");
20903
20904 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20905 "Expected empty result vectors!");
20906
20907 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20908 if (!AggregateSize)
20909 return false;
20910 BuildVectorOpds.resize(*AggregateSize);
20911 InsertElts.resize(*AggregateSize);
20912
20913 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20914 R);
20915 llvm::erase(BuildVectorOpds, nullptr);
20916 llvm::erase(InsertElts, nullptr);
20917 if (BuildVectorOpds.size() >= 2)
20918 return true;
20919
20920 return false;
20921}
20922
20923/// Try and get a reduction instruction from a phi node.
20924///
20925/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20926/// if they come from either \p ParentBB or a containing loop latch.
20927///
20928/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20929/// if not possible.
20930static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20931 BasicBlock *ParentBB, LoopInfo *LI) {
20932 // There are situations where the reduction value is not dominated by the
20933 // reduction phi. Vectorizing such cases has been reported to cause
20934 // miscompiles. See PR25787.
20935 auto DominatedReduxValue = [&](Value *R) {
20936 return isa<Instruction>(R) &&
20937 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20938 };
20939
20940 Instruction *Rdx = nullptr;
20941
20942 // Return the incoming value if it comes from the same BB as the phi node.
20943 if (P->getIncomingBlock(0) == ParentBB) {
20944 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20945 } else if (P->getIncomingBlock(1) == ParentBB) {
20946 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20947 }
20948
20949 if (Rdx && DominatedReduxValue(Rdx))
20950 return Rdx;
20951
20952 // Otherwise, check whether we have a loop latch to look at.
20953 Loop *BBL = LI->getLoopFor(ParentBB);
20954 if (!BBL)
20955 return nullptr;
20956 BasicBlock *BBLatch = BBL->getLoopLatch();
20957 if (!BBLatch)
20958 return nullptr;
20959
20960 // There is a loop latch, return the incoming value if it comes from
20961 // that. This reduction pattern occasionally turns up.
20962 if (P->getIncomingBlock(0) == BBLatch) {
20963 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20964 } else if (P->getIncomingBlock(1) == BBLatch) {
20965 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20966 }
20967
20968 if (Rdx && DominatedReduxValue(Rdx))
20969 return Rdx;
20970
20971 return nullptr;
20972}
20973
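/// Matches \p I as a reduction binary operation: either a plain binary
/// operator or one of the two-operand min/max intrinsics, binding the two
/// operands to \p V0 and \p V1. Illustrative matches:
///   %r = fadd float %x, %y
///   %r = call float @llvm.maxnum.f32(float %x, float %y)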
20974static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20975 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20976 return true;
20977 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20978 return true;
20979 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20980 return true;
20981 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20982 return true;
20983 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20984 return true;
20985 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20986 return true;
20987 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20988 return true;
20989 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20990 return true;
20991 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20992 return true;
20993 return false;
20994}
20995
20996/// We could have an initial reduction that is not an add.
20997/// r *= v1 + v2 + v3 + v4
20998/// In such a case start looking for a tree rooted in the first '+'.
20999/// \returns the new root if found, which may be nullptr if not an instruction.
21000static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
21001 Instruction *Root) {
21002 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
21003 isa<IntrinsicInst>(Root)) &&
21004 "Expected binop, select, or intrinsic for reduction matching");
21005 Value *LHS =
21006 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
21007 Value *RHS =
21008 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21009 if (LHS == Phi)
21010 return dyn_cast<Instruction>(RHS);
21011 if (RHS == Phi)
21012 return dyn_cast<Instruction>(LHS);
21013 return nullptr;
21014}
21015
21016/// \returns the first operand of \p I that does not match \p Phi. If the
21017/// operand is not an instruction, returns nullptr.
21018static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
21019 Value *Op0 = nullptr;
21020 Value *Op1 = nullptr;
21021 if (!matchRdxBop(I, Op0, Op1))
21022 return nullptr;
21023 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21024}
21025
21026/// \returns true if \p I is a candidate instruction for reduction vectorization.
21027static bool isReductionCandidate(Instruction *I) {
21028 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
21029 Value *B0 = nullptr, *B1 = nullptr;
21030 bool IsBinop = matchRdxBop(I, B0, B1);
21031 return IsBinop || IsSelect;
21032}
21033
21034bool SLPVectorizerPass::vectorizeHorReduction(
21035 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
21036 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21037 if (!ShouldVectorizeHor)
21038 return false;
21039 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21040
21041 if (Root->getParent() != BB || isa<PHINode>(Root))
21042 return false;
21043
21044 // If we can find a secondary reduction root, use that instead.
21045 auto SelectRoot = [&]() {
21046 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
21047 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
21048 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
21049 return NewRoot;
21050 return Root;
21051 };
21052
21053 // Start the analysis from the Root instruction. If a horizontal reduction is
21054 // found, try to vectorize it. If it is not a horizontal reduction, or
21055 // vectorization is not possible or not effective, and the currently analyzed
21056 // instruction is a binary operation, try to vectorize the operands, using
21057 // pre-order DFS traversal order. If the operands were not vectorized, repeat
21058 // the same procedure considering each operand as a possible root of the
21059 // horizontal reduction.
21060 // Interrupt the process if the Root instruction itself was vectorized or all
21061 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
21062 // If a horizontal reduction was not matched or vectorized, we collect
21063 // instructions for possible later attempts at vectorization.
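 // For example (illustrative IR), given
 //   %a = fadd float %x, %y
 //   %sum = fadd float %a, %b
 //   store float %sum, ptr %p
 // the analysis first tries %sum as a reduction root; if that attempt fails,
 // %a (and %b, if it is an instruction in this block) is enqueued and later
 // tried as a new candidate root.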
21064 std::queue<std::pair<Instruction *, unsigned>> Stack;
21065 Stack.emplace(SelectRoot(), 0);
21066 SmallPtrSet<Value *, 8> VisitedInstrs;
21067 bool Res = false;
21068 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21069 if (R.isAnalyzedReductionRoot(Inst))
21070 return nullptr;
21071 if (!isReductionCandidate(Inst))
21072 return nullptr;
21073 HorizontalReduction HorRdx;
21074 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21075 return nullptr;
21076 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21077 };
21078 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21079 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21080 FutureSeed = getNonPhiOperand(Root, P);
21081 if (!FutureSeed)
21082 return false;
21083 }
21084 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21085 // analysis is done separately.
21086 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21087 PostponedInsts.push_back(FutureSeed);
21088 return true;
21089 };
21090
21091 while (!Stack.empty()) {
21092 Instruction *Inst;
21093 unsigned Level;
21094 std::tie(Inst, Level) = Stack.front();
21095 Stack.pop();
21096 // Do not try to analyze an instruction that has already been vectorized.
21097 // This may happen when we vectorize instruction operands on a previous
21098 // iteration while the stack was populated before that happened.
21099 if (R.isDeleted(Inst))
21100 continue;
21101 if (Value *VectorizedV = TryToReduce(Inst)) {
21102 Res = true;
21103 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21104 // Try to find another reduction.
21105 Stack.emplace(I, Level);
21106 continue;
21107 }
21108 if (R.isDeleted(Inst))
21109 continue;
21110 } else {
21111 // We could not vectorize `Inst` so try to use it as a future seed.
21112 if (!TryAppendToPostponedInsts(Inst)) {
21113 assert(Stack.empty() && "Expected empty stack");
21114 break;
21115 }
21116 }
21117
21118 // Try to vectorize operands.
21119 // Continue analysis for the instruction from the same basic block only to
21120 // save compile time.
21121 if (++Level < RecursionMaxDepth)
21122 for (auto *Op : Inst->operand_values())
21123 if (VisitedInstrs.insert(Op).second)
21124 if (auto *I = dyn_cast<Instruction>(Op))
21125 // Do not try to vectorize CmpInst operands, this is done
21126 // separately.
21127 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21128 !R.isDeleted(I) && I->getParent() == BB)
21129 Stack.emplace(I, Level);
21130 }
21131 return Res;
21132}
21133
21134bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21135 BasicBlock *BB, BoUpSLP &R) {
21136 SmallVector<WeakTrackingVH> PostponedInsts;
21137 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21138 Res |= tryToVectorize(PostponedInsts, R);
21139 return Res;
21140}
21141
21142bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21143 BoUpSLP &R) {
21144 bool Res = false;
21145 for (Value *V : Insts)
21146 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21147 Res |= tryToVectorize(Inst, R);
21148 return Res;
21149}
21150
21151bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21152 BasicBlock *BB, BoUpSLP &R,
21153 bool MaxVFOnly) {
21154 if (!R.canMapToVector(IVI->getType()))
21155 return false;
21156
21157 SmallVector<Value *, 16> BuildVectorOpds;
21158 SmallVector<Value *, 16> BuildVectorInsts;
21159 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21160 return false;
21161
21162 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21163 R.getORE()->emit([&]() {
21164 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21165 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21166 "trying reduction first.";
21167 });
21168 return false;
21169 }
21170 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21171 // Aggregate value is unlikely to be processed in a vector register.
21172 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21173}
21174
21175bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21176 BasicBlock *BB, BoUpSLP &R,
21177 bool MaxVFOnly) {
21178 SmallVector<Value *, 16> BuildVectorInsts;
21179 SmallVector<Value *, 16> BuildVectorOpds;
21180 SmallVector<int> Mask;
21181 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21182 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21183 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21184 return false;
21185
21186 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21187 R.getORE()->emit([&]() {
21188 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21189 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21190 "trying reduction first.";
21191 });
21192 return false;
21193 }
21194 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21195 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21196}
21197
21198template <typename T>
21199static bool tryToVectorizeSequence(
21200 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21201 function_ref<bool(T *, T *)> AreCompatible,
21202 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21203 bool MaxVFOnly, BoUpSLP &R) {
21204 bool Changed = false;
21205 // Sort by type, parent, operands.
21206 stable_sort(Incoming, Comparator);
21207
21208 // Try to vectorize elements based on their type.
21209 SmallVector<T *> Candidates;
21210 SmallVector<T *> VL;
21211 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21212 VL.clear()) {
21213 // Look for the next elements with the same type, parent and operand
21214 // kinds.
21215 auto *I = dyn_cast<Instruction>(*IncIt);
21216 if (!I || R.isDeleted(I)) {
21217 ++IncIt;
21218 continue;
21219 }
21220 auto *SameTypeIt = IncIt;
21221 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21222 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21223 AreCompatible(*SameTypeIt, *IncIt))) {
21224 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21225 ++SameTypeIt;
21226 if (I && !R.isDeleted(I))
21227 VL.push_back(cast<T>(I));
21228 }
21229
21230 // Try to vectorize them.
21231 unsigned NumElts = VL.size();
21232 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21233 << NumElts << ")\n");
21234 // The vectorization is a 3-stage attempt:
21235 // 1. Try to vectorize instructions with the same/alternate opcodes with the
21236 // size of the maximal register at first.
21237 // 2. Try to vectorize remaining instructions with the same type, if
21238 // possible. This may produce better results than trying to vectorize only
21239 // instructions with the same/alternate opcodes.
21240 // 3. A final attempt to vectorize all instructions with the
21241 // same/alternate opcodes only; this may result in some extra final
21242 // vectorization.
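 // For example (illustrative), a group of eight compatible compares may first
 // be tried as a single bundle at the maximal register VF; elements that are
 // left unvectorized can then be collected as Candidates and re-tried in the
 // final same-type attempt below without the MaxVF restriction.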
21243 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21244 // Success, start over because instructions might have been changed.
21245 Changed = true;
21246 VL.swap(Candidates);
21247 Candidates.clear();
21248 for (T *V : VL) {
21249 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21250 Candidates.push_back(V);
21251 }
21252 } else {
21253 /// \returns the minimum number of elements that we will attempt to
21254 /// vectorize.
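 /// For example (illustrative), with a 128-bit maximum vector register and
 /// 32-bit elements this evaluates to std::max(2U, 128 / 32) == 4.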
21255 auto GetMinNumElements = [&R](Value *V) {
21256 unsigned EltSize = R.getVectorElementSize(V);
21257 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21258 };
21259 if (NumElts < GetMinNumElements(*IncIt) &&
21260 (Candidates.empty() ||
21261 Candidates.front()->getType() == (*IncIt)->getType())) {
21262 for (T *V : VL) {
21263 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21264 Candidates.push_back(V);
21265 }
21266 }
21267 }
21268 // Final attempt to vectorize instructions with the same types.
21269 if (Candidates.size() > 1 &&
21270 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21271 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21272 // Success, start over because instructions might have been changed.
21273 Changed = true;
21274 } else if (MaxVFOnly) {
21275 // Try to vectorize using small vectors.
21276 SmallVector<T *> VL;
21277 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21278 VL.clear()) {
21279 auto *I = dyn_cast<Instruction>(*It);
21280 if (!I || R.isDeleted(I)) {
21281 ++It;
21282 continue;
21283 }
21284 auto *SameTypeIt = It;
21285 while (SameTypeIt != End &&
21286 (!isa<Instruction>(*SameTypeIt) ||
21287 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21288 AreCompatible(*SameTypeIt, *It))) {
21289 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21290 ++SameTypeIt;
21291 if (I && !R.isDeleted(I))
21292 VL.push_back(cast<T>(I));
21293 }
21294 unsigned NumElts = VL.size();
21295 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21296 /*MaxVFOnly=*/false))
21297 Changed = true;
21298 It = SameTypeIt;
21299 }
21300 }
21301 Candidates.clear();
21302 }
21303
21304 // Start over at the next instruction of a different type (or the end).
21305 IncIt = SameTypeIt;
21306 }
21307 return Changed;
21308}
21309
21310/// Compare two cmp instructions. If IsCompatibility is true, function returns
21311 /// true if 2 cmps have same/swapped predicates and compatible corresponding
21312/// operands. If IsCompatibility is false, function implements strict weak
21313/// ordering relation between two cmp instructions, returning true if the first
21314/// instruction is "less" than the second, i.e. its predicate is less than the
21315 /// predicate of the second or the operand IDs are less than the operand IDs
21316/// of the second cmp instruction.
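///
/// For example (illustrative), `%c1 = icmp slt i32 %a, %b` and
/// `%c2 = icmp sgt i32 %b, %a` are treated as compatible: their predicates are
/// swapped forms of each other and the corresponding operands match once one
/// of the compares is viewed in its swapped form.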
21317template <bool IsCompatibility>
21318static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21319 const DominatorTree &DT) {
21320 assert(isValidElementType(V->getType()) &&
21321 isValidElementType(V2->getType()) &&
21322 "Expected valid element types only.");
21323 if (V == V2)
21324 return IsCompatibility;
21325 auto *CI1 = cast<CmpInst>(V);
21326 auto *CI2 = cast<CmpInst>(V2);
21327 if (CI1->getOperand(0)->getType()->getTypeID() <
21328 CI2->getOperand(0)->getType()->getTypeID())
21329 return !IsCompatibility;
21330 if (CI1->getOperand(0)->getType()->getTypeID() >
21331 CI2->getOperand(0)->getType()->getTypeID())
21332 return false;
21333 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21334 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21335 return !IsCompatibility;
21336 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21337 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21338 return false;
21339 CmpInst::Predicate Pred1 = CI1->getPredicate();
21340 CmpInst::Predicate Pred2 = CI2->getPredicate();
21341 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21342 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21343 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21344 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21345 if (BasePred1 < BasePred2)
21346 return !IsCompatibility;
21347 if (BasePred1 > BasePred2)
21348 return false;
21349 // Compare operands.
21350 bool CI1Preds = Pred1 == BasePred1;
21351 bool CI2Preds = Pred2 == BasePred1;
21352 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21353 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21354 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21355 if (Op1 == Op2)
21356 continue;
21357 if (Op1->getValueID() < Op2->getValueID())
21358 return !IsCompatibility;
21359 if (Op1->getValueID() > Op2->getValueID())
21360 return false;
21361 if (auto *I1 = dyn_cast<Instruction>(Op1))
21362 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21363 if (IsCompatibility) {
21364 if (I1->getParent() != I2->getParent())
21365 return false;
21366 } else {
21367 // Try to compare nodes with same parent.
21368 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21369 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21370 if (!NodeI1)
21371 return NodeI2 != nullptr;
21372 if (!NodeI2)
21373 return false;
21374 assert((NodeI1 == NodeI2) ==
21375 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21376 "Different nodes should have different DFS numbers");
21377 if (NodeI1 != NodeI2)
21378 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21379 }
21380 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21381 if (S && (IsCompatibility || !S.isAltShuffle()))
21382 continue;
21383 if (IsCompatibility)
21384 return false;
21385 if (I1->getOpcode() != I2->getOpcode())
21386 return I1->getOpcode() < I2->getOpcode();
21387 }
21388 }
21389 return IsCompatibility;
21390}
21391
21392template <typename ItT>
21393bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21394 BasicBlock *BB, BoUpSLP &R) {
21395 bool Changed = false;
21396 // Try to find reductions first.
21397 for (CmpInst *I : CmpInsts) {
21398 if (R.isDeleted(I))
21399 continue;
21400 for (Value *Op : I->operands())
21401 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21402 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21403 if (R.isDeleted(I))
21404 break;
21405 }
21406 }
21407 // Try to vectorize operands as vector bundles.
21408 for (CmpInst *I : CmpInsts) {
21409 if (R.isDeleted(I))
21410 continue;
21411 Changed |= tryToVectorize(I, R);
21412 }
21413 // Try to vectorize list of compares.
21414 // Sort by type, compare predicate, etc.
21415 auto CompareSorter = [&](Value *V, Value *V2) {
21416 if (V == V2)
21417 return false;
21418 return compareCmp<false>(V, V2, *TLI, *DT);
21419 };
21420
21421 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21422 if (V1 == V2)
21423 return true;
21424 return compareCmp<true>(V1, V2, *TLI, *DT);
21425 };
21426
21427 SmallVector<Value *> Vals;
21428 for (Instruction *V : CmpInsts)
21429 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21430 Vals.push_back(V);
21431 if (Vals.size() <= 1)
21432 return Changed;
21433 Changed |= tryToVectorizeSequence<Value>(
21434 Vals, CompareSorter, AreCompatibleCompares,
21435 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21436 // Exclude possible reductions from other blocks.
21437 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21438 return any_of(V->users(), [V](User *U) {
21439 auto *Select = dyn_cast<SelectInst>(U);
21440 return Select &&
21441 Select->getParent() != cast<Instruction>(V)->getParent();
21442 });
21443 });
21444 if (ArePossiblyReducedInOtherBlock)
21445 return false;
21446 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21447 },
21448 /*MaxVFOnly=*/true, R);
21449 return Changed;
21450}
21451
21452bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21453 BasicBlock *BB, BoUpSLP &R) {
21454 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21455 "This function only accepts Insert instructions");
21456 bool OpsChanged = false;
21457 SmallVector<WeakTrackingVH> PostponedInsts;
21458 for (auto *I : reverse(Instructions)) {
21459 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21460 if (R.isDeleted(I) || isa<CmpInst>(I))
21461 continue;
21462 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21463 OpsChanged |=
21464 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21465 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21466 OpsChanged |=
21467 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21468 }
21469 // pass2 - try to vectorize reductions only
21470 if (R.isDeleted(I))
21471 continue;
21472 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21473 if (R.isDeleted(I) || isa<CmpInst>(I))
21474 continue;
21475 // pass3 - try to match and vectorize a buildvector sequence.
21476 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21477 OpsChanged |=
21478 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21479 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21480 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21481 /*MaxVFOnly=*/false);
21482 }
21483 }
21484 // Now try to vectorize postponed instructions.
21485 OpsChanged |= tryToVectorize(PostponedInsts, R);
21486
21487 Instructions.clear();
21488 return OpsChanged;
21489}
21490
21491bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21492 bool Changed = false;
21493 SmallVector<Value *, 4> Incoming;
21494 SmallPtrSet<Value *, 16> VisitedInstrs;
21495 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21496 // node. This allows us to better identify the chains that can be
21497 // vectorized.
21498 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21499 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21500 assert(isValidElementType(V1->getType()) &&
21501 isValidElementType(V2->getType()) &&
21502 "Expected vectorizable types only.");
21503 // It is fine to compare type IDs here, since we expect only vectorizable
21504 // types, like ints, floats and pointers; we don't care about other types.
21505 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21506 return true;
21507 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21508 return false;
21509 if (V1->getType()->getScalarSizeInBits() <
21510 V2->getType()->getScalarSizeInBits())
21511 return true;
21512 if (V1->getType()->getScalarSizeInBits() >
21513 V2->getType()->getScalarSizeInBits())
21514 return false;
21515 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21516 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21517 if (Opcodes1.size() < Opcodes2.size())
21518 return true;
21519 if (Opcodes1.size() > Opcodes2.size())
21520 return false;
21521 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21522 {
21523 // Instructions come first.
21524 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21525 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21526 if (I1 && I2) {
21527 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21528 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21529 if (!NodeI1)
21530 return NodeI2 != nullptr;
21531 if (!NodeI2)
21532 return false;
21533 assert((NodeI1 == NodeI2) ==
21534 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21535 "Different nodes should have different DFS numbers");
21536 if (NodeI1 != NodeI2)
21537 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21538 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21539 if (S && !S.isAltShuffle())
21540 continue;
21541 return I1->getOpcode() < I2->getOpcode();
21542 }
21543 if (I1)
21544 return true;
21545 if (I2)
21546 return false;
21547 }
21548 {
21549 // Non-undef constants come next.
21550 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21551 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21552 if (C1 && C2)
21553 continue;
21554 if (C1)
21555 return true;
21556 if (C2)
21557 return false;
21558 }
21559 bool U1 = isa<UndefValue>(Opcodes1[I]);
21560 bool U2 = isa<UndefValue>(Opcodes2[I]);
21561 {
21562 // Non-constant non-instructions come next.
21563 if (!U1 && !U2) {
21564 auto ValID1 = Opcodes1[I]->getValueID();
21565 auto ValID2 = Opcodes2[I]->getValueID();
21566 if (ValID1 == ValID2)
21567 continue;
21568 if (ValID1 < ValID2)
21569 return true;
21570 if (ValID1 > ValID2)
21571 return false;
21572 }
21573 if (!U1)
21574 return true;
21575 if (!U2)
21576 return false;
21577 }
21578 // Undefs come last.
21579 assert(U1 && U2 && "The only thing left should be undef & undef.");
21580 }
21581 return false;
21582 };
21583 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21584 if (V1 == V2)
21585 return true;
21586 if (V1->getType() != V2->getType())
21587 return false;
21588 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21589 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21590 if (Opcodes1.size() != Opcodes2.size())
21591 return false;
21592 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21593 // Undefs are compatible with any other value.
21594 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21595 continue;
21596 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21597 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21598 if (R.isDeleted(I1) || R.isDeleted(I2))
21599 return false;
21600 if (I1->getParent() != I2->getParent())
21601 return false;
21602 if (getSameOpcode({I1, I2}, *TLI))
21603 continue;
21604 return false;
21605 }
21606 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21607 continue;
21608 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21609 return false;
21610 }
21611 return true;
21612 };
21613
21614 bool HaveVectorizedPhiNodes = false;
21615 do {
21616 // Collect the incoming values from the PHIs.
21617 Incoming.clear();
21618 for (Instruction &I : *BB) {
21619 auto *P = dyn_cast<PHINode>(&I);
21620 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21621 break;
21622
21623 // No need to analyze deleted, vectorized and non-vectorizable
21624 // instructions.
21625 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21626 isValidElementType(P->getType()))
21627 Incoming.push_back(P);
21628 }
21629
21630 if (Incoming.size() <= 1)
21631 break;
21632
21633 // Find the corresponding non-phi nodes for better matching when trying to
21634 // build the tree.
21635 for (Value *V : Incoming) {
21636 SmallVectorImpl<Value *> &Opcodes =
21637 PHIToOpcodes.try_emplace(V).first->getSecond();
21638 if (!Opcodes.empty())
21639 continue;
21640 SmallVector<Value *, 4> Nodes(1, V);
21641 SmallPtrSet<Value *, 4> Visited;
21642 while (!Nodes.empty()) {
21643 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21644 if (!Visited.insert(PHI).second)
21645 continue;
21646 for (Value *V : PHI->incoming_values()) {
21647 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21648 Nodes.push_back(PHI1);
21649 continue;
21650 }
21651 Opcodes.emplace_back(V);
21652 }
21653 }
21654 }
21655
21656 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21657 Incoming, PHICompare, AreCompatiblePHIs,
21658 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21659 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21660 },
21661 /*MaxVFOnly=*/true, R);
21662 Changed |= HaveVectorizedPhiNodes;
21663 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21664 auto *PHI = dyn_cast<PHINode>(P.first);
21665 return !PHI || R.isDeleted(PHI);
21666 }))
21667 PHIToOpcodes.clear();
21668 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21669 } while (HaveVectorizedPhiNodes);
21670
21671 VisitedInstrs.clear();
21672
21673 InstSetVector PostProcessInserts;
21674 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21675 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
21676 // also vectorizes `PostProcessCmps`.
21677 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21678 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21679 if (VectorizeCmps) {
21680 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21681 PostProcessCmps.clear();
21682 }
21683 PostProcessInserts.clear();
21684 return Changed;
21685 };
21686 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21687 auto IsInPostProcessInstrs = [&](Instruction *I) {
21688 if (auto *Cmp = dyn_cast<CmpInst>(I))
21689 return PostProcessCmps.contains(Cmp);
21690 return isa<InsertElementInst, InsertValueInst>(I) &&
21691 PostProcessInserts.contains(I);
21692 };
21693 // Returns true if `I` is an instruction without users, like a terminator, a
21694 // store, or a function call with an ignored return value (recognized based
21695 // on the instruction type, except for CallInst and InvokeInst).
21696 auto HasNoUsers = [](Instruction *I) {
21697 return I->use_empty() &&
21698 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21699 };
21700 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21701 // Skip instructions with scalable type. The num of elements is unknown at
21702 // compile-time for scalable types.
21703 if (isa<ScalableVectorType>(It->getType()))
21704 continue;
21705
21706 // Skip instructions marked for deletion.
21707 if (R.isDeleted(&*It))
21708 continue;
21709 // We may go through BB multiple times, so skip the ones we have already checked.
21710 if (!VisitedInstrs.insert(&*It).second) {
21711 if (HasNoUsers(&*It) &&
21712 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21713 // We would like to start over since some instructions are deleted
21714 // and the iterator may become invalid.
21715 Changed = true;
21716 It = BB->begin();
21717 E = BB->end();
21718 }
21719 continue;
21720 }
21721
21722 if (isa<DbgInfoIntrinsic>(It))
21723 continue;
21724
21725 // Try to vectorize reductions that use PHINodes.
21726 if (PHINode *P = dyn_cast<PHINode>(It)) {
21727 // Check that the PHI is a reduction PHI.
21728 if (P->getNumIncomingValues() == 2) {
21729 // Try to match and vectorize a horizontal reduction.
21730 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21731 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21732 Changed = true;
21733 It = BB->begin();
21734 E = BB->end();
21735 continue;
21736 }
21737 }
21738 // Try to vectorize the incoming values of the PHI, to catch reductions
21739 // that feed into PHIs.
21740 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21741 // Skip if the incoming block is the current BB for now. Also, bypass
21742 // unreachable IR for efficiency and to avoid crashing.
21743 // TODO: Collect the skipped incoming values and try to vectorize them
21744 // after processing BB.
21745 if (BB == P->getIncomingBlock(I) ||
21746 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21747 continue;
21748
21749 // Postponed instructions should not be vectorized here, delay their
21750 // vectorization.
21751 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21752 PI && !IsInPostProcessInstrs(PI)) {
21753 bool Res =
21754 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21755 Changed |= Res;
21756 if (Res && R.isDeleted(P)) {
21757 It = BB->begin();
21758 E = BB->end();
21759 break;
21760 }
21761 }
21762 }
21763 continue;
21764 }
21765
21766 if (HasNoUsers(&*It)) {
21767 bool OpsChanged = false;
21768 auto *SI = dyn_cast<StoreInst>(It);
21769 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21770 if (SI) {
21771 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21772 // Try to vectorize chain in store, if this is the only store to the
21773 // address in the block.
21774 // TODO: This is just a temporary solution to save compile time. Need
21775 // to investigate if we can safely turn on slp-vectorize-hor-store
21776 // instead to allow lookup for reduction chains in all non-vectorized
21777 // stores (need to check side effects and compile time).
21778 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21779 SI->getValueOperand()->hasOneUse();
21780 }
21781 if (TryToVectorizeRoot) {
21782 for (auto *V : It->operand_values()) {
21783 // Postponed instructions should not be vectorized here, delay their
21784 // vectorization.
21785 if (auto *VI = dyn_cast<Instruction>(V);
21786 VI && !IsInPostProcessInstrs(VI))
21787 // Try to match and vectorize a horizontal reduction.
21788 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21789 }
21790 }
21791 // Start vectorization of post-process list of instructions from the
21792 // top-tree instructions to try to vectorize as many instructions as
21793 // possible.
21794 OpsChanged |=
21795 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21796 if (OpsChanged) {
21797 // We would like to start over since some instructions are deleted
21798 // and the iterator may become invalid.
21799 Changed = true;
21800 It = BB->begin();
21801 E = BB->end();
21802 continue;
21803 }
21804 }
21805
21806 if (isa<InsertElementInst, InsertValueInst>(It))
21807 PostProcessInserts.insert(&*It);
21808 else if (isa<CmpInst>(It))
21809 PostProcessCmps.insert(cast<CmpInst>(&*It));
21810 }
21811
21812 return Changed;
21813}
21814
21815bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21816 auto Changed = false;
21817 for (auto &Entry : GEPs) {
21818 // If the getelementptr list has fewer than two elements, there's nothing
21819 // to do.
21820 if (Entry.second.size() < 2)
21821 continue;
21822
21823 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21824 << Entry.second.size() << ".\n");
21825
21826 // Process the GEP list in chunks suitable for the target's supported
21827 // vector size. If a vector register can't hold 1 element, we are done. We
21828 // are trying to vectorize the index computations, so the maximum number of
21829 // elements is based on the size of the index expression, rather than the
21830 // size of the GEP itself (the target's pointer size).
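 // For example (illustrative), with a 128-bit maximum vector register and
 // i64 index expressions, MaxElts below becomes 128 / 64 = 2, so the list is
 // processed two getelementptrs at a time.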
21831 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21832 return !R.isDeleted(GEP);
21833 });
21834 if (It == Entry.second.end())
21835 continue;
21836 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21837 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21838 if (MaxVecRegSize < EltSize)
21839 continue;
21840
21841 unsigned MaxElts = MaxVecRegSize / EltSize;
21842 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21843 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21844 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21845
21846 // Initialize a set of candidate getelementptrs. Note that we use a
21847 // SetVector here to preserve program order. If the index computations
21848 // are vectorizable and begin with loads, we want to minimize the chance
21849 // of having to reorder them later.
21850 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21851
21852 // Some of the candidates may have already been vectorized after we
21853 // initially collected them, or their index was optimized to a constant value.
21854 // If so, they are marked as deleted, so remove them from the set of
21855 // candidates.
21856 Candidates.remove_if([&R](Value *I) {
21857 return R.isDeleted(cast<Instruction>(I)) ||
21858 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21859 });
21860
21861 // Remove from the set of candidates all pairs of getelementptrs with
21862 // constant differences. Such getelementptrs are likely not good
21863 // candidates for vectorization in a bottom-up phase since one can be
21864 // computed from the other. We also ensure all candidate getelementptr
21865 // indices are unique.
21866 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21867 auto *GEPI = GEPList[I];
21868 if (!Candidates.count(GEPI))
21869 continue;
21870 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21871 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21872 auto *GEPJ = GEPList[J];
21873 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21874 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21875 Candidates.remove(GEPI);
21876 Candidates.remove(GEPJ);
21877 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21878 Candidates.remove(GEPJ);
21879 }
21880 }
21881 }
21882
21883 // We break out of the above computation as soon as we know there are
21884 // fewer than two candidates remaining.
21885 if (Candidates.size() < 2)
21886 continue;
21887
21888 // Add the single, non-constant index of each candidate to the bundle. We
21889 // ensured the indices met these constraints when we originally collected
21890 // the getelementptrs.
21891 SmallVector<Value *, 16> Bundle(Candidates.size());
21892 auto BundleIndex = 0u;
21893 for (auto *V : Candidates) {
21894 auto *GEP = cast<GetElementPtrInst>(V);
21895 auto *GEPIdx = GEP->idx_begin()->get();
21896 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21897 Bundle[BundleIndex++] = GEPIdx;
21898 }
21899
21900 // Try and vectorize the indices. We are currently only interested in
21901 // gather-like cases of the form:
21902 //
21903 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21904 //
21905 // where the loads of "a", the loads of "b", and the subtractions can be
21906 // performed in parallel. It's likely that detecting this pattern in a
21907 // bottom-up phase will be simpler and less costly than building a
21908 // full-blown top-down phase beginning at the consecutive loads.
21909 Changed |= tryToVectorizeList(Bundle, R);
21910 }
21911 }
21912 return Changed;
21913}
21914
21915bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21916 bool Changed = false;
21917 // Sort by type, base pointers and value operands. Value operands must be
21918 // compatible (have the same opcode, same parent), otherwise it is
21919 // definitely not profitable to try to vectorize them.
21920 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21921 if (V->getValueOperand()->getType()->getTypeID() <
21922 V2->getValueOperand()->getType()->getTypeID())
21923 return true;
21924 if (V->getValueOperand()->getType()->getTypeID() >
21925 V2->getValueOperand()->getType()->getTypeID())
21926 return false;
21927 if (V->getPointerOperandType()->getTypeID() <
21928 V2->getPointerOperandType()->getTypeID())
21929 return true;
21930 if (V->getPointerOperandType()->getTypeID() >
21931 V2->getPointerOperandType()->getTypeID())
21932 return false;
21933 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21934 V2->getValueOperand()->getType()->getScalarSizeInBits())
21935 return true;
21936 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21937 V2->getValueOperand()->getType()->getScalarSizeInBits())
21938 return false;
21939 // UndefValues are compatible with all other values.
21940 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21941 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21942 DomTreeNodeBase<BasicBlock> *NodeI1 =
21943 DT->getNode(I1->getParent());
21944 DomTreeNodeBase<BasicBlock> *NodeI2 =
21945 DT->getNode(I2->getParent());
21946 assert(NodeI1 && "Should only process reachable instructions");
21947 assert(NodeI2 && "Should only process reachable instructions");
21948 assert((NodeI1 == NodeI2) ==
21949 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21950 "Different nodes should have different DFS numbers");
21951 if (NodeI1 != NodeI2)
21952 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21953 return I1->getOpcode() < I2->getOpcode();
21954 }
21955 return V->getValueOperand()->getValueID() <
21956 V2->getValueOperand()->getValueID();
21957 };
21958
21959 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21960 if (V1 == V2)
21961 return true;
21962 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21963 return false;
21964 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21965 return false;
21966 // Undefs are compatible with any other value.
21967 if (isa<UndefValue>(V1->getValueOperand()) ||
21968 isa<UndefValue>(V2->getValueOperand()))
21969 return true;
21970 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21971 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21972 if (I1->getParent() != I2->getParent())
21973 return false;
21974 return getSameOpcode({I1, I2}, *TLI).valid();
21975 }
21976 if (isa<Constant>(V1->getValueOperand()) &&
21977 isa<Constant>(V2->getValueOperand()))
21978 return true;
21979 return V1->getValueOperand()->getValueID() ==
21980 V2->getValueOperand()->getValueID();
21981 };
21982
21983 // Attempt to sort and vectorize each of the store-groups.
21984 DenseSet<std::pair<Value *, Value *>> Attempted;
21985 for (auto &Pair : Stores) {
21986 if (Pair.second.size() < 2)
21987 continue;
21988
21989 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21990 << Pair.second.size() << ".\n");
21991
21992 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21993 continue;
21994
21995 // Reverse stores to do bottom-to-top analysis. This is important if the
21996 // values are stored to the same addresses several times; in this case we
21997 // need to follow the store order (reversed to meet the memory dependencies).
21998 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21999 Pair.second.rend());
22000 Changed |= tryToVectorizeSequence<StoreInst>(
22001 ReversedStores, StoreSorter, AreCompatibleStores,
22002 [&](ArrayRef<StoreInst *> Candidates, bool) {
22003 return vectorizeStores(Candidates, R, Attempted);
22004 },
22005 /*MaxVFOnly=*/false, R);
22006 }
22007 return Changed;
22008}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is a multiple of the subvector's length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
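The helpers and command-line options above are internal to this file; the pass itself is exposed as SLPVectorizerPass. Below is a minimal sketch, assuming an existing Module M (the function name runSLPVectorizer is illustrative), of driving the pass through the new pass manager, which is also where the cl::opt flags listed above take effect.

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
using namespace llvm;

void runSLPVectorizer(Module &M) {
  // Standard new-pass-manager analysis setup.
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(SLPVectorizerPass()); // the pass implemented in this file
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}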
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
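The APInt members above are how the vectorizer tracks per-lane information such as demanded elements. A minimal, self-contained sketch of the listed bit operations; the 8-lane width is an arbitrary choice for illustration.

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  APInt DemandedLanes = APInt::getAllOnes(8); // all 8 lanes demanded
  DemandedLanes.clearBit(3);                  // lane 3 no longer demanded
  assert(!DemandedLanes.isAllOnes());
  DemandedLanes.setBit(3);                    // demand it again
  assert(DemandedLanes.isAllOnes());
  APInt None = APInt::getZero(8);
  assert(None.isZero());
  return 0;
}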
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
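Bundles of scalars are passed around as ArrayRef<Value *> throughout this file, and the slicing helpers above are the usual way sub-bundles are formed. A minimal sketch over a plain int array, purely for illustration.

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

int main() {
  int Data[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> VL(Data);
  assert(VL.size() == 6 && VL.front() == 0 && VL.back() == 5);
  ArrayRef<int> Head = VL.take_front(2);  // {0, 1}
  ArrayRef<int> Tail = VL.drop_front(4);  // {4, 5}
  ArrayRef<int> Mid  = VL.slice(2, 2);    // {2, 3}
  assert(Head.equals({0, 1}) && Tail.equals({4, 5}) && Mid.equals({2, 3}));
  return 0;
}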
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:462
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:449
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
reverse_iterator rend()
Definition: BasicBlock.h:467
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:676
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1980
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1875
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2117
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1974
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1971
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
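The predicate helpers above are what isCmpSameOrSwapped-style matching relies on when two compares differ only by operand order. A minimal sketch of the static forms, with an arbitrarily chosen predicate.

#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;

int main() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;                      // a < b
  assert(CmpInst::getSwappedPredicate(P) == CmpInst::ICMP_SGT);  // b > a
  assert(CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE);  // !(a < b)
  return 0;
}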
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
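DenseMap is the workhorse container in this file for value-to-index style bookkeeping. A minimal sketch of the operations listed above, with arbitrary key and value types chosen only for illustration.

#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

int main() {
  DenseMap<int, unsigned> LaneOfValue;
  LaneOfValue.try_emplace(42, 0u);           // inserted
  auto R = LaneOfValue.try_emplace(42, 7u);  // key exists, value unchanged
  assert(!R.second && LaneOfValue.lookup(42) == 0u);
  assert(LaneOfValue.contains(42));
  assert(LaneOfValue.lookup(99) == 0u);      // missing key: default value
  LaneOfValue.erase(42);
  assert(LaneOfValue.empty());
  return 0;
}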
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1072
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:530
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2574
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2186
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:867
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1761
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:889
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2404
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:881
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
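The IRBuilder calls above are the ones the vectorizer uses when emitting vector code. A minimal, self-contained sketch that builds a function reversing a <4 x i32> argument with CreateShuffleVector; the module and function names are placeholders.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  auto *FnTy = FunctionType::get(VecTy, {VecTy}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FnTy, Function::ExternalLinkage, "reverse4", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);

  IRBuilder<> Builder(BB);
  Value *Arg = F->getArg(0);
  int RevMask[] = {3, 2, 1, 0}; // reverse mask; second operand stays poison
  Value *Rev = Builder.CreateShuffleVector(Arg, PoisonValue::get(VecTy),
                                           RevMask, "rev");
  Builder.CreateRet(Rev);
  M.print(outs(), nullptr);
  return 0;
}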
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:300
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:780
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:492
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:296
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:291
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:297
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
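ScalarEvolution is used here mainly to reason about pointer differences, e.g. when deciding whether loads are consecutive. A hedged sketch, assuming the caller already holds the function's ScalarEvolution analysis, of computing a constant byte distance between two pointers; the helper name is illustrative, not an API of this file.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Value.h"
#include <optional>
using namespace llvm;

static std::optional<int64_t> getPointerByteDistance(ScalarEvolution &SE,
                                                     Value *PtrA, Value *PtrB) {
  const SCEV *A = SE.getSCEV(PtrA);
  const SCEV *B = SE.getSCEV(PtrB);
  const SCEV *Dist = SE.getMinusSCEV(B, A); // B - A
  if (const auto *C = dyn_cast<SCEVConstant>(Dist))
    return C->getAPInt().getSExtValue();
  return std::nullopt;                      // distance is not a constant
}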
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
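The static mask classifiers above let the cost model recognize cheap shuffle kinds directly from constant masks. A minimal sketch on plain masks; the element counts are arbitrary for illustration.

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

int main() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[]  = {3, 2, 1, 0};
  int Extract[]  = {2, 3};
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
  int Index = -1;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
  return 0;
}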
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
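SmallBitVector is used throughout this file for per-lane flags, for example which lanes of a mask are undef. A minimal sketch of the queries listed above, with an arbitrary 8-lane width.

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>
using namespace llvm;

int main() {
  SmallBitVector UseMask(8);  // 8 lanes, all initially clear
  assert(UseMask.none() && !UseMask.any());
  UseMask.set();              // mark every lane
  assert(UseMask.all() && UseMask.count() == 8);
  UseMask.reset();
  UseMask.set(2);
  assert(UseMask.test(2) && UseMask.find_first() == 2);
  return 0;
}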
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
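The TargetTransformInfo queries above are the raw material of the SLP cost model. A hedged sketch, assuming the caller already has the function's TargetTransformInfo: it compares four scalar adds against one <4 x i32> add plus the extracts external users would need. The helper and the exact comparison are illustrative, not the pass's actual cost formula.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static bool vectorAddLooksProfitable(const TargetTransformInfo &TTI,
                                     LLVMContext &Ctx) {
  const TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  Type *ScalarTy = Type::getInt32Ty(Ctx);
  auto *VecTy = FixedVectorType::get(ScalarTy, 4);

  // Four independent scalar adds.
  InstructionCost ScalarAdd =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind);
  InstructionCost ScalarCost = ScalarAdd + ScalarAdd + ScalarAdd + ScalarAdd;

  // One vector add plus extracting every lane for external users.
  InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind) +
      TTI.getScalarizationOverhead(VecTy, APInt::getAllOnes(4),
                                   /*Insert=*/false, /*Extract=*/true,
                                   CostKind);
  return VectorCost < ScalarCost;
}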
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
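As a minimal illustration of how these PatternMatch helpers compose (the function name and captured values below are hypothetical, not part of this file):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V is a one-use (X + C) shifted left by some constant,
// capturing X and the constant addend C.
static bool matchShiftedAdd(Value *V, Value *&X, const APInt *&C) {
  const APInt *ShAmt;
  return match(V, m_Shl(m_OneUse(m_Add(m_Value(X), m_APInt(C))),
                        m_APInt(ShAmt)));
}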
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
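A hedged sketch of a typical use: asking whether one scalar load immediately follows another in memory. The helper name below is illustrative.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// True if LB loads the element directly after LA (distance of exactly one
// element of LA's type).
static bool isConsecutiveLoadPair(LoadInst *LA, LoadInst *LB,
                                  const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Dist =
      getPointersDiff(LA->getType(), LA->getPointerOperand(),
                      LB->getType(), LB->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  return Dist && *Dist == 1;
}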
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
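A small sketch of the usual pattern: run a cleanup on every path out of a scope. The flag below is hypothetical.

#include "llvm/ADT/ScopeExit.h"

static void processWithCleanup(bool &InProgress) {
  InProgress = true;
  auto Guard = llvm::make_scope_exit([&] { InProgress = false; });
  // ... work that may return early; InProgress is reset on every exit path.
}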
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
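For illustration, enumerate pairs each element with its index, while zip walks several ranges in lock step (Scalars and Mask are hypothetical containers):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

static void dumpLanes(ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
  for (auto [Lane, V] : enumerate(Scalars))
    dbgs() << "lane " << Lane << ": " << *V << "\n";
  for (auto [V, M] : zip(Scalars, Mask)) // stops at the shorter range
    dbgs() << *V << " -> mask index " << M << "\n";
}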
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7299
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
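A brief sketch, assuming SmallPtrSet operands: set_intersect shrinks its first argument in place, after which it is trivially a subset of the second.

#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static bool intersectInPlace(SmallPtrSet<Value *, 8> &A,
                             const SmallPtrSet<Value *, 8> &B) {
  set_intersect(A, B);        // A := A intersected with B
  return set_is_subset(A, B); // always true after the intersection
}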
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
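A common use, sketched for a plain basic block: erase instructions while walking the block without invalidating the loop iterator.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void eraseTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}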
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:557
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:396
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
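A few concrete values for the rounding and log helpers listed here, as a quick sanity check:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void mathHelperExamples() {
  assert(llvm::PowerOf2Ceil(5) == 8); // round up to a power of two
  assert(llvm::bit_ceil(5u) == 8u);   // <bit>-style spelling of the same idea
  assert(llvm::Log2_32(32) == 5);     // floor log base 2
  assert(llvm::alignTo(10, 8) == 16); // next multiple of 8
}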
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
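For illustration, assuming the usual semantics of these mask builders:

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static void maskExamples() {
  // Selects every second element starting at 0: {0, 2, 4, 6}.
  SmallVector<int, 16> Stride =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Repeats each of 3 lanes twice: {0, 0, 1, 1, 2, 2}.
  SmallVector<int, 16> Rep =
      createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);
  (void)Stride;
  (void)Rep;
}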
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:256
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
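For example, isa<> answers a type question, while dyn_cast<> and cast<> also hand back the typed pointer (the predicate below is illustrative):

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

static bool isVolatileAccess(const Value *V) {
  if (const auto *LI = dyn_cast<LoadInst>(V))
    return LI->isVolatile();
  return isa<StoreInst>(V) && cast<StoreInst>(V)->isVolatile();
}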
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1054
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
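A short sketch of combining heterogeneous fields into one hash_code (the key shape is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
using namespace llvm;

static hash_code hashShuffleKey(unsigned Opcode, ArrayRef<int> Mask) {
  return hash_combine(Opcode,
                      hash_combine_range(Mask.begin(), Mask.end()));
}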
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2138
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.