LLVM 20.0.0git
SLPVectorizer.cpp
Go to the documentation of this file.
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107
108#define SV_NAME "slp-vectorizer"
109#define DEBUG_TYPE "SLP"
110
111STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
112
113DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
114 "Controls which SLP graphs should be vectorized.");
115
116static cl::opt<bool>
117 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
118 cl::desc("Run the SLP vectorization passes"));
119
120static cl::opt<bool>
121 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
122 cl::desc("Enable vectorization for wider vector utilization"));
123
124static cl::opt<int>
126 cl::desc("Only vectorize if you gain more than this "
127 "number "));
128
130 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
131 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
132 "heuristics and makes vectorization decision via cost modeling."));
133
134static cl::opt<bool>
135ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
136 cl::desc("Attempt to vectorize horizontal reductions"));
137
139 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
140 cl::desc(
141 "Attempt to vectorize horizontal reductions feeding into a store"));
142
143static cl::opt<int>
145 cl::desc("Attempt to vectorize for this register size in bits"));
146
149 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
150
151/// Limits the size of scheduling regions in a block.
152/// It avoid long compile times for _very_ large blocks where vector
153/// instructions are spread over a wide range.
154/// This limit is way higher than needed by real-world functions.
155static cl::opt<int>
156ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
157 cl::desc("Limit the size of the SLP scheduling region per block"));
158
160 "slp-min-reg-size", cl::init(128), cl::Hidden,
161 cl::desc("Attempt to vectorize for this register size in bits"));
162
164 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
165 cl::desc("Limit the recursion depth when building a vectorizable tree"));
166
168 "slp-min-tree-size", cl::init(3), cl::Hidden,
169 cl::desc("Only vectorize small trees if they are fully vectorizable"));
170
171// The maximum depth that the look-ahead score heuristic will explore.
172// The higher this value, the higher the compilation time overhead.
174 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
175 cl::desc("The maximum look-ahead depth for operand reordering scores"));
176
177// The maximum depth that the look-ahead score heuristic will explore
178// when it probing among candidates for vectorization tree roots.
179// The higher this value, the higher the compilation time overhead but unlike
180// similar limit for operands ordering this is less frequently used, hence
181// impact of higher value is less noticeable.
183 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
184 cl::desc("The maximum look-ahead depth for searching best rooting option"));
185
187 "slp-min-strided-loads", cl::init(2), cl::Hidden,
188 cl::desc("The minimum number of loads, which should be considered strided, "
189 "if the stride is > 1 or is runtime value"));
190
192 "slp-max-stride", cl::init(8), cl::Hidden,
193 cl::desc("The maximum stride, considered to be profitable."));
194
195static cl::opt<bool>
196 ViewSLPTree("view-slp-tree", cl::Hidden,
197 cl::desc("Display the SLP trees with Graphviz"));
198
200 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
201 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
202
203// Limit the number of alias checks. The limit is chosen so that
204// it has no negative effect on the llvm benchmarks.
205static const unsigned AliasedCheckLimit = 10;
206
207// Limit of the number of uses for potentially transformed instructions/values,
208// used in checks to avoid compile-time explode.
209static constexpr int UsesLimit = 64;
210
211// Another limit for the alias checks: The maximum distance between load/store
212// instructions where alias checks are done.
213// This limit is useful for very large basic blocks.
214static const unsigned MaxMemDepDistance = 160;
215
216/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
217/// regions to be handled.
218static const int MinScheduleRegionSize = 16;
219
220/// Maximum allowed number of operands in the PHI nodes.
221static const unsigned MaxPHINumOperands = 128;
222
223/// Predicate for the element types that the SLP vectorizer supports.
224///
225/// The most important thing to filter here are types which are invalid in LLVM
226/// vectors. We also filter target specific types which have absolutely no
227/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
228/// avoids spending time checking the cost model and realizing that they will
229/// be inevitably scalarized.
230static bool isValidElementType(Type *Ty) {
231 // TODO: Support ScalableVectorType.
232 if (SLPReVec && isa<FixedVectorType>(Ty))
233 Ty = Ty->getScalarType();
234 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
235 !Ty->isPPC_FP128Ty();
236}
237
238/// Returns the type of the given value/instruction \p V. If it is store,
239/// returns the type of its value operand, for Cmp - the types of the compare
240/// operands and for insertelement - the type os the inserted operand.
241/// Otherwise, just the type of the value is returned.
243 if (auto *SI = dyn_cast<StoreInst>(V))
244 return SI->getValueOperand()->getType();
245 if (auto *CI = dyn_cast<CmpInst>(V))
246 return CI->getOperand(0)->getType();
247 if (auto *IE = dyn_cast<InsertElementInst>(V))
248 return IE->getOperand(1)->getType();
249 return V->getType();
250}
251
252/// \returns the number of elements for Ty.
253static unsigned getNumElements(Type *Ty) {
254 assert(!isa<ScalableVectorType>(Ty) &&
255 "ScalableVectorType is not supported.");
256 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
257 return VecTy->getNumElements();
258 return 1;
259}
260
261/// \returns the vector type of ScalarTy based on vectorization factor.
262static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
263 return FixedVectorType::get(ScalarTy->getScalarType(),
264 VF * getNumElements(ScalarTy));
265}
266
267/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
268/// which forms type, which splits by \p TTI into whole vector types during
269/// legalization.
271 Type *Ty, unsigned Sz) {
272 if (!isValidElementType(Ty))
273 return bit_ceil(Sz);
274 // Find the number of elements, which forms full vectors.
275 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
276 if (NumParts == 0 || NumParts >= Sz)
277 return bit_ceil(Sz);
278 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279}
280
281/// Returns the number of elements of the given type \p Ty, not greater than \p
282/// Sz, which forms type, which splits by \p TTI into whole vector types during
283/// legalization.
284static unsigned
286 unsigned Sz) {
287 if (!isValidElementType(Ty))
288 return bit_floor(Sz);
289 // Find the number of elements, which forms full vectors.
290 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291 if (NumParts == 0 || NumParts >= Sz)
292 return bit_floor(Sz);
293 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294 if (RegVF > Sz)
295 return bit_floor(Sz);
296 return (Sz / RegVF) * RegVF;
297}
298
299static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
300 SmallVectorImpl<int> &Mask) {
301 // The ShuffleBuilder implementation use shufflevector to splat an "element".
302 // But the element have different meaning for SLP (scalar) and REVEC
303 // (vector). We need to expand Mask into masks which shufflevector can use
304 // directly.
305 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
306 for (unsigned I : seq<unsigned>(Mask.size()))
307 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
308 I * VecTyNumElements, VecTyNumElements)))
309 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
310 : Mask[I] * VecTyNumElements + J;
311 Mask.swap(NewMask);
312}
313
314/// \returns the number of groups of shufflevector
315/// A group has the following features
316/// 1. All of value in a group are shufflevector.
317/// 2. The mask of all shufflevector is isExtractSubvectorMask.
318/// 3. The mask of all shufflevector uses all of the elements of the source.
319/// e.g., it is 1 group (%0)
320/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
321/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
322/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
323/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
324/// it is 2 groups (%3 and %4)
325/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
326/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
327/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
328/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
329/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
330/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
331/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
332/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
333/// it is 0 group
334/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
335/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
336/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
337/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339 if (VL.empty())
340 return 0;
341 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
342 return 0;
343 auto *SV = cast<ShuffleVectorInst>(VL.front());
344 unsigned SVNumElements =
345 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
346 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
347 if (SVNumElements % ShuffleMaskSize != 0)
348 return 0;
349 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
350 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
351 return 0;
352 unsigned NumGroup = 0;
353 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
354 auto *SV = cast<ShuffleVectorInst>(VL[I]);
355 Value *Src = SV->getOperand(0);
356 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
357 SmallBitVector ExpectedIndex(GroupSize);
358 if (!all_of(Group, [&](Value *V) {
359 auto *SV = cast<ShuffleVectorInst>(V);
360 // From the same source.
361 if (SV->getOperand(0) != Src)
362 return false;
363 int Index;
364 if (!SV->isExtractSubvectorMask(Index))
365 return false;
366 ExpectedIndex.set(Index / ShuffleMaskSize);
367 return true;
368 }))
369 return 0;
370 if (!ExpectedIndex.all())
371 return 0;
372 ++NumGroup;
373 }
374 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
375 return NumGroup;
376}
377
378/// \returns a shufflevector mask which is used to vectorize shufflevectors
379/// e.g.,
380/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
381/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
382/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
383/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
384/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
385/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
386/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
387/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
388/// the result is
389/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
391 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
392 auto *SV = cast<ShuffleVectorInst>(VL.front());
393 unsigned SVNumElements =
394 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
395 SmallVector<int> Mask;
396 unsigned AccumulateLength = 0;
397 for (Value *V : VL) {
398 auto *SV = cast<ShuffleVectorInst>(V);
399 for (int M : SV->getShuffleMask())
400 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
401 : AccumulateLength + M);
402 AccumulateLength += SVNumElements;
403 }
404 return Mask;
405}
406
407/// \returns True if the value is a constant (but not globals/constant
408/// expressions).
409static bool isConstant(Value *V) {
410 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
411}
412
413/// Checks if \p V is one of vector-like instructions, i.e. undef,
414/// insertelement/extractelement with constant indices for fixed vector type or
415/// extractvalue instruction.
417 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
418 !isa<ExtractValueInst, UndefValue>(V))
419 return false;
420 auto *I = dyn_cast<Instruction>(V);
421 if (!I || isa<ExtractValueInst>(I))
422 return true;
423 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
424 return false;
425 if (isa<ExtractElementInst>(I))
426 return isConstant(I->getOperand(1));
427 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
428 return isConstant(I->getOperand(2));
429}
430
431/// Returns power-of-2 number of elements in a single register (part), given the
432/// total number of elements \p Size and number of registers (parts) \p
433/// NumParts.
434static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
435 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
436}
437
438/// Returns correct remaining number of elements, considering total amount \p
439/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
440/// and current register (part) \p Part.
441static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
442 unsigned Part) {
443 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
444}
445
446#if !defined(NDEBUG)
447/// Print a short descriptor of the instruction bundle suitable for debug output.
448static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
449 std::string Result;
450 raw_string_ostream OS(Result);
451 if (Idx >= 0)
452 OS << "Idx: " << Idx << ", ";
453 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
454 return Result;
455}
456#endif
457
458/// \returns true if all of the instructions in \p VL are in the same block or
459/// false otherwise.
461 auto *It = find_if(VL, IsaPred<Instruction>);
462 if (It == VL.end())
463 return false;
464 Instruction *I0 = cast<Instruction>(*It);
466 return true;
467
468 BasicBlock *BB = I0->getParent();
469 for (Value *V : iterator_range(It, VL.end())) {
470 if (isa<PoisonValue>(V))
471 continue;
472 auto *II = dyn_cast<Instruction>(V);
473 if (!II)
474 return false;
475
476 if (BB != II->getParent())
477 return false;
478 }
479 return true;
480}
481
482/// \returns True if all of the values in \p VL are constants (but not
483/// globals/constant expressions).
485 // Constant expressions and globals can't be vectorized like normal integer/FP
486 // constants.
487 return all_of(VL, isConstant);
488}
489
490/// \returns True if all of the values in \p VL are identical or some of them
491/// are UndefValue.
492static bool isSplat(ArrayRef<Value *> VL) {
493 Value *FirstNonUndef = nullptr;
494 for (Value *V : VL) {
495 if (isa<UndefValue>(V))
496 continue;
497 if (!FirstNonUndef) {
498 FirstNonUndef = V;
499 continue;
500 }
501 if (V != FirstNonUndef)
502 return false;
503 }
504 return FirstNonUndef != nullptr;
505}
506
507/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
509 if (auto *Cmp = dyn_cast<CmpInst>(I))
510 return Cmp->isCommutative();
511 if (auto *BO = dyn_cast<BinaryOperator>(I))
512 return BO->isCommutative() ||
513 (BO->getOpcode() == Instruction::Sub &&
514 !BO->hasNUsesOrMore(UsesLimit) &&
515 all_of(
516 BO->uses(),
517 [](const Use &U) {
518 // Commutative, if icmp eq/ne sub, 0
519 CmpPredicate Pred;
520 if (match(U.getUser(),
521 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
522 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
523 return true;
524 // Commutative, if abs(sub nsw, true) or abs(sub, false).
525 ConstantInt *Flag;
526 return match(U.getUser(),
527 m_Intrinsic<Intrinsic::abs>(
528 m_Specific(U.get()), m_ConstantInt(Flag))) &&
529 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
530 Flag->isOne());
531 })) ||
532 (BO->getOpcode() == Instruction::FSub &&
533 !BO->hasNUsesOrMore(UsesLimit) &&
534 all_of(BO->uses(), [](const Use &U) {
535 return match(U.getUser(),
536 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
537 }));
538 return I->isCommutative();
539}
540
541template <typename T>
542static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
543 unsigned Offset) {
544 static_assert(std::is_same_v<T, InsertElementInst> ||
545 std::is_same_v<T, ExtractElementInst>,
546 "unsupported T");
547 int Index = Offset;
548 if (const auto *IE = dyn_cast<T>(Inst)) {
549 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
550 if (!VT)
551 return std::nullopt;
552 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
553 if (!CI)
554 return std::nullopt;
555 if (CI->getValue().uge(VT->getNumElements()))
556 return std::nullopt;
557 Index *= VT->getNumElements();
558 Index += CI->getZExtValue();
559 return Index;
560 }
561 return std::nullopt;
562}
563
564/// \returns inserting or extracting index of InsertElement, ExtractElement or
565/// InsertValue instruction, using Offset as base offset for index.
566/// \returns std::nullopt if the index is not an immediate.
567static std::optional<unsigned> getElementIndex(const Value *Inst,
568 unsigned Offset = 0) {
569 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
570 return Index;
571 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
572 return Index;
573
574 int Index = Offset;
575
576 const auto *IV = dyn_cast<InsertValueInst>(Inst);
577 if (!IV)
578 return std::nullopt;
579
580 Type *CurrentType = IV->getType();
581 for (unsigned I : IV->indices()) {
582 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
583 Index *= ST->getNumElements();
584 CurrentType = ST->getElementType(I);
585 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
586 Index *= AT->getNumElements();
587 CurrentType = AT->getElementType();
588 } else {
589 return std::nullopt;
590 }
591 Index += I;
592 }
593 return Index;
594}
595
596namespace {
597/// Specifies the way the mask should be analyzed for undefs/poisonous elements
598/// in the shuffle mask.
599enum class UseMask {
600 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
601 ///< check for the mask elements for the first argument (mask
602 ///< indices are in range [0:VF)).
603 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
604 ///< for the mask elements for the second argument (mask indices
605 ///< are in range [VF:2*VF))
606 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
607 ///< future shuffle elements and mark them as ones as being used
608 ///< in future. Non-undef elements are considered as unused since
609 ///< they're already marked as used in the mask.
610};
611} // namespace
612
613/// Prepares a use bitset for the given mask either for the first argument or
614/// for the second.
616 UseMask MaskArg) {
617 SmallBitVector UseMask(VF, true);
618 for (auto [Idx, Value] : enumerate(Mask)) {
619 if (Value == PoisonMaskElem) {
620 if (MaskArg == UseMask::UndefsAsMask)
621 UseMask.reset(Idx);
622 continue;
623 }
624 if (MaskArg == UseMask::FirstArg && Value < VF)
625 UseMask.reset(Value);
626 else if (MaskArg == UseMask::SecondArg && Value >= VF)
627 UseMask.reset(Value - VF);
628 }
629 return UseMask;
630}
631
632/// Checks if the given value is actually an undefined constant vector.
633/// Also, if the \p UseMask is not empty, tries to check if the non-masked
634/// elements actually mask the insertelement buildvector, if any.
635template <bool IsPoisonOnly = false>
637 const SmallBitVector &UseMask = {}) {
638 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
639 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
640 if (isa<T>(V))
641 return Res;
642 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
643 if (!VecTy)
644 return Res.reset();
645 auto *C = dyn_cast<Constant>(V);
646 if (!C) {
647 if (!UseMask.empty()) {
648 const Value *Base = V;
649 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
650 Base = II->getOperand(0);
651 if (isa<T>(II->getOperand(1)))
652 continue;
653 std::optional<unsigned> Idx = getElementIndex(II);
654 if (!Idx) {
655 Res.reset();
656 return Res;
657 }
658 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
659 Res.reset(*Idx);
660 }
661 // TODO: Add analysis for shuffles here too.
662 if (V == Base) {
663 Res.reset();
664 } else {
665 SmallBitVector SubMask(UseMask.size(), false);
666 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
667 }
668 } else {
669 Res.reset();
670 }
671 return Res;
672 }
673 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
674 if (Constant *Elem = C->getAggregateElement(I))
675 if (!isa<T>(Elem) &&
676 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
677 Res.reset(I);
678 }
679 return Res;
680}
681
682/// Checks if the vector of instructions can be represented as a shuffle, like:
683/// %x0 = extractelement <4 x i8> %x, i32 0
684/// %x3 = extractelement <4 x i8> %x, i32 3
685/// %y1 = extractelement <4 x i8> %y, i32 1
686/// %y2 = extractelement <4 x i8> %y, i32 2
687/// %x0x0 = mul i8 %x0, %x0
688/// %x3x3 = mul i8 %x3, %x3
689/// %y1y1 = mul i8 %y1, %y1
690/// %y2y2 = mul i8 %y2, %y2
691/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
692/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
693/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
694/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
695/// ret <4 x i8> %ins4
696/// can be transformed into:
697/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
698/// i32 6>
699/// %2 = mul <4 x i8> %1, %1
700/// ret <4 x i8> %2
701/// Mask will return the Shuffle Mask equivalent to the extracted elements.
702/// TODO: Can we split off and reuse the shuffle mask detection from
703/// ShuffleVectorInst/getShuffleCost?
704static std::optional<TargetTransformInfo::ShuffleKind>
706 AssumptionCache *AC) {
707 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
708 if (It == VL.end())
709 return std::nullopt;
710 unsigned Size =
711 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
712 auto *EI = dyn_cast<ExtractElementInst>(V);
713 if (!EI)
714 return S;
715 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
716 if (!VTy)
717 return S;
718 return std::max(S, VTy->getNumElements());
719 });
720
721 Value *Vec1 = nullptr;
722 Value *Vec2 = nullptr;
723 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
724 auto *EE = dyn_cast<ExtractElementInst>(V);
725 if (!EE)
726 return false;
727 Value *Vec = EE->getVectorOperand();
728 if (isa<UndefValue>(Vec))
729 return false;
730 return isGuaranteedNotToBePoison(Vec, AC);
731 });
732 enum ShuffleMode { Unknown, Select, Permute };
733 ShuffleMode CommonShuffleMode = Unknown;
734 Mask.assign(VL.size(), PoisonMaskElem);
735 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
736 // Undef can be represented as an undef element in a vector.
737 if (isa<UndefValue>(VL[I]))
738 continue;
739 auto *EI = cast<ExtractElementInst>(VL[I]);
740 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
741 return std::nullopt;
742 auto *Vec = EI->getVectorOperand();
743 // We can extractelement from undef or poison vector.
744 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
745 continue;
746 // All vector operands must have the same number of vector elements.
747 if (isa<UndefValue>(Vec)) {
748 Mask[I] = I;
749 } else {
750 if (isa<UndefValue>(EI->getIndexOperand()))
751 continue;
752 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
753 if (!Idx)
754 return std::nullopt;
755 // Undefined behavior if Idx is negative or >= Size.
756 if (Idx->getValue().uge(Size))
757 continue;
758 unsigned IntIdx = Idx->getValue().getZExtValue();
759 Mask[I] = IntIdx;
760 }
761 if (isUndefVector(Vec).all() && HasNonUndefVec)
762 continue;
763 // For correct shuffling we have to have at most 2 different vector operands
764 // in all extractelement instructions.
765 if (!Vec1 || Vec1 == Vec) {
766 Vec1 = Vec;
767 } else if (!Vec2 || Vec2 == Vec) {
768 Vec2 = Vec;
769 Mask[I] += Size;
770 } else {
771 return std::nullopt;
772 }
773 if (CommonShuffleMode == Permute)
774 continue;
775 // If the extract index is not the same as the operation number, it is a
776 // permutation.
777 if (Mask[I] % Size != I) {
778 CommonShuffleMode = Permute;
779 continue;
780 }
781 CommonShuffleMode = Select;
782 }
783 // If we're not crossing lanes in different vectors, consider it as blending.
784 if (CommonShuffleMode == Select && Vec2)
786 // If Vec2 was never used, we have a permutation of a single vector, otherwise
787 // we have permutation of 2 vectors.
790}
791
792/// \returns True if Extract{Value,Element} instruction extracts element Idx.
793static std::optional<unsigned> getExtractIndex(Instruction *E) {
794 unsigned Opcode = E->getOpcode();
795 assert((Opcode == Instruction::ExtractElement ||
796 Opcode == Instruction::ExtractValue) &&
797 "Expected extractelement or extractvalue instruction.");
798 if (Opcode == Instruction::ExtractElement) {
799 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
800 if (!CI)
801 return std::nullopt;
802 return CI->getZExtValue();
803 }
804 auto *EI = cast<ExtractValueInst>(E);
805 if (EI->getNumIndices() != 1)
806 return std::nullopt;
807 return *EI->idx_begin();
808}
809
810namespace {
811
812/// Main data required for vectorization of instructions.
813class InstructionsState {
814 /// The main/alternate instruction. MainOp is also VL0.
815 Instruction *MainOp = nullptr;
816 Instruction *AltOp = nullptr;
817
818public:
819 Instruction *getMainOp() const {
820 assert(valid() && "InstructionsState is invalid.");
821 return MainOp;
822 }
823
824 Instruction *getAltOp() const {
825 assert(valid() && "InstructionsState is invalid.");
826 return AltOp;
827 }
828
829 /// The main/alternate opcodes for the list of instructions.
830 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
831
832 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
833
834 /// Some of the instructions in the list have alternate opcodes.
835 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
836
837 bool isOpcodeOrAlt(Instruction *I) const {
838 unsigned CheckedOpcode = I->getOpcode();
839 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
840 }
841
842 /// Checks if the current state is valid, i.e. has non-null MainOp
843 bool valid() const { return MainOp && AltOp; }
844
845 explicit operator bool() const { return valid(); }
846
847 InstructionsState() = delete;
848 InstructionsState(Instruction *MainOp, Instruction *AltOp)
849 : MainOp(MainOp), AltOp(AltOp) {}
850 static InstructionsState invalid() { return {nullptr, nullptr}; }
851};
852
853} // end anonymous namespace
854
855/// \returns true if \p Opcode is allowed as part of the main/alternate
856/// instruction for SLP vectorization.
857///
858/// Example of unsupported opcode is SDIV that can potentially cause UB if the
859/// "shuffled out" lane would result in division by zero.
860static bool isValidForAlternation(unsigned Opcode) {
861 if (Instruction::isIntDivRem(Opcode))
862 return false;
863
864 return true;
865}
866
867static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
868 const TargetLibraryInfo &TLI);
869
870/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
871/// compatible instructions or constants, or just some other regular values.
872static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
873 Value *Op1, const TargetLibraryInfo &TLI) {
874 return (isConstant(BaseOp0) && isConstant(Op0)) ||
875 (isConstant(BaseOp1) && isConstant(Op1)) ||
876 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
877 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
878 BaseOp0 == Op0 || BaseOp1 == Op1 ||
879 getSameOpcode({BaseOp0, Op0}, TLI) ||
880 getSameOpcode({BaseOp1, Op1}, TLI);
881}
882
883/// \returns true if a compare instruction \p CI has similar "look" and
884/// same predicate as \p BaseCI, "as is" or with its operands and predicate
885/// swapped, false otherwise.
886static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
887 const TargetLibraryInfo &TLI) {
888 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
889 "Assessing comparisons of different types?");
890 CmpInst::Predicate BasePred = BaseCI->getPredicate();
891 CmpInst::Predicate Pred = CI->getPredicate();
893
894 Value *BaseOp0 = BaseCI->getOperand(0);
895 Value *BaseOp1 = BaseCI->getOperand(1);
896 Value *Op0 = CI->getOperand(0);
897 Value *Op1 = CI->getOperand(1);
898
899 return (BasePred == Pred &&
900 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
901 (BasePred == SwappedPred &&
902 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
903}
904
905/// \returns analysis of the Instructions in \p VL described in
906/// InstructionsState, the Opcode that we suppose the whole list
907/// could be vectorized even if its structure is diverse.
908static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
909 const TargetLibraryInfo &TLI) {
910 // Make sure these are all Instructions.
911 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
912 return InstructionsState::invalid();
913
914 auto *It = find_if(VL, IsaPred<Instruction>);
915 if (It == VL.end())
916 return InstructionsState::invalid();
917
918 Value *V = *It;
919 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
920 if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
921 (VL.size() == 2 && InstCnt < 2))
922 return InstructionsState::invalid();
923
924 bool IsCastOp = isa<CastInst>(V);
925 bool IsBinOp = isa<BinaryOperator>(V);
926 bool IsCmpOp = isa<CmpInst>(V);
927 CmpInst::Predicate BasePred =
928 IsCmpOp ? cast<CmpInst>(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE;
929 unsigned Opcode = cast<Instruction>(V)->getOpcode();
930 unsigned AltOpcode = Opcode;
931 unsigned AltIndex = std::distance(VL.begin(), It);
932
933 bool SwappedPredsCompatible = [&]() {
934 if (!IsCmpOp)
935 return false;
936 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
937 UniquePreds.insert(BasePred);
938 UniqueNonSwappedPreds.insert(BasePred);
939 for (Value *V : VL) {
940 auto *I = dyn_cast<CmpInst>(V);
941 if (!I)
942 return false;
943 CmpInst::Predicate CurrentPred = I->getPredicate();
944 CmpInst::Predicate SwappedCurrentPred =
945 CmpInst::getSwappedPredicate(CurrentPred);
946 UniqueNonSwappedPreds.insert(CurrentPred);
947 if (!UniquePreds.contains(CurrentPred) &&
948 !UniquePreds.contains(SwappedCurrentPred))
949 UniquePreds.insert(CurrentPred);
950 }
951 // Total number of predicates > 2, but if consider swapped predicates
952 // compatible only 2, consider swappable predicates as compatible opcodes,
953 // not alternate.
954 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
955 }();
956 // Check for one alternate opcode from another BinaryOperator.
957 // TODO - generalize to support all operators (types, calls etc.).
958 auto *IBase = cast<Instruction>(V);
959 Intrinsic::ID BaseID = 0;
960 SmallVector<VFInfo> BaseMappings;
961 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
963 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
964 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
965 return InstructionsState::invalid();
966 }
967 bool AnyPoison = InstCnt != VL.size();
968 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
969 auto *I = dyn_cast<Instruction>(VL[Cnt]);
970 if (!I)
971 continue;
972
973 // Cannot combine poison and divisions.
974 // TODO: do some smart analysis of the CallInsts to exclude divide-like
975 // intrinsics/functions only.
976 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
977 return InstructionsState::invalid();
978 unsigned InstOpcode = I->getOpcode();
979 if (IsBinOp && isa<BinaryOperator>(I)) {
980 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
981 continue;
982 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
983 isValidForAlternation(Opcode)) {
984 AltOpcode = InstOpcode;
985 AltIndex = Cnt;
986 continue;
987 }
988 } else if (IsCastOp && isa<CastInst>(I)) {
989 Value *Op0 = IBase->getOperand(0);
990 Type *Ty0 = Op0->getType();
991 Value *Op1 = I->getOperand(0);
992 Type *Ty1 = Op1->getType();
993 if (Ty0 == Ty1) {
994 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
995 continue;
996 if (Opcode == AltOpcode) {
998 isValidForAlternation(InstOpcode) &&
999 "Cast isn't safe for alternation, logic needs to be updated!");
1000 AltOpcode = InstOpcode;
1001 AltIndex = Cnt;
1002 continue;
1003 }
1004 }
1005 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
1006 auto *BaseInst = cast<CmpInst>(V);
1007 Type *Ty0 = BaseInst->getOperand(0)->getType();
1008 Type *Ty1 = Inst->getOperand(0)->getType();
1009 if (Ty0 == Ty1) {
1010 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1011 assert(InstOpcode == AltOpcode &&
1012 "Alternate instructions are only supported by BinaryOperator "
1013 "and CastInst.");
1014 // Check for compatible operands. If the corresponding operands are not
1015 // compatible - need to perform alternate vectorization.
1016 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1017 CmpInst::Predicate SwappedCurrentPred =
1018 CmpInst::getSwappedPredicate(CurrentPred);
1019
1020 if ((E == 2 || SwappedPredsCompatible) &&
1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1022 continue;
1023
1024 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1025 continue;
1026 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
1027 if (AltIndex) {
1028 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1029 continue;
1030 } else if (BasePred != CurrentPred) {
1031 assert(
1032 isValidForAlternation(InstOpcode) &&
1033 "CmpInst isn't safe for alternation, logic needs to be updated!");
1034 AltIndex = Cnt;
1035 continue;
1036 }
1037 CmpInst::Predicate AltPred = AltInst->getPredicate();
1038 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1040 continue;
1041 }
1042 } else if (InstOpcode == Opcode) {
1043 assert(InstOpcode == AltOpcode &&
1044 "Alternate instructions are only supported by BinaryOperator and "
1045 "CastInst.");
1046 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1047 if (Gep->getNumOperands() != 2 ||
1048 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
1049 return InstructionsState::invalid();
1050 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1052 return InstructionsState::invalid();
1053 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1054 auto *BaseLI = cast<LoadInst>(IBase);
1055 if (!LI->isSimple() || !BaseLI->isSimple())
1056 return InstructionsState::invalid();
1057 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1058 auto *CallBase = cast<CallInst>(IBase);
1059 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1060 return InstructionsState::invalid();
1061 if (Call->hasOperandBundles() &&
1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1064 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1065 CallBase->op_begin() +
1067 return InstructionsState::invalid();
1069 if (ID != BaseID)
1070 return InstructionsState::invalid();
1071 if (!ID) {
1072 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1073 if (Mappings.size() != BaseMappings.size() ||
1074 Mappings.front().ISA != BaseMappings.front().ISA ||
1075 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1076 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1077 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1078 Mappings.front().Shape.Parameters !=
1079 BaseMappings.front().Shape.Parameters)
1080 return InstructionsState::invalid();
1081 }
1082 }
1083 continue;
1084 }
1085 return InstructionsState::invalid();
1086 }
1087
1088 return InstructionsState(cast<Instruction>(V),
1089 cast<Instruction>(VL[AltIndex]));
1090}
1091
1092/// \returns true if all of the values in \p VL have the same type or false
1093/// otherwise.
1095 Type *Ty = VL.front()->getType();
1096 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1097}
1098
1099/// \returns True if in-tree use also needs extract. This refers to
1100/// possible scalar operand in vectorized instruction.
1101static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1102 TargetLibraryInfo *TLI,
1103 const TargetTransformInfo *TTI) {
1104 if (!UserInst)
1105 return false;
1106 unsigned Opcode = UserInst->getOpcode();
1107 switch (Opcode) {
1108 case Instruction::Load: {
1109 LoadInst *LI = cast<LoadInst>(UserInst);
1110 return (LI->getPointerOperand() == Scalar);
1111 }
1112 case Instruction::Store: {
1113 StoreInst *SI = cast<StoreInst>(UserInst);
1114 return (SI->getPointerOperand() == Scalar);
1115 }
1116 case Instruction::Call: {
1117 CallInst *CI = cast<CallInst>(UserInst);
1119 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1120 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1121 Arg.value().get() == Scalar;
1122 });
1123 }
1124 default:
1125 return false;
1126 }
1127}
1128
1129/// \returns the AA location that is being access by the instruction.
1131 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1132 return MemoryLocation::get(SI);
1133 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1134 return MemoryLocation::get(LI);
1135 return MemoryLocation();
1136}
1137
1138/// \returns True if the instruction is not a volatile or atomic load/store.
1139static bool isSimple(Instruction *I) {
1140 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1141 return LI->isSimple();
1142 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1143 return SI->isSimple();
1144 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1145 return !MI->isVolatile();
1146 return true;
1147}
1148
1149/// Shuffles \p Mask in accordance with the given \p SubMask.
1150/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1151/// one but two input vectors.
1152static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1153 bool ExtendingManyInputs = false) {
1154 if (SubMask.empty())
1155 return;
1156 assert(
1157 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1158 // Check if input scalars were extended to match the size of other node.
1159 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1160 "SubMask with many inputs support must be larger than the mask.");
1161 if (Mask.empty()) {
1162 Mask.append(SubMask.begin(), SubMask.end());
1163 return;
1164 }
1165 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1166 int TermValue = std::min(Mask.size(), SubMask.size());
1167 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1168 if (SubMask[I] == PoisonMaskElem ||
1169 (!ExtendingManyInputs &&
1170 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1171 continue;
1172 NewMask[I] = Mask[SubMask[I]];
1173 }
1174 Mask.swap(NewMask);
1175}
1176
1177/// Order may have elements assigned special value (size) which is out of
1178/// bounds. Such indices only appear on places which correspond to undef values
1179/// (see canReuseExtract for details) and used in order to avoid undef values
1180/// have effect on operands ordering.
1181/// The first loop below simply finds all unused indices and then the next loop
1182/// nest assigns these indices for undef values positions.
1183/// As an example below Order has two undef positions and they have assigned
1184/// values 3 and 7 respectively:
1185/// before: 6 9 5 4 9 2 1 0
1186/// after: 6 3 5 4 7 2 1 0
1188 const unsigned Sz = Order.size();
1189 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1190 SmallBitVector MaskedIndices(Sz);
1191 for (unsigned I = 0; I < Sz; ++I) {
1192 if (Order[I] < Sz)
1193 UnusedIndices.reset(Order[I]);
1194 else
1195 MaskedIndices.set(I);
1196 }
1197 if (MaskedIndices.none())
1198 return;
1199 assert(UnusedIndices.count() == MaskedIndices.count() &&
1200 "Non-synced masked/available indices.");
1201 int Idx = UnusedIndices.find_first();
1202 int MIdx = MaskedIndices.find_first();
1203 while (MIdx >= 0) {
1204 assert(Idx >= 0 && "Indices must be synced.");
1205 Order[MIdx] = Idx;
1206 Idx = UnusedIndices.find_next(Idx);
1207 MIdx = MaskedIndices.find_next(MIdx);
1208 }
1209}
1210
1211/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1212/// Opcode1.
1214 unsigned Opcode1) {
1215 Type *ScalarTy = VL[0]->getType();
1216 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1217 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1218 for (unsigned Lane : seq<unsigned>(VL.size())) {
1219 if (isa<PoisonValue>(VL[Lane]))
1220 continue;
1221 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1222 OpcodeMask.set(Lane * ScalarTyNumElements,
1223 Lane * ScalarTyNumElements + ScalarTyNumElements);
1224 }
1225 return OpcodeMask;
1226}
1227
1228namespace llvm {
1229
1231 SmallVectorImpl<int> &Mask) {
1232 Mask.clear();
1233 const unsigned E = Indices.size();
1234 Mask.resize(E, PoisonMaskElem);
1235 for (unsigned I = 0; I < E; ++I)
1236 Mask[Indices[I]] = I;
1237}
1238
1239/// Reorders the list of scalars in accordance with the given \p Mask.
1241 ArrayRef<int> Mask) {
1242 assert(!Mask.empty() && "Expected non-empty mask.");
1243 SmallVector<Value *> Prev(Scalars.size(),
1244 PoisonValue::get(Scalars.front()->getType()));
1245 Prev.swap(Scalars);
1246 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1247 if (Mask[I] != PoisonMaskElem)
1248 Scalars[Mask[I]] = Prev[I];
1249}
1250
1251/// Checks if the provided value does not require scheduling. It does not
1252/// require scheduling if this is not an instruction or it is an instruction
1253/// that does not read/write memory and all operands are either not instructions
1254/// or phi nodes or instructions from different blocks.
1256 auto *I = dyn_cast<Instruction>(V);
1257 if (!I)
1258 return true;
1259 return !mayHaveNonDefUseDependency(*I) &&
1260 all_of(I->operands(), [I](Value *V) {
1261 auto *IO = dyn_cast<Instruction>(V);
1262 if (!IO)
1263 return true;
1264 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1265 });
1266}
1267
1268/// Checks if the provided value does not require scheduling. It does not
1269/// require scheduling if this is not an instruction or it is an instruction
1270/// that does not read/write memory and all users are phi nodes or instructions
1271/// from the different blocks.
1272static bool isUsedOutsideBlock(Value *V) {
1273 auto *I = dyn_cast<Instruction>(V);
1274 if (!I)
1275 return true;
1276 // Limits the number of uses to save compile time.
1277 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1278 all_of(I->users(), [I](User *U) {
1279 auto *IU = dyn_cast<Instruction>(U);
1280 if (!IU)
1281 return true;
1282 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1283 });
1284}
1285
1286/// Checks if the specified value does not require scheduling. It does not
1287/// require scheduling if all operands and all users do not need to be scheduled
1288/// in the current basic block.
1291}
1292
1293/// Checks if the specified array of instructions does not require scheduling.
1294/// It is so if all either instructions have operands that do not require
1295/// scheduling or their users do not require scheduling since they are phis or
1296/// in other basic blocks.
1298 return !VL.empty() &&
1300}
1301
1302/// Returns true if widened type of \p Ty elements with size \p Sz represents
1303/// full vector type, i.e. adding extra element results in extra parts upon type
1304/// legalization.
1306 unsigned Sz) {
1307 if (Sz <= 1)
1308 return false;
1309 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1310 return false;
1311 if (has_single_bit(Sz))
1312 return true;
1313 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1314 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1315 Sz % NumParts == 0;
1316}
1317
1318namespace slpvectorizer {
1319
1320/// Bottom Up SLP Vectorizer.
1321class BoUpSLP {
1322 struct TreeEntry;
1323 struct ScheduleData;
1326
1327public:
1328 /// Tracks the state we can represent the loads in the given sequence.
1329 enum class LoadsState {
1330 Gather,
1331 Vectorize,
1334 };
1335
1342
1344 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1347 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1348 AC(AC), DB(DB), DL(DL), ORE(ORE),
1349 Builder(Se->getContext(), TargetFolder(*DL)) {
1350 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1351 // Use the vector register size specified by the target unless overridden
1352 // by a command-line option.
1353 // TODO: It would be better to limit the vectorization factor based on
1354 // data type rather than just register size. For example, x86 AVX has
1355 // 256-bit registers, but it does not support integer operations
1356 // at that width (that requires AVX2).
1357 if (MaxVectorRegSizeOption.getNumOccurrences())
1358 MaxVecRegSize = MaxVectorRegSizeOption;
1359 else
1360 MaxVecRegSize =
1362 .getFixedValue();
1363
1364 if (MinVectorRegSizeOption.getNumOccurrences())
1365 MinVecRegSize = MinVectorRegSizeOption;
1366 else
1367 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1368 }
1369
1370 /// Vectorize the tree that starts with the elements in \p VL.
1371 /// Returns the vectorized root.
1373
1374 /// Vectorize the tree but with the list of externally used values \p
1375 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1376 /// generated extractvalue instructions.
1377 Value *
1378 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1379 Instruction *ReductionRoot = nullptr);
1380
1381 /// \returns the cost incurred by unwanted spills and fills, caused by
1382 /// holding live values over call sites.
1384
1385 /// \returns the vectorization cost of the subtree that starts at \p VL.
1386 /// A negative number means that this is profitable.
1387 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1388
1389 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1390 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1391 void buildTree(ArrayRef<Value *> Roots,
1392 const SmallDenseSet<Value *> &UserIgnoreLst);
1393
1394 /// Construct a vectorizable tree that starts at \p Roots.
1395 void buildTree(ArrayRef<Value *> Roots);
1396
1397 /// Returns whether the root node has in-tree uses.
1399 return !VectorizableTree.empty() &&
1400 !VectorizableTree.front()->UserTreeIndices.empty();
1401 }
1402
1403 /// Return the scalars of the root node.
1405 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1406 return VectorizableTree.front()->Scalars;
1407 }
1408
1409 /// Returns the type/is-signed info for the root node in the graph without
1410 /// casting.
1411 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1412 const TreeEntry &Root = *VectorizableTree.front().get();
1413 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1414 !Root.Scalars.front()->getType()->isIntegerTy())
1415 return std::nullopt;
1416 auto It = MinBWs.find(&Root);
1417 if (It != MinBWs.end())
1418 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1419 It->second.first),
1420 It->second.second);
1421 if (Root.getOpcode() == Instruction::ZExt ||
1422 Root.getOpcode() == Instruction::SExt)
1423 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1424 Root.getOpcode() == Instruction::SExt);
1425 return std::nullopt;
1426 }
1427
1428 /// Checks if the root graph node can be emitted with narrower bitwidth at
1429 /// codegen and returns it signedness, if so.
1431 return MinBWs.at(VectorizableTree.front().get()).second;
1432 }
1433
1434 /// Returns reduction type after minbitdth analysis.
1436 if (ReductionBitWidth == 0 ||
1437 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1438 ReductionBitWidth >=
1439 DL->getTypeSizeInBits(
1440 VectorizableTree.front()->Scalars.front()->getType()))
1441 return getWidenedType(
1442 VectorizableTree.front()->Scalars.front()->getType(),
1443 VectorizableTree.front()->getVectorFactor());
1444 return getWidenedType(
1446 VectorizableTree.front()->Scalars.front()->getContext(),
1447 ReductionBitWidth),
1448 VectorizableTree.front()->getVectorFactor());
1449 }
1450
1451 /// Builds external uses of the vectorized scalars, i.e. the list of
1452 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1453 /// ExternallyUsedValues contains additional list of external uses to handle
1454 /// vectorization of reductions.
1455 void
1456 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1457
1458 /// Transforms graph nodes to target specific representations, if profitable.
1459 void transformNodes();
1460
1461 /// Clear the internal data structures that are created by 'buildTree'.
1462 void deleteTree() {
1463 VectorizableTree.clear();
1464 ScalarToTreeEntry.clear();
1465 MultiNodeScalars.clear();
1466 MustGather.clear();
1467 NonScheduledFirst.clear();
1468 EntryToLastInstruction.clear();
1469 LoadEntriesToVectorize.clear();
1470 IsGraphTransformMode = false;
1471 GatheredLoadsEntriesFirst.reset();
1472 ExternalUses.clear();
1473 ExternalUsesAsOriginalScalar.clear();
1474 for (auto &Iter : BlocksSchedules) {
1475 BlockScheduling *BS = Iter.second.get();
1476 BS->clear();
1477 }
1478 MinBWs.clear();
1479 ReductionBitWidth = 0;
1480 BaseGraphSize = 1;
1481 CastMaxMinBWSizes.reset();
1482 ExtraBitWidthNodes.clear();
1483 InstrElementSize.clear();
1484 UserIgnoreList = nullptr;
1485 PostponedGathers.clear();
1486 ValueToGatherNodes.clear();
1487 }
1488
1489 unsigned getTreeSize() const { return VectorizableTree.size(); }
1490
1491 /// Returns the base graph size, before any transformations.
1492 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1493
1494 /// Perform LICM and CSE on the newly generated gather sequences.
1496
1497 /// Does this non-empty order represent an identity order? Identity
1498 /// should be represented as an empty order, so this is used to
1499 /// decide if we can canonicalize a computed order. Undef elements
1500 /// (represented as size) are ignored.
1502 assert(!Order.empty() && "expected non-empty order");
1503 const unsigned Sz = Order.size();
1504 return all_of(enumerate(Order), [&](const auto &P) {
1505 return P.value() == P.index() || P.value() == Sz;
1506 });
1507 }
1508
1509 /// Checks if the specified gather tree entry \p TE can be represented as a
1510 /// shuffled vector entry + (possibly) permutation with other gathers. It
1511 /// implements the checks only for possibly ordered scalars (Loads,
1512 /// ExtractElement, ExtractValue), which can be part of the graph.
1513 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1514
1515 /// Sort loads into increasing pointers offsets to allow greater clustering.
1516 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1517
1518 /// Gets reordering data for the given tree entry. If the entry is vectorized
1519 /// - just return ReorderIndices, otherwise check if the scalars can be
1520 /// reordered and return the most optimal order.
1521 /// \return std::nullopt if ordering is not important, empty order, if
1522 /// identity order is important, or the actual order.
1523 /// \param TopToBottom If true, include the order of vectorized stores and
1524 /// insertelement nodes, otherwise skip them.
1525 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1526 bool TopToBottom);
1527
1528 /// Reorders the current graph to the most profitable order starting from the
1529 /// root node to the leaf nodes. The best order is chosen only from the nodes
1530 /// of the same size (vectorization factor). Smaller nodes are considered
1531 /// parts of subgraph with smaller VF and they are reordered independently. We
1532 /// can make it because we still need to extend smaller nodes to the wider VF
1533 /// and we can merge reordering shuffles with the widening shuffles.
1534 void reorderTopToBottom();
1535
1536 /// Reorders the current graph to the most profitable order starting from
1537 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1538 /// number of reshuffles if the leaf nodes use the same order. In this case we
1539 /// can merge the orders and just shuffle user node instead of shuffling its
1540 /// operands. Plus, even the leaf nodes have different orders, it allows to
1541 /// sink reordering in the graph closer to the root node and merge it later
1542 /// during analysis.
1543 void reorderBottomToTop(bool IgnoreReorder = false);
1544
1545 /// \return The vector element size in bits to use when vectorizing the
1546 /// expression tree ending at \p V. If V is a store, the size is the width of
1547 /// the stored value. Otherwise, the size is the width of the largest loaded
1548 /// value reaching V. This method is used by the vectorizer to calculate
1549 /// vectorization factors.
1550 unsigned getVectorElementSize(Value *V);
1551
1552 /// Compute the minimum type sizes required to represent the entries in a
1553 /// vectorizable tree.
1555
1556 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1557 unsigned getMaxVecRegSize() const {
1558 return MaxVecRegSize;
1559 }
1560
1561 // \returns minimum vector register size as set by cl::opt.
1562 unsigned getMinVecRegSize() const {
1563 return MinVecRegSize;
1564 }
1565
1566 unsigned getMinVF(unsigned Sz) const {
1567 return std::max(2U, getMinVecRegSize() / Sz);
1568 }
1569
1570 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1571 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1572 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1573 return MaxVF ? MaxVF : UINT_MAX;
1574 }
1575
1576 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1577 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1578 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1579 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1580 ///
1581 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1582 unsigned canMapToVector(Type *T) const;
1583
1584 /// \returns True if the VectorizableTree is both tiny and not fully
1585 /// vectorizable. We do not vectorize such trees.
1586 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1587
1588 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1589 /// It may happen, if all gather nodes are loads and they cannot be
1590 /// "clusterized". In this case even subgraphs cannot be vectorized more
1591 /// effectively than the base graph.
1592 bool isTreeNotExtendable() const;
1593
1594 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1595 /// can be load combined in the backend. Load combining may not be allowed in
1596 /// the IR optimizer, so we do not want to alter the pattern. For example,
1597 /// partially transforming a scalar bswap() pattern into vector code is
1598 /// effectively impossible for the backend to undo.
1599 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1600 /// may not be necessary.
1601 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1602
1603 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1604 /// can be load combined in the backend. Load combining may not be allowed in
1605 /// the IR optimizer, so we do not want to alter the pattern. For example,
1606 /// partially transforming a scalar bswap() pattern into vector code is
1607 /// effectively impossible for the backend to undo.
1608 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1609 /// may not be necessary.
1610 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1611
1612 /// Checks if the given array of loads can be represented as a vectorized,
1613 /// scatter or just simple gather.
1614 /// \param VL list of loads.
1615 /// \param VL0 main load value.
1616 /// \param Order returned order of load instructions.
1617 /// \param PointerOps returned list of pointer operands.
1618 /// \param BestVF return best vector factor, if recursive check found better
1619 /// vectorization sequences rather than masked gather.
1620 /// \param TryRecursiveCheck used to check if long masked gather can be
1621 /// represented as a serie of loads/insert subvector, if profitable.
1624 SmallVectorImpl<Value *> &PointerOps,
1625 unsigned *BestVF = nullptr,
1626 bool TryRecursiveCheck = true) const;
1627
1628 /// Registers non-vectorizable sequence of loads
1629 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1630 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1631 }
1632
1633 /// Checks if the given loads sequence is known as not vectorizable
1634 template <typename T>
1636 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1637 }
1638
1640
1641 /// This structure holds any data we need about the edges being traversed
1642 /// during buildTree_rec(). We keep track of:
1643 /// (i) the user TreeEntry index, and
1644 /// (ii) the index of the edge.
1645 struct EdgeInfo {
1646 EdgeInfo() = default;
1647 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1649 /// The user TreeEntry.
1650 TreeEntry *UserTE = nullptr;
1651 /// The operand index of the use.
1652 unsigned EdgeIdx = UINT_MAX;
1653#ifndef NDEBUG
1655 const BoUpSLP::EdgeInfo &EI) {
1656 EI.dump(OS);
1657 return OS;
1658 }
1659 /// Debug print.
1660 void dump(raw_ostream &OS) const {
1661 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1662 << " EdgeIdx:" << EdgeIdx << "}";
1663 }
1664 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1665#endif
1666 bool operator == (const EdgeInfo &Other) const {
1667 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1668 }
1669 };
1670
1671 /// A helper class used for scoring candidates for two consecutive lanes.
1673 const TargetLibraryInfo &TLI;
1674 const DataLayout &DL;
1675 ScalarEvolution &SE;
1676 const BoUpSLP &R;
1677 int NumLanes; // Total number of lanes (aka vectorization factor).
1678 int MaxLevel; // The maximum recursion depth for accumulating score.
1679
1680 public:
1682 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1683 int MaxLevel)
1684 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1685 MaxLevel(MaxLevel) {}
1686
1687 // The hard-coded scores listed here are not very important, though it shall
1688 // be higher for better matches to improve the resulting cost. When
1689 // computing the scores of matching one sub-tree with another, we are
1690 // basically counting the number of values that are matching. So even if all
1691 // scores are set to 1, we would still get a decent matching result.
1692 // However, sometimes we have to break ties. For example we may have to
1693 // choose between matching loads vs matching opcodes. This is what these
1694 // scores are helping us with: they provide the order of preference. Also,
1695 // this is important if the scalar is externally used or used in another
1696 // tree entry node in the different lane.
1697
1698 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1699 static const int ScoreConsecutiveLoads = 4;
1700 /// The same load multiple times. This should have a better score than
1701 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1702 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1703 /// a vector load and 1.0 for a broadcast.
1704 static const int ScoreSplatLoads = 3;
1705 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1706 static const int ScoreReversedLoads = 3;
1707 /// A load candidate for masked gather.
1708 static const int ScoreMaskedGatherCandidate = 1;
1709 /// ExtractElementInst from same vector and consecutive indexes.
1710 static const int ScoreConsecutiveExtracts = 4;
1711 /// ExtractElementInst from same vector and reversed indices.
1712 static const int ScoreReversedExtracts = 3;
1713 /// Constants.
1714 static const int ScoreConstants = 2;
1715 /// Instructions with the same opcode.
1716 static const int ScoreSameOpcode = 2;
1717 /// Instructions with alt opcodes (e.g, add + sub).
1718 static const int ScoreAltOpcodes = 1;
1719 /// Identical instructions (a.k.a. splat or broadcast).
1720 static const int ScoreSplat = 1;
1721 /// Matching with an undef is preferable to failing.
1722 static const int ScoreUndef = 1;
1723 /// Score for failing to find a decent match.
1724 static const int ScoreFail = 0;
1725 /// Score if all users are vectorized.
1726 static const int ScoreAllUserVectorized = 1;
1727
1728 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1729 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1730 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1731 /// MainAltOps.
1733 ArrayRef<Value *> MainAltOps) const {
1734 if (!isValidElementType(V1->getType()) ||
1735 !isValidElementType(V2->getType()))
1737
1738 if (V1 == V2) {
1739 if (isa<LoadInst>(V1)) {
1740 // Retruns true if the users of V1 and V2 won't need to be extracted.
1741 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1742 // Bail out if we have too many uses to save compilation time.
1743 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1744 return false;
1745
1746 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1747 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1748 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1749 });
1750 };
1751 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1752 };
1753 // A broadcast of a load can be cheaper on some targets.
1754 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1755 ElementCount::getFixed(NumLanes)) &&
1756 ((int)V1->getNumUses() == NumLanes ||
1757 AllUsersAreInternal(V1, V2)))
1759 }
1761 }
1762
1763 auto CheckSameEntryOrFail = [&]() {
1764 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1765 TE1 && TE1 == R.getTreeEntry(V2))
1768 };
1769
1770 auto *LI1 = dyn_cast<LoadInst>(V1);
1771 auto *LI2 = dyn_cast<LoadInst>(V2);
1772 if (LI1 && LI2) {
1773 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1774 !LI2->isSimple())
1775 return CheckSameEntryOrFail();
1776
1777 std::optional<int> Dist = getPointersDiff(
1778 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1779 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1780 if (!Dist || *Dist == 0) {
1781 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1782 getUnderlyingObject(LI2->getPointerOperand()) &&
1783 R.TTI->isLegalMaskedGather(
1784 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1786 return CheckSameEntryOrFail();
1787 }
1788 // The distance is too large - still may be profitable to use masked
1789 // loads/gathers.
1790 if (std::abs(*Dist) > NumLanes / 2)
1792 // This still will detect consecutive loads, but we might have "holes"
1793 // in some cases. It is ok for non-power-2 vectorization and may produce
1794 // better results. It should not affect current vectorization.
1797 }
1798
1799 auto *C1 = dyn_cast<Constant>(V1);
1800 auto *C2 = dyn_cast<Constant>(V2);
1801 if (C1 && C2)
1803
1804 // Extracts from consecutive indexes of the same vector better score as
1805 // the extracts could be optimized away.
1806 Value *EV1;
1807 ConstantInt *Ex1Idx;
1808 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1809 // Undefs are always profitable for extractelements.
1810 // Compiler can easily combine poison and extractelement <non-poison> or
1811 // undef and extractelement <poison>. But combining undef +
1812 // extractelement <non-poison-but-may-produce-poison> requires some
1813 // extra operations.
1814 if (isa<UndefValue>(V2))
1815 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1818 Value *EV2 = nullptr;
1819 ConstantInt *Ex2Idx = nullptr;
1820 if (match(V2,
1822 m_Undef())))) {
1823 // Undefs are always profitable for extractelements.
1824 if (!Ex2Idx)
1826 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1828 if (EV2 == EV1) {
1829 int Idx1 = Ex1Idx->getZExtValue();
1830 int Idx2 = Ex2Idx->getZExtValue();
1831 int Dist = Idx2 - Idx1;
1832 // The distance is too large - still may be profitable to use
1833 // shuffles.
1834 if (std::abs(Dist) == 0)
1836 if (std::abs(Dist) > NumLanes / 2)
1840 }
1842 }
1843 return CheckSameEntryOrFail();
1844 }
1845
1846 auto *I1 = dyn_cast<Instruction>(V1);
1847 auto *I2 = dyn_cast<Instruction>(V2);
1848 if (I1 && I2) {
1849 if (I1->getParent() != I2->getParent())
1850 return CheckSameEntryOrFail();
1851 SmallVector<Value *, 4> Ops(MainAltOps);
1852 Ops.push_back(I1);
1853 Ops.push_back(I2);
1854 InstructionsState S = getSameOpcode(Ops, TLI);
1855 // Note: Only consider instructions with <= 2 operands to avoid
1856 // complexity explosion.
1857 if (S &&
1858 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1859 !S.isAltShuffle()) &&
1860 all_of(Ops, [&S](Value *V) {
1861 return isa<PoisonValue>(V) ||
1862 cast<Instruction>(V)->getNumOperands() ==
1863 S.getMainOp()->getNumOperands();
1864 }))
1865 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1867 }
1868
1869 if (I1 && isa<PoisonValue>(V2))
1871
1872 if (isa<UndefValue>(V2))
1874
1875 return CheckSameEntryOrFail();
1876 }
1877
1878 /// Go through the operands of \p LHS and \p RHS recursively until
1879 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are
1880 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1881 /// of \p U1 and \p U2), except at the beginning of the recursion where
1882 /// these are set to nullptr.
1883 ///
1884 /// For example:
1885 /// \verbatim
1886 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1887 /// \ / \ / \ / \ /
1888 /// + + + +
1889 /// G1 G2 G3 G4
1890 /// \endverbatim
1891 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1892 /// each level recursively, accumulating the score. It starts from matching
1893 /// the additions at level 0, then moves on to the loads (level 1). The
1894 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1895 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1896 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1897 /// Please note that the order of the operands does not matter, as we
1898 /// evaluate the score of all profitable combinations of operands. In
1899 /// other words the score of G1 and G4 is the same as G1 and G2. This
1900 /// heuristic is based on ideas described in:
1901 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1902 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1903 /// Luís F. W. Góes
1905 Instruction *U2, int CurrLevel,
1906 ArrayRef<Value *> MainAltOps) const {
1907
1908 // Get the shallow score of V1 and V2.
1909 int ShallowScoreAtThisLevel =
1910 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1911
1912 // If reached MaxLevel,
1913 // or if V1 and V2 are not instructions,
1914 // or if they are SPLAT,
1915 // or if they are not consecutive,
1916 // or if profitable to vectorize loads or extractelements, early return
1917 // the current cost.
1918 auto *I1 = dyn_cast<Instruction>(LHS);
1919 auto *I2 = dyn_cast<Instruction>(RHS);
1920 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1921 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1922 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1923 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1924 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1925 ShallowScoreAtThisLevel))
1926 return ShallowScoreAtThisLevel;
1927 assert(I1 && I2 && "Should have early exited.");
1928
1929 // Contains the I2 operand indexes that got matched with I1 operands.
1930 SmallSet<unsigned, 4> Op2Used;
1931
1932 // Recursion towards the operands of I1 and I2. We are trying all possible
1933 // operand pairs, and keeping track of the best score.
1934 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1935 OpIdx1 != NumOperands1; ++OpIdx1) {
1936 // Try to pair op1I with the best operand of I2.
1937 int MaxTmpScore = 0;
1938 unsigned MaxOpIdx2 = 0;
1939 bool FoundBest = false;
1940 // If I2 is commutative try all combinations.
1941 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1942 unsigned ToIdx = isCommutative(I2)
1943 ? I2->getNumOperands()
1944 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1945 assert(FromIdx <= ToIdx && "Bad index");
1946 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1947 // Skip operands already paired with OpIdx1.
1948 if (Op2Used.count(OpIdx2))
1949 continue;
1950 // Recursively calculate the cost at each level
1951 int TmpScore =
1952 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1953 I1, I2, CurrLevel + 1, {});
1954 // Look for the best score.
1955 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1956 TmpScore > MaxTmpScore) {
1957 MaxTmpScore = TmpScore;
1958 MaxOpIdx2 = OpIdx2;
1959 FoundBest = true;
1960 }
1961 }
1962 if (FoundBest) {
1963 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1964 Op2Used.insert(MaxOpIdx2);
1965 ShallowScoreAtThisLevel += MaxTmpScore;
1966 }
1967 }
1968 return ShallowScoreAtThisLevel;
1969 }
1970 };
1971 /// A helper data structure to hold the operands of a vector of instructions.
1972 /// This supports a fixed vector length for all operand vectors.
1974 /// For each operand we need (i) the value, and (ii) the opcode that it
1975 /// would be attached to if the expression was in a left-linearized form.
1976 /// This is required to avoid illegal operand reordering.
1977 /// For example:
1978 /// \verbatim
1979 /// 0 Op1
1980 /// |/
1981 /// Op1 Op2 Linearized + Op2
1982 /// \ / ----------> |/
1983 /// - -
1984 ///
1985 /// Op1 - Op2 (0 + Op1) - Op2
1986 /// \endverbatim
1987 ///
1988 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1989 ///
1990 /// Another way to think of this is to track all the operations across the
1991 /// path from the operand all the way to the root of the tree and to
1992 /// calculate the operation that corresponds to this path. For example, the
1993 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1994 /// corresponding operation is a '-' (which matches the one in the
1995 /// linearized tree, as shown above).
1996 ///
1997 /// For lack of a better term, we refer to this operation as Accumulated
1998 /// Path Operation (APO).
1999 struct OperandData {
2000 OperandData() = default;
2001 OperandData(Value *V, bool APO, bool IsUsed)
2002 : V(V), APO(APO), IsUsed(IsUsed) {}
2003 /// The operand value.
2004 Value *V = nullptr;
2005 /// TreeEntries only allow a single opcode, or an alternate sequence of
2006 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2007 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2008 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2009 /// (e.g., Add/Mul)
2010 bool APO = false;
2011 /// Helper data for the reordering function.
2012 bool IsUsed = false;
2013 };
2014
2015 /// During operand reordering, we are trying to select the operand at lane
2016 /// that matches best with the operand at the neighboring lane. Our
2017 /// selection is based on the type of value we are looking for. For example,
2018 /// if the neighboring lane has a load, we need to look for a load that is
2019 /// accessing a consecutive address. These strategies are summarized in the
2020 /// 'ReorderingMode' enumerator.
2021 enum class ReorderingMode {
2022 Load, ///< Matching loads to consecutive memory addresses
2023 Opcode, ///< Matching instructions based on opcode (same or alternate)
2024 Constant, ///< Matching constants
2025 Splat, ///< Matching the same instruction multiple times (broadcast)
2026 Failed, ///< We failed to create a vectorizable group
2027 };
2028
2030
2031 /// A vector of operand vectors.
2033 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2034 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2035 unsigned ArgSize = 0;
2036
2037 const TargetLibraryInfo &TLI;
2038 const DataLayout &DL;
2039 ScalarEvolution &SE;
2040 const BoUpSLP &R;
2041 const Loop *L = nullptr;
2042
2043 /// \returns the operand data at \p OpIdx and \p Lane.
2044 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2045 return OpsVec[OpIdx][Lane];
2046 }
2047
2048 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2049 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2050 return OpsVec[OpIdx][Lane];
2051 }
2052
2053 /// Clears the used flag for all entries.
2054 void clearUsed() {
2055 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2056 OpIdx != NumOperands; ++OpIdx)
2057 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2058 ++Lane)
2059 OpsVec[OpIdx][Lane].IsUsed = false;
2060 }
2061
2062 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2063 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2064 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2065 }
2066
2067 /// \param Lane lane of the operands under analysis.
2068 /// \param OpIdx operand index in \p Lane lane we're looking the best
2069 /// candidate for.
2070 /// \param Idx operand index of the current candidate value.
2071 /// \returns The additional score due to possible broadcasting of the
2072 /// elements in the lane. It is more profitable to have power-of-2 unique
2073 /// elements in the lane, it will be vectorized with higher probability
2074 /// after removing duplicates. Currently the SLP vectorizer supports only
2075 /// vectorization of the power-of-2 number of unique scalars.
2076 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2077 const SmallBitVector &UsedLanes) const {
2078 Value *IdxLaneV = getData(Idx, Lane).V;
2079 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2080 isa<ExtractElementInst>(IdxLaneV))
2081 return 0;
2083 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2084 if (Ln == Lane)
2085 continue;
2086 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2087 if (!isa<Instruction>(OpIdxLnV))
2088 return 0;
2089 Uniques.try_emplace(OpIdxLnV, Ln);
2090 }
2091 unsigned UniquesCount = Uniques.size();
2092 auto IdxIt = Uniques.find(IdxLaneV);
2093 unsigned UniquesCntWithIdxLaneV =
2094 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2095 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2096 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2097 unsigned UniquesCntWithOpIdxLaneV =
2098 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2099 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2100 return 0;
2101 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2102 UniquesCntWithOpIdxLaneV,
2103 UniquesCntWithOpIdxLaneV -
2104 bit_floor(UniquesCntWithOpIdxLaneV)) -
2105 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2106 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2107 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2108 }
2109
2110 /// \param Lane lane of the operands under analysis.
2111 /// \param OpIdx operand index in \p Lane lane we're looking the best
2112 /// candidate for.
2113 /// \param Idx operand index of the current candidate value.
2114 /// \returns The additional score for the scalar which users are all
2115 /// vectorized.
2116 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2117 Value *IdxLaneV = getData(Idx, Lane).V;
2118 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2119 // Do not care about number of uses for vector-like instructions
2120 // (extractelement/extractvalue with constant indices), they are extracts
2121 // themselves and already externally used. Vectorization of such
2122 // instructions does not add extra extractelement instruction, just may
2123 // remove it.
2124 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2125 isVectorLikeInstWithConstOps(OpIdxLaneV))
2127 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2128 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2129 return 0;
2130 return R.areAllUsersVectorized(IdxLaneI)
2132 : 0;
2133 }
2134
2135 /// Score scaling factor for fully compatible instructions but with
2136 /// different number of external uses. Allows better selection of the
2137 /// instructions with less external uses.
2138 static const int ScoreScaleFactor = 10;
2139
2140 /// \Returns the look-ahead score, which tells us how much the sub-trees
2141 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2142 /// score. This helps break ties in an informed way when we cannot decide on
2143 /// the order of the operands by just considering the immediate
2144 /// predecessors.
2145 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2146 int Lane, unsigned OpIdx, unsigned Idx,
2147 bool &IsUsed, const SmallBitVector &UsedLanes) {
2148 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2150 // Keep track of the instruction stack as we recurse into the operands
2151 // during the look-ahead score exploration.
2152 int Score =
2153 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2154 /*CurrLevel=*/1, MainAltOps);
2155 if (Score) {
2156 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2157 if (Score <= -SplatScore) {
2158 // Failed score.
2159 Score = 0;
2160 } else {
2161 Score += SplatScore;
2162 // Scale score to see the difference between different operands
2163 // and similar operands but all vectorized/not all vectorized
2164 // uses. It does not affect actual selection of the best
2165 // compatible operand in general, just allows to select the
2166 // operand with all vectorized uses.
2167 Score *= ScoreScaleFactor;
2168 Score += getExternalUseScore(Lane, OpIdx, Idx);
2169 IsUsed = true;
2170 }
2171 }
2172 return Score;
2173 }
2174
2175 /// Best defined scores per lanes between the passes. Used to choose the
2176 /// best operand (with the highest score) between the passes.
2177 /// The key - {Operand Index, Lane}.
2178 /// The value - the best score between the passes for the lane and the
2179 /// operand.
2181 BestScoresPerLanes;
2182
2183 // Search all operands in Ops[*][Lane] for the one that matches best
2184 // Ops[OpIdx][LastLane] and return its opreand index.
2185 // If no good match can be found, return std::nullopt.
2186 std::optional<unsigned>
2187 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2188 ArrayRef<ReorderingMode> ReorderingModes,
2189 ArrayRef<Value *> MainAltOps,
2190 const SmallBitVector &UsedLanes) {
2191 unsigned NumOperands = getNumOperands();
2192
2193 // The operand of the previous lane at OpIdx.
2194 Value *OpLastLane = getData(OpIdx, LastLane).V;
2195
2196 // Our strategy mode for OpIdx.
2197 ReorderingMode RMode = ReorderingModes[OpIdx];
2198 if (RMode == ReorderingMode::Failed)
2199 return std::nullopt;
2200
2201 // The linearized opcode of the operand at OpIdx, Lane.
2202 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2203
2204 // The best operand index and its score.
2205 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2206 // are using the score to differentiate between the two.
2207 struct BestOpData {
2208 std::optional<unsigned> Idx;
2209 unsigned Score = 0;
2210 } BestOp;
2211 BestOp.Score =
2212 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2213 .first->second;
2214
2215 // Track if the operand must be marked as used. If the operand is set to
2216 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2217 // want to reestimate the operands again on the following iterations).
2218 bool IsUsed = RMode == ReorderingMode::Splat ||
2219 RMode == ReorderingMode::Constant ||
2220 RMode == ReorderingMode::Load;
2221 // Iterate through all unused operands and look for the best.
2222 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2223 // Get the operand at Idx and Lane.
2224 OperandData &OpData = getData(Idx, Lane);
2225 Value *Op = OpData.V;
2226 bool OpAPO = OpData.APO;
2227
2228 // Skip already selected operands.
2229 if (OpData.IsUsed)
2230 continue;
2231
2232 // Skip if we are trying to move the operand to a position with a
2233 // different opcode in the linearized tree form. This would break the
2234 // semantics.
2235 if (OpAPO != OpIdxAPO)
2236 continue;
2237
2238 // Look for an operand that matches the current mode.
2239 switch (RMode) {
2240 case ReorderingMode::Load:
2241 case ReorderingMode::Opcode: {
2242 bool LeftToRight = Lane > LastLane;
2243 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2244 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2245 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2246 OpIdx, Idx, IsUsed, UsedLanes);
2247 if (Score > static_cast<int>(BestOp.Score) ||
2248 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2249 Idx == OpIdx)) {
2250 BestOp.Idx = Idx;
2251 BestOp.Score = Score;
2252 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2253 }
2254 break;
2255 }
2256 case ReorderingMode::Constant:
2257 if (isa<Constant>(Op) ||
2258 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2259 BestOp.Idx = Idx;
2260 if (isa<Constant>(Op)) {
2262 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2264 }
2265 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2266 IsUsed = false;
2267 }
2268 break;
2269 case ReorderingMode::Splat:
2270 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2271 IsUsed = Op == OpLastLane;
2272 if (Op == OpLastLane) {
2273 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2274 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2276 }
2277 BestOp.Idx = Idx;
2278 }
2279 break;
2280 case ReorderingMode::Failed:
2281 llvm_unreachable("Not expected Failed reordering mode.");
2282 }
2283 }
2284
2285 if (BestOp.Idx) {
2286 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2287 return BestOp.Idx;
2288 }
2289 // If we could not find a good match return std::nullopt.
2290 return std::nullopt;
2291 }
2292
2293 /// Helper for reorderOperandVecs.
2294 /// \returns the lane that we should start reordering from. This is the one
2295 /// which has the least number of operands that can freely move about or
2296 /// less profitable because it already has the most optimal set of operands.
2297 unsigned getBestLaneToStartReordering() const {
2298 unsigned Min = UINT_MAX;
2299 unsigned SameOpNumber = 0;
2300 // std::pair<unsigned, unsigned> is used to implement a simple voting
2301 // algorithm and choose the lane with the least number of operands that
2302 // can freely move about or less profitable because it already has the
2303 // most optimal set of operands. The first unsigned is a counter for
2304 // voting, the second unsigned is the counter of lanes with instructions
2305 // with same/alternate opcodes and same parent basic block.
2307 // Try to be closer to the original results, if we have multiple lanes
2308 // with same cost. If 2 lanes have the same cost, use the one with the
2309 // highest index.
2310 for (int I = getNumLanes(); I > 0; --I) {
2311 unsigned Lane = I - 1;
2312 OperandsOrderData NumFreeOpsHash =
2313 getMaxNumOperandsThatCanBeReordered(Lane);
2314 // Compare the number of operands that can move and choose the one with
2315 // the least number.
2316 if (NumFreeOpsHash.NumOfAPOs < Min) {
2317 Min = NumFreeOpsHash.NumOfAPOs;
2318 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2319 HashMap.clear();
2320 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2321 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2322 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2323 // Select the most optimal lane in terms of number of operands that
2324 // should be moved around.
2325 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2326 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2327 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2328 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2329 auto [It, Inserted] =
2330 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2331 if (!Inserted)
2332 ++It->second.first;
2333 }
2334 }
2335 // Select the lane with the minimum counter.
2336 unsigned BestLane = 0;
2337 unsigned CntMin = UINT_MAX;
2338 for (const auto &Data : reverse(HashMap)) {
2339 if (Data.second.first < CntMin) {
2340 CntMin = Data.second.first;
2341 BestLane = Data.second.second;
2342 }
2343 }
2344 return BestLane;
2345 }
2346
2347 /// Data structure that helps to reorder operands.
2348 struct OperandsOrderData {
2349 /// The best number of operands with the same APOs, which can be
2350 /// reordered.
2351 unsigned NumOfAPOs = UINT_MAX;
2352 /// Number of operands with the same/alternate instruction opcode and
2353 /// parent.
2354 unsigned NumOpsWithSameOpcodeParent = 0;
2355 /// Hash for the actual operands ordering.
2356 /// Used to count operands, actually their position id and opcode
2357 /// value. It is used in the voting mechanism to find the lane with the
2358 /// least number of operands that can freely move about or less profitable
2359 /// because it already has the most optimal set of operands. Can be
2360 /// replaced with SmallVector<unsigned> instead but hash code is faster
2361 /// and requires less memory.
2362 unsigned Hash = 0;
2363 };
2364 /// \returns the maximum number of operands that are allowed to be reordered
2365 /// for \p Lane and the number of compatible instructions(with the same
2366 /// parent/opcode). This is used as a heuristic for selecting the first lane
2367 /// to start operand reordering.
2368 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2369 unsigned CntTrue = 0;
2370 unsigned NumOperands = getNumOperands();
2371 // Operands with the same APO can be reordered. We therefore need to count
2372 // how many of them we have for each APO, like this: Cnt[APO] = x.
2373 // Since we only have two APOs, namely true and false, we can avoid using
2374 // a map. Instead we can simply count the number of operands that
2375 // correspond to one of them (in this case the 'true' APO), and calculate
2376 // the other by subtracting it from the total number of operands.
2377 // Operands with the same instruction opcode and parent are more
2378 // profitable since we don't need to move them in many cases, with a high
2379 // probability such lane already can be vectorized effectively.
2380 bool AllUndefs = true;
2381 unsigned NumOpsWithSameOpcodeParent = 0;
2382 Instruction *OpcodeI = nullptr;
2383 BasicBlock *Parent = nullptr;
2384 unsigned Hash = 0;
2385 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2386 const OperandData &OpData = getData(OpIdx, Lane);
2387 if (OpData.APO)
2388 ++CntTrue;
2389 // Use Boyer-Moore majority voting for finding the majority opcode and
2390 // the number of times it occurs.
2391 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2392 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2393 I->getParent() != Parent) {
2394 if (NumOpsWithSameOpcodeParent == 0) {
2395 NumOpsWithSameOpcodeParent = 1;
2396 OpcodeI = I;
2397 Parent = I->getParent();
2398 } else {
2399 --NumOpsWithSameOpcodeParent;
2400 }
2401 } else {
2402 ++NumOpsWithSameOpcodeParent;
2403 }
2404 }
2405 Hash = hash_combine(
2406 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2407 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2408 }
2409 if (AllUndefs)
2410 return {};
2411 OperandsOrderData Data;
2412 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2413 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2414 Data.Hash = Hash;
2415 return Data;
2416 }
2417
2418 /// Go through the instructions in VL and append their operands.
2419 void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
2420 assert(!VL.empty() && "Bad VL");
2421 assert((empty() || VL.size() == getNumLanes()) &&
2422 "Expected same number of lanes");
2423 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2424 // arguments to the intrinsic produces the same result.
2425 constexpr unsigned IntrinsicNumOperands = 2;
2426 unsigned NumOperands = VL0->getNumOperands();
2427 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2428 OpsVec.resize(NumOperands);
2429 unsigned NumLanes = VL.size();
2430 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2431 OpsVec[OpIdx].resize(NumLanes);
2432 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2433 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2434 "Expected instruction or poison value");
2435 // Our tree has just 3 nodes: the root and two operands.
2436 // It is therefore trivial to get the APO. We only need to check the
2437 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2438 // RHS operand. The LHS operand of both add and sub is never attached
2439 // to an inversese operation in the linearized form, therefore its APO
2440 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2441
2442 // Since operand reordering is performed on groups of commutative
2443 // operations or alternating sequences (e.g., +, -), we can safely
2444 // tell the inverse operations by checking commutativity.
2445 if (isa<PoisonValue>(VL[Lane])) {
2446 OpsVec[OpIdx][Lane] = {
2447 PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
2448 false};
2449 continue;
2450 }
2451 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2452 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2453 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2454 APO, false};
2455 }
2456 }
2457 }
2458
2459 /// \returns the number of operands.
2460 unsigned getNumOperands() const { return ArgSize; }
2461
2462 /// \returns the number of lanes.
2463 unsigned getNumLanes() const { return OpsVec[0].size(); }
2464
2465 /// \returns the operand value at \p OpIdx and \p Lane.
2466 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2467 return getData(OpIdx, Lane).V;
2468 }
2469
2470 /// \returns true if the data structure is empty.
2471 bool empty() const { return OpsVec.empty(); }
2472
2473 /// Clears the data.
2474 void clear() { OpsVec.clear(); }
2475
2476 /// \Returns true if there are enough operands identical to \p Op to fill
2477 /// the whole vector (it is mixed with constants or loop invariant values).
2478 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2479 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2480 assert(Op == getValue(OpIdx, Lane) &&
2481 "Op is expected to be getValue(OpIdx, Lane).");
2482 // Small number of loads - try load matching.
2483 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2484 return false;
2485 bool OpAPO = getData(OpIdx, Lane).APO;
2486 bool IsInvariant = L && L->isLoopInvariant(Op);
2487 unsigned Cnt = 0;
2488 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2489 if (Ln == Lane)
2490 continue;
2491 // This is set to true if we found a candidate for broadcast at Lane.
2492 bool FoundCandidate = false;
2493 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2494 OperandData &Data = getData(OpI, Ln);
2495 if (Data.APO != OpAPO || Data.IsUsed)
2496 continue;
2497 Value *OpILane = getValue(OpI, Lane);
2498 bool IsConstantOp = isa<Constant>(OpILane);
2499 // Consider the broadcast candidate if:
2500 // 1. Same value is found in one of the operands.
2501 if (Data.V == Op ||
2502 // 2. The operand in the given lane is not constant but there is a
2503 // constant operand in another lane (which can be moved to the
2504 // given lane). In this case we can represent it as a simple
2505 // permutation of constant and broadcast.
2506 (!IsConstantOp &&
2507 ((Lns > 2 && isa<Constant>(Data.V)) ||
2508 // 2.1. If we have only 2 lanes, need to check that value in the
2509 // next lane does not build same opcode sequence.
2510 (Lns == 2 &&
2511 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2512 isa<Constant>(Data.V)))) ||
2513 // 3. The operand in the current lane is loop invariant (can be
2514 // hoisted out) and another operand is also a loop invariant
2515 // (though not a constant). In this case the whole vector can be
2516 // hoisted out.
2517 // FIXME: need to teach the cost model about this case for better
2518 // estimation.
2519 (IsInvariant && !isa<Constant>(Data.V) &&
2520 !getSameOpcode({Op, Data.V}, TLI) &&
2521 L->isLoopInvariant(Data.V))) {
2522 FoundCandidate = true;
2523 Data.IsUsed = Data.V == Op;
2524 if (Data.V == Op)
2525 ++Cnt;
2526 break;
2527 }
2528 }
2529 if (!FoundCandidate)
2530 return false;
2531 }
2532 return getNumLanes() == 2 || Cnt > 1;
2533 }
2534
2535 /// Checks if there is at least single compatible operand in lanes other
2536 /// than \p Lane, compatible with the operand \p Op.
2537 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2538 assert(Op == getValue(OpIdx, Lane) &&
2539 "Op is expected to be getValue(OpIdx, Lane).");
2540 bool OpAPO = getData(OpIdx, Lane).APO;
2541 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2542 if (Ln == Lane)
2543 continue;
2544 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2545 const OperandData &Data = getData(OpI, Ln);
2546 if (Data.APO != OpAPO || Data.IsUsed)
2547 return true;
2548 Value *OpILn = getValue(OpI, Ln);
2549 return (L && L->isLoopInvariant(OpILn)) ||
2550 (getSameOpcode({Op, OpILn}, TLI) &&
2551 allSameBlock({Op, OpILn}));
2552 }))
2553 return true;
2554 }
2555 return false;
2556 }
2557
2558 public:
2559 /// Initialize with all the operands of the instruction vector \p RootVL.
2561 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2562 L(R.LI->getLoopFor((VL0->getParent()))) {
2563 // Append all the operands of RootVL.
2564 appendOperandsOfVL(RootVL, VL0);
2565 }
2566
2567 /// \Returns a value vector with the operands across all lanes for the
2568 /// opearnd at \p OpIdx.
2569 ValueList getVL(unsigned OpIdx) const {
2570 ValueList OpVL(OpsVec[OpIdx].size());
2571 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2572 "Expected same num of lanes across all operands");
2573 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2574 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2575 return OpVL;
2576 }
2577
2578 // Performs operand reordering for 2 or more operands.
2579 // The original operands are in OrigOps[OpIdx][Lane].
2580 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2581 void reorder() {
2582 unsigned NumOperands = getNumOperands();
2583 unsigned NumLanes = getNumLanes();
2584 // Each operand has its own mode. We are using this mode to help us select
2585 // the instructions for each lane, so that they match best with the ones
2586 // we have selected so far.
2587 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2588
2589 // This is a greedy single-pass algorithm. We are going over each lane
2590 // once and deciding on the best order right away with no back-tracking.
2591 // However, in order to increase its effectiveness, we start with the lane
2592 // that has operands that can move the least. For example, given the
2593 // following lanes:
2594 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2595 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2596 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2597 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2598 // we will start at Lane 1, since the operands of the subtraction cannot
2599 // be reordered. Then we will visit the rest of the lanes in a circular
2600 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2601
2602 // Find the first lane that we will start our search from.
2603 unsigned FirstLane = getBestLaneToStartReordering();
2604
2605 // Initialize the modes.
2606 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2607 Value *OpLane0 = getValue(OpIdx, FirstLane);
2608 // Keep track if we have instructions with all the same opcode on one
2609 // side.
2610 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2611 // Check if OpLane0 should be broadcast.
2612 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2613 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2614 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2615 else if (isa<LoadInst>(OpILane0))
2616 ReorderingModes[OpIdx] = ReorderingMode::Load;
2617 else
2618 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2619 } else if (isa<Constant>(OpLane0)) {
2620 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2621 } else if (isa<Argument>(OpLane0)) {
2622 // Our best hope is a Splat. It may save some cost in some cases.
2623 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2624 } else {
2625 llvm_unreachable("Unexpected value kind.");
2626 }
2627 }
2628
2629 // Check that we don't have same operands. No need to reorder if operands
2630 // are just perfect diamond or shuffled diamond match. Do not do it only
2631 // for possible broadcasts or non-power of 2 number of scalars (just for
2632 // now).
2633 auto &&SkipReordering = [this]() {
2634 SmallPtrSet<Value *, 4> UniqueValues;
2635 ArrayRef<OperandData> Op0 = OpsVec.front();
2636 for (const OperandData &Data : Op0)
2637 UniqueValues.insert(Data.V);
2639 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2640 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2641 return !UniqueValues.contains(Data.V);
2642 }))
2643 return false;
2644 }
2645 // TODO: Check if we can remove a check for non-power-2 number of
2646 // scalars after full support of non-power-2 vectorization.
2647 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2648 };
2649
2650 // If the initial strategy fails for any of the operand indexes, then we
2651 // perform reordering again in a second pass. This helps avoid assigning
2652 // high priority to the failed strategy, and should improve reordering for
2653 // the non-failed operand indexes.
2654 for (int Pass = 0; Pass != 2; ++Pass) {
2655 // Check if no need to reorder operands since they're are perfect or
2656 // shuffled diamond match.
2657 // Need to do it to avoid extra external use cost counting for
2658 // shuffled matches, which may cause regressions.
2659 if (SkipReordering())
2660 break;
2661 // Skip the second pass if the first pass did not fail.
2662 bool StrategyFailed = false;
2663 // Mark all operand data as free to use.
2664 clearUsed();
2665 // We keep the original operand order for the FirstLane, so reorder the
2666 // rest of the lanes. We are visiting the nodes in a circular fashion,
2667 // using FirstLane as the center point and increasing the radius
2668 // distance.
2669 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2670 for (unsigned I = 0; I < NumOperands; ++I)
2671 MainAltOps[I].push_back(getData(I, FirstLane).V);
2672
2673 SmallBitVector UsedLanes(NumLanes);
2674 UsedLanes.set(FirstLane);
2675 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2676 // Visit the lane on the right and then the lane on the left.
2677 for (int Direction : {+1, -1}) {
2678 int Lane = FirstLane + Direction * Distance;
2679 if (Lane < 0 || Lane >= (int)NumLanes)
2680 continue;
2681 UsedLanes.set(Lane);
2682 int LastLane = Lane - Direction;
2683 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2684 "Out of bounds");
2685 // Look for a good match for each operand.
2686 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2687 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2688 std::optional<unsigned> BestIdx =
2689 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2690 MainAltOps[OpIdx], UsedLanes);
2691 // By not selecting a value, we allow the operands that follow to
2692 // select a better matching value. We will get a non-null value in
2693 // the next run of getBestOperand().
2694 if (BestIdx) {
2695 // Swap the current operand with the one returned by
2696 // getBestOperand().
2697 swap(OpIdx, *BestIdx, Lane);
2698 } else {
2699 // Enable the second pass.
2700 StrategyFailed = true;
2701 }
2702 // Try to get the alternate opcode and follow it during analysis.
2703 if (MainAltOps[OpIdx].size() != 2) {
2704 OperandData &AltOp = getData(OpIdx, Lane);
2705 InstructionsState OpS =
2706 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2707 if (OpS && OpS.isAltShuffle())
2708 MainAltOps[OpIdx].push_back(AltOp.V);
2709 }
2710 }
2711 }
2712 }
2713 // Skip second pass if the strategy did not fail.
2714 if (!StrategyFailed)
2715 break;
2716 }
2717 }
2718
2719#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2720 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2721 switch (RMode) {
2722 case ReorderingMode::Load:
2723 return "Load";
2724 case ReorderingMode::Opcode:
2725 return "Opcode";
2726 case ReorderingMode::Constant:
2727 return "Constant";
2728 case ReorderingMode::Splat:
2729 return "Splat";
2730 case ReorderingMode::Failed:
2731 return "Failed";
2732 }
2733 llvm_unreachable("Unimplemented Reordering Type");
2734 }
2735
2736 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2737 raw_ostream &OS) {
2738 return OS << getModeStr(RMode);
2739 }
2740
2741 /// Debug print.
2742 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2743 printMode(RMode, dbgs());
2744 }
2745
2746 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2747 return printMode(RMode, OS);
2748 }
2749
2751 const unsigned Indent = 2;
2752 unsigned Cnt = 0;
2753 for (const OperandDataVec &OpDataVec : OpsVec) {
2754 OS << "Operand " << Cnt++ << "\n";
2755 for (const OperandData &OpData : OpDataVec) {
2756 OS.indent(Indent) << "{";
2757 if (Value *V = OpData.V)
2758 OS << *V;
2759 else
2760 OS << "null";
2761 OS << ", APO:" << OpData.APO << "}\n";
2762 }
2763 OS << "\n";
2764 }
2765 return OS;
2766 }
2767
2768 /// Debug print.
2769 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2770#endif
2771 };
2772
2773 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2774 /// for a pair which have highest score deemed to have best chance to form
2775 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2776 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2777 /// of the cost, considered to be good enough score.
2778 std::optional<int>
2779 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2780 int Limit = LookAheadHeuristics::ScoreFail) const {
2781 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2783 int BestScore = Limit;
2784 std::optional<int> Index;
2785 for (int I : seq<int>(0, Candidates.size())) {
2786 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2787 Candidates[I].second,
2788 /*U1=*/nullptr, /*U2=*/nullptr,
2789 /*CurrLevel=*/1, {});
2790 if (Score > BestScore) {
2791 BestScore = Score;
2792 Index = I;
2793 }
2794 }
2795 return Index;
2796 }
2797
2798 /// Checks if the instruction is marked for deletion.
2799 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2800
2801 /// Removes an instruction from its block and eventually deletes it.
2802 /// It's like Instruction::eraseFromParent() except that the actual deletion
2803 /// is delayed until BoUpSLP is destructed.
2805 DeletedInstructions.insert(I);
2806 }
2807
2808 /// Remove instructions from the parent function and clear the operands of \p
2809 /// DeadVals instructions, marking for deletion trivially dead operands.
2810 template <typename T>
2813 for (T *V : DeadVals) {
2814 auto *I = cast<Instruction>(V);
2815 DeletedInstructions.insert(I);
2816 }
2817 DenseSet<Value *> Processed;
2818 for (T *V : DeadVals) {
2819 if (!V || !Processed.insert(V).second)
2820 continue;
2821 auto *I = cast<Instruction>(V);
2824 if (const TreeEntry *Entry = getTreeEntry(I)) {
2825 Entries.push_back(Entry);
2826 auto It = MultiNodeScalars.find(I);
2827 if (It != MultiNodeScalars.end())
2828 Entries.append(It->second.begin(), It->second.end());
2829 }
2830 for (Use &U : I->operands()) {
2831 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2832 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2834 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2835 return Entry->VectorizedValue == OpI;
2836 })))
2837 DeadInsts.push_back(OpI);
2838 }
2839 I->dropAllReferences();
2840 }
2841 for (T *V : DeadVals) {
2842 auto *I = cast<Instruction>(V);
2843 if (!I->getParent())
2844 continue;
2845 assert((I->use_empty() || all_of(I->uses(),
2846 [&](Use &U) {
2847 return isDeleted(
2848 cast<Instruction>(U.getUser()));
2849 })) &&
2850 "trying to erase instruction with users.");
2851 I->removeFromParent();
2852 SE->forgetValue(I);
2853 }
2854 // Process the dead instruction list until empty.
2855 while (!DeadInsts.empty()) {
2856 Value *V = DeadInsts.pop_back_val();
2857 Instruction *VI = cast_or_null<Instruction>(V);
2858 if (!VI || !VI->getParent())
2859 continue;
2861 "Live instruction found in dead worklist!");
2862 assert(VI->use_empty() && "Instructions with uses are not dead.");
2863
2864 // Don't lose the debug info while deleting the instructions.
2865 salvageDebugInfo(*VI);
2866
2867 // Null out all of the instruction's operands to see if any operand
2868 // becomes dead as we go.
2869 for (Use &OpU : VI->operands()) {
2870 Value *OpV = OpU.get();
2871 if (!OpV)
2872 continue;
2873 OpU.set(nullptr);
2874
2875 if (!OpV->use_empty())
2876 continue;
2877
2878 // If the operand is an instruction that became dead as we nulled out
2879 // the operand, and if it is 'trivially' dead, delete it in a future
2880 // loop iteration.
2881 if (auto *OpI = dyn_cast<Instruction>(OpV))
2882 if (!DeletedInstructions.contains(OpI) &&
2884 DeadInsts.push_back(OpI);
2885 }
2886
2887 VI->removeFromParent();
2888 DeletedInstructions.insert(VI);
2889 SE->forgetValue(VI);
2890 }
2891 }
2892
2893 /// Checks if the instruction was already analyzed for being possible
2894 /// reduction root.
2896 return AnalyzedReductionsRoots.count(I);
2897 }
2898 /// Register given instruction as already analyzed for being possible
2899 /// reduction root.
2901 AnalyzedReductionsRoots.insert(I);
2902 }
2903 /// Checks if the provided list of reduced values was checked already for
2904 /// vectorization.
2906 return AnalyzedReductionVals.contains(hash_value(VL));
2907 }
2908 /// Adds the list of reduced values to list of already checked values for the
2909 /// vectorization.
2911 AnalyzedReductionVals.insert(hash_value(VL));
2912 }
2913 /// Clear the list of the analyzed reduction root instructions.
2915 AnalyzedReductionsRoots.clear();
2916 AnalyzedReductionVals.clear();
2917 AnalyzedMinBWVals.clear();
2918 }
2919 /// Checks if the given value is gathered in one of the nodes.
2920 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2921 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2922 }
2923 /// Checks if the given value is gathered in one of the nodes.
2924 bool isGathered(const Value *V) const {
2925 return MustGather.contains(V);
2926 }
2927 /// Checks if the specified value was not schedule.
2928 bool isNotScheduled(const Value *V) const {
2929 return NonScheduledFirst.contains(V);
2930 }
2931
2932 /// Check if the value is vectorized in the tree.
2933 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2934
2935 ~BoUpSLP();
2936
2937private:
2938 /// Determine if a node \p E in can be demoted to a smaller type with a
2939 /// truncation. We collect the entries that will be demoted in ToDemote.
2940 /// \param E Node for analysis
2941 /// \param ToDemote indices of the nodes to be demoted.
2942 bool collectValuesToDemote(
2943 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2945 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2946 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2947
2948 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2949 /// reordering (i.e. the operands can be reordered because they have only one
2950 /// user and reordarable).
2951 /// \param ReorderableGathers List of all gather nodes that require reordering
2952 /// (e.g., gather of extractlements or partially vectorizable loads).
2953 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2954 /// reordering, subset of \p NonVectorized.
2955 bool
2956 canReorderOperands(TreeEntry *UserTE,
2957 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2958 ArrayRef<TreeEntry *> ReorderableGathers,
2959 SmallVectorImpl<TreeEntry *> &GatherOps);
2960
2961 /// Checks if the given \p TE is a gather node with clustered reused scalars
2962 /// and reorders it per given \p Mask.
2963 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2964
2965 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2966 /// if any. If it is not vectorized (gather node), returns nullptr.
2967 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2968 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2969 TreeEntry *TE = nullptr;
2970 const auto *It = find_if(VL, [&](Value *V) {
2971 TE = getTreeEntry(V);
2972 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2973 return true;
2974 auto It = MultiNodeScalars.find(V);
2975 if (It != MultiNodeScalars.end()) {
2976 for (TreeEntry *E : It->second) {
2977 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2978 TE = E;
2979 return true;
2980 }
2981 }
2982 }
2983 return false;
2984 });
2985 if (It != VL.end()) {
2986 assert(TE->isSame(VL) && "Expected same scalars.");
2987 return TE;
2988 }
2989 return nullptr;
2990 }
2991
2992 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2993 /// if any. If it is not vectorized (gather node), returns nullptr.
2994 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2995 unsigned OpIdx) const {
2996 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2997 const_cast<TreeEntry *>(UserTE), OpIdx);
2998 }
2999
3000 /// Checks if all users of \p I are the part of the vectorization tree.
3001 bool areAllUsersVectorized(
3002 Instruction *I,
3003 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3004
3005 /// Return information about the vector formed for the specified index
3006 /// of a vector of (the same) instruction.
3008
3009 /// \ returns the graph entry for the \p Idx operand of the \p E entry.
3010 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3011
3012 /// Gets the root instruction for the given node. If the node is a strided
3013 /// load/store node with the reverse order, the root instruction is the last
3014 /// one.
3015 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3016
3017 /// \returns Cast context for the given graph node.
3019 getCastContextHint(const TreeEntry &TE) const;
3020
3021 /// \returns the cost of the vectorizable entry.
3022 InstructionCost getEntryCost(const TreeEntry *E,
3023 ArrayRef<Value *> VectorizedVals,
3024 SmallPtrSetImpl<Value *> &CheckedExtracts);
3025
3026 /// This is the recursive part of buildTree.
3027 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3028 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3029
3030 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3031 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3032 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3033 /// returns false, setting \p CurrentOrder to either an empty vector or a
3034 /// non-identity permutation that allows to reuse extract instructions.
3035 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3036 /// extract order.
3037 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3038 SmallVectorImpl<unsigned> &CurrentOrder,
3039 bool ResizeAllowed = false) const;
3040
3041 /// Vectorize a single entry in the tree.
3042 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
3043 /// avoid issues with def-use order.
3044 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3045
3046 /// Returns vectorized operand node, that matches the order of the scalars
3047 /// operand number \p NodeIdx in entry \p E.
3048 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3049 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3050 unsigned NodeIdx) const {
3051 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3052 }
3053
3054 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3055 /// \p E.
3056 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
3057 /// avoid issues with def-use order.
3058 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3059
3060 /// Create a new vector from a list of scalar values. Produces a sequence
3061 /// which exploits values reused across lanes, and arranges the inserts
3062 /// for ease of later optimization.
3063 template <typename BVTy, typename ResTy, typename... Args>
3064 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3065
3066 /// Create a new vector from a list of scalar values. Produces a sequence
3067 /// which exploits values reused across lanes, and arranges the inserts
3068 /// for ease of later optimization.
3069 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3070 bool PostponedPHIs);
3071
3072 /// Returns the instruction in the bundle, which can be used as a base point
3073 /// for scheduling. Usually it is the last instruction in the bundle, except
3074 /// for the case when all operands are external (in this case, it is the first
3075 /// instruction in the list).
3076 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3077
3078 /// Tries to find extractelement instructions with constant indices from fixed
3079 /// vector type and gather such instructions into a bunch, which highly likely
3080 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3081 /// was successful, the matched scalars are replaced by poison values in \p VL
3082 /// for future analysis.
3083 std::optional<TargetTransformInfo::ShuffleKind>
3084 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3085 SmallVectorImpl<int> &Mask) const;
3086
3087 /// Tries to find extractelement instructions with constant indices from fixed
3088 /// vector type and gather such instructions into a bunch, which highly likely
3089 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3090 /// was successful, the matched scalars are replaced by poison values in \p VL
3091 /// for future analysis.
3093 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3095 unsigned NumParts) const;
3096
3097 /// Checks if the gathered \p VL can be represented as a single register
3098 /// shuffle(s) of previous tree entries.
3099 /// \param TE Tree entry checked for permutation.
3100 /// \param VL List of scalars (a subset of the TE scalar), checked for
3101 /// permutations. Must form single-register vector.
3102 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3103 /// commands to build the mask using the original vector value, without
3104 /// relying on the potential reordering.
3105 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3106 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3107 std::optional<TargetTransformInfo::ShuffleKind>
3108 isGatherShuffledSingleRegisterEntry(
3109 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3110 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3111 bool ForOrder);
3112
3113 /// Checks if the gathered \p VL can be represented as multi-register
3114 /// shuffle(s) of previous tree entries.
3115 /// \param TE Tree entry checked for permutation.
3116 /// \param VL List of scalars (a subset of the TE scalar), checked for
3117 /// permutations.
3118 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3119 /// commands to build the mask using the original vector value, without
3120 /// relying on the potential reordering.
3121 /// \returns per-register series of ShuffleKind, if gathered values can be
3122 /// represented as shuffles of previous tree entries. \p Mask is filled with
3123 /// the shuffle mask (also on per-register base).
3125 isGatherShuffledEntry(
3126 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3128 unsigned NumParts, bool ForOrder = false);
3129
3130 /// \returns the cost of gathering (inserting) the values in \p VL into a
3131 /// vector.
3132 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3133 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3134 Type *ScalarTy) const;
3135
3136 /// Set the Builder insert point to one after the last instruction in
3137 /// the bundle
3138 void setInsertPointAfterBundle(const TreeEntry *E);
3139
3140 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3141 /// specified, the starting vector value is poison.
3142 Value *
3143 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3144 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3145
3146 /// \returns whether the VectorizableTree is fully vectorizable and will
3147 /// be beneficial even the tree height is tiny.
3148 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3149
3150 /// Run through the list of all gathered loads in the graph and try to find
3151 /// vector loads/masked gathers instead of regular gathers. Later these loads
3152 /// are reshufled to build final gathered nodes.
3153 void tryToVectorizeGatheredLoads(
3154 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3155 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3156 8> &GatheredLoads);
3157
3158 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3159 /// users of \p TE and collects the stores. It returns the map from the store
3160 /// pointers to the collected stores.
3162 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3163
3164 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3165 /// stores in \p StoresVec can form a vector instruction. If so it returns
3166 /// true and populates \p ReorderIndices with the shuffle indices of the
3167 /// stores when compared to the sorted vector.
3168 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3169 OrdersType &ReorderIndices) const;
3170
3171 /// Iterates through the users of \p TE, looking for scalar stores that can be
3172 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3173 /// their order and builds an order index vector for each store bundle. It
3174 /// returns all these order vectors found.
3175 /// We run this after the tree has formed, otherwise we may come across user
3176 /// instructions that are not yet in the tree.
3178 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3179
3180 /// Tries to reorder the gathering node for better vectorization
3181 /// opportunities.
3182 void reorderGatherNode(TreeEntry &TE);
3183
3184 struct TreeEntry {
3185 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3186 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3187
3188 /// \returns Common mask for reorder indices and reused scalars.
3189 SmallVector<int> getCommonMask() const {
3191 inversePermutation(ReorderIndices, Mask);
3192 ::addMask(Mask, ReuseShuffleIndices);
3193 return Mask;
3194 }
3195
3196 /// \returns true if the scalars in VL are equal to this entry.
3197 bool isSame(ArrayRef<Value *> VL) const {
3198 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3199 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3200 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3201 return VL.size() == Mask.size() &&
3202 std::equal(VL.begin(), VL.end(), Mask.begin(),
3203 [Scalars](Value *V, int Idx) {
3204 return (isa<UndefValue>(V) &&
3205 Idx == PoisonMaskElem) ||
3206 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3207 });
3208 };
3209 if (!ReorderIndices.empty()) {
3210 // TODO: implement matching if the nodes are just reordered, still can
3211 // treat the vector as the same if the list of scalars matches VL
3212 // directly, without reordering.
3214 inversePermutation(ReorderIndices, Mask);
3215 if (VL.size() == Scalars.size())
3216 return IsSame(Scalars, Mask);
3217 if (VL.size() == ReuseShuffleIndices.size()) {
3218 ::addMask(Mask, ReuseShuffleIndices);
3219 return IsSame(Scalars, Mask);
3220 }
3221 return false;
3222 }
3223 return IsSame(Scalars, ReuseShuffleIndices);
3224 }
3225
3226 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3227 return isGather() && !UserTreeIndices.empty() &&
3228 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3229 UserTreeIndices.front().UserTE == UserEI.UserTE;
3230 }
3231
3232 /// \returns true if current entry has same operands as \p TE.
3233 bool hasEqualOperands(const TreeEntry &TE) const {
3234 if (TE.getNumOperands() != getNumOperands())
3235 return false;
3236 SmallBitVector Used(getNumOperands());
3237 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3238 unsigned PrevCount = Used.count();
3239 for (unsigned K = 0; K < E; ++K) {
3240 if (Used.test(K))
3241 continue;
3242 if (getOperand(K) == TE.getOperand(I)) {
3243 Used.set(K);
3244 break;
3245 }
3246 }
3247 // Check if we actually found the matching operand.
3248 if (PrevCount == Used.count())
3249 return false;
3250 }
3251 return true;
3252 }
3253
3254 /// \return Final vectorization factor for the node. Defined by the total
3255 /// number of vectorized scalars, including those, used several times in the
3256 /// entry and counted in the \a ReuseShuffleIndices, if any.
3257 unsigned getVectorFactor() const {
3258 if (!ReuseShuffleIndices.empty())
3259 return ReuseShuffleIndices.size();
3260 return Scalars.size();
3261 };
3262
3263 /// Checks if the current node is a gather node.
3264 bool isGather() const {return State == NeedToGather; }
3265
3266 /// A vector of scalars.
3267 ValueList Scalars;
3268
3269 /// The Scalars are vectorized into this value. It is initialized to Null.
3270 WeakTrackingVH VectorizedValue = nullptr;
3271
3272 /// New vector phi instructions emitted for the vectorized phi nodes.
3273 PHINode *PHI = nullptr;
3274
3275 /// Do we need to gather this sequence or vectorize it
3276 /// (either with vector instruction or with scatter/gather
3277 /// intrinsics for store/load)?
3278 enum EntryState {
3279 Vectorize, ///< The node is regularly vectorized.
3280 ScatterVectorize, ///< Masked scatter/gather node.
3281 StridedVectorize, ///< Strided loads (and stores)
3282 NeedToGather, ///< Gather/buildvector node.
3283 CombinedVectorize, ///< Vectorized node, combined with its user into more
3284 ///< complex node like select/cmp to minmax, mul/add to
3285 ///< fma, etc. Must be used for the following nodes in
3286 ///< the pattern, not the very first one.
3287 };
3288 EntryState State;
3289
3290 /// List of combined opcodes supported by the vectorizer.
3291 enum CombinedOpcode {
3292 NotCombinedOp = -1,
3293 MinMax = Instruction::OtherOpsEnd + 1,
3294 };
3295 CombinedOpcode CombinedOp = NotCombinedOp;
3296
3297 /// Does this sequence require some shuffling?
3298 SmallVector<int, 4> ReuseShuffleIndices;
3299
3300 /// Does this entry require reordering?
3301 SmallVector<unsigned, 4> ReorderIndices;
3302
3303 /// Points back to the VectorizableTree.
3304 ///
3305 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3306 /// to be a pointer and needs to be able to initialize the child iterator.
3307 /// Thus we need a reference back to the container to translate the indices
3308 /// to entries.
3309 VecTreeTy &Container;
3310
3311 /// The TreeEntry index containing the user of this entry. We can actually
3312 /// have multiple users so the data structure is not truly a tree.
3313 SmallVector<EdgeInfo, 1> UserTreeIndices;
3314
3315 /// The index of this treeEntry in VectorizableTree.
3316 unsigned Idx = 0;
3317
3318 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3319 /// other nodes as a series of insertvector instructions.
3320 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3321
3322 private:
3323 /// The operands of each instruction in each lane Operands[op_index][lane].
3324 /// Note: This helps avoid the replication of the code that performs the
3325 /// reordering of operands during buildTree_rec() and vectorizeTree().
3327
3328 /// The main/alternate instruction.
3329 Instruction *MainOp = nullptr;
3330 Instruction *AltOp = nullptr;
3331
3332 /// Interleaving factor for interleaved loads Vectorize nodes.
3333 unsigned InterleaveFactor = 0;
3334
3335 public:
3336 /// Returns interleave factor for interleave nodes.
3337 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3338 /// Sets interleaving factor for the interleaving nodes.
3339 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3340
3341 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3342 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3343 if (Operands.size() < OpIdx + 1)
3344 Operands.resize(OpIdx + 1);
3345 assert(Operands[OpIdx].empty() && "Already resized?");
3346 assert(OpVL.size() <= Scalars.size() &&
3347 "Number of operands is greater than the number of scalars.");
3348 Operands[OpIdx].resize(OpVL.size());
3349 copy(OpVL, Operands[OpIdx].begin());
3350 }
3351
3352 /// Set this bundle's operand from Scalars.
3353 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3354 VLOperands Ops(Scalars, MainOp, R);
3355 if (RequireReorder)
3356 Ops.reorder();
3357 for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
3358 setOperand(I, Ops.getVL(I));
3359 }
3360
3361 /// Reorders operands of the node to the given mask \p Mask.
3362 void reorderOperands(ArrayRef<int> Mask) {
3363 for (ValueList &Operand : Operands)
3364 reorderScalars(Operand, Mask);
3365 }
3366
3367 /// \returns the \p OpIdx operand of this TreeEntry.
3368 ValueList &getOperand(unsigned OpIdx) {
3369 assert(OpIdx < Operands.size() && "Off bounds");
3370 return Operands[OpIdx];
3371 }
3372
3373 /// \returns the \p OpIdx operand of this TreeEntry.
3374 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3375 assert(OpIdx < Operands.size() && "Off bounds");
3376 return Operands[OpIdx];
3377 }
3378
3379 /// \returns the number of operands.
3380 unsigned getNumOperands() const { return Operands.size(); }
3381
3382 /// \return the single \p OpIdx operand.
3383 Value *getSingleOperand(unsigned OpIdx) const {
3384 assert(OpIdx < Operands.size() && "Off bounds");
3385 assert(!Operands[OpIdx].empty() && "No operand available");
3386 return Operands[OpIdx][0];
3387 }
3388
3389 /// Some of the instructions in the list have alternate opcodes.
3390 bool isAltShuffle() const { return MainOp != AltOp; }
3391
3392 bool isOpcodeOrAlt(Instruction *I) const {
3393 unsigned CheckedOpcode = I->getOpcode();
3394 return (getOpcode() == CheckedOpcode ||
3395 getAltOpcode() == CheckedOpcode);
3396 }
3397
3398 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3399 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
3400 /// \p OpValue.
3401 Value *isOneOf(Value *Op) const {
3402 auto *I = dyn_cast<Instruction>(Op);
3403 if (I && isOpcodeOrAlt(I))
3404 return Op;
3405 return MainOp;
3406 }
3407
3408 void setOperations(const InstructionsState &S) {
3409 assert(S && "InstructionsState is invalid.");
3410 MainOp = S.getMainOp();
3411 AltOp = S.getAltOp();
3412 }
3413
3414 Instruction *getMainOp() const {
3415 return MainOp;
3416 }
3417
3418 Instruction *getAltOp() const {
3419 return AltOp;
3420 }
3421
3422 /// The main/alternate opcodes for the list of instructions.
3423 unsigned getOpcode() const {
3424 return MainOp ? MainOp->getOpcode() : 0;
3425 }
3426
3427 unsigned getAltOpcode() const {
3428 return AltOp ? AltOp->getOpcode() : 0;
3429 }
3430
3431 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
3432 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
3433 int findLaneForValue(Value *V) const {
3434 unsigned FoundLane = getVectorFactor();
3435 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3436 std::advance(It, 1)) {
3437 if (*It != V)
3438 continue;
3439 FoundLane = std::distance(Scalars.begin(), It);
3440 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3441 if (!ReorderIndices.empty())
3442 FoundLane = ReorderIndices[FoundLane];
3443 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3444 if (ReuseShuffleIndices.empty())
3445 break;
3446 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3447 RIt != ReuseShuffleIndices.end()) {
3448 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3449 break;
3450 }
3451 }
3452 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3453 return FoundLane;
3454 }
3455
3456 /// Build a shuffle mask for graph entry which represents a merge of main
3457 /// and alternate operations.
3458 void
3459 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3461 SmallVectorImpl<Value *> *OpScalars = nullptr,
3462 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3463
3464 /// Return true if this is a non-power-of-2 node.
3465 bool isNonPowOf2Vec() const {
3466 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3467 return IsNonPowerOf2;
3468 }
3469
3470 /// Return true if this is a node, which tries to vectorize number of
3471 /// elements, forming whole vectors.
3472 bool
3473 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3474 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3475 TTI, getValueType(Scalars.front()), Scalars.size());
3476 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3477 "Reshuffling not supported with non-power-of-2 vectors yet.");
3478 return IsNonPowerOf2;
3479 }
3480
3481 Value *getOrdered(unsigned Idx) const {
3482 assert(isGather() && "Must be used only for buildvectors/gathers.");
3483 if (ReorderIndices.empty())
3484 return Scalars[Idx];
3486 inversePermutation(ReorderIndices, Mask);
3487 return Scalars[Mask[Idx]];
3488 }
3489
3490#ifndef NDEBUG
3491 /// Debug printer.
3492 LLVM_DUMP_METHOD void dump() const {
3493 dbgs() << Idx << ".\n";
3494 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3495 dbgs() << "Operand " << OpI << ":\n";
3496 for (const Value *V : Operands[OpI])
3497 dbgs().indent(2) << *V << "\n";
3498 }
3499 dbgs() << "Scalars: \n";
3500 for (Value *V : Scalars)
3501 dbgs().indent(2) << *V << "\n";
3502 dbgs() << "State: ";
3503 switch (State) {
3504 case Vectorize:
3505 if (InterleaveFactor > 0) {
3506 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3507 << "\n";
3508 } else {
3509 dbgs() << "Vectorize\n";
3510 }
3511 break;
3512 case ScatterVectorize:
3513 dbgs() << "ScatterVectorize\n";
3514 break;
3515 case StridedVectorize:
3516 dbgs() << "StridedVectorize\n";
3517 break;
3518 case NeedToGather:
3519 dbgs() << "NeedToGather\n";
3520 break;
3521 case CombinedVectorize:
3522 dbgs() << "CombinedVectorize\n";
3523 break;
3524 }
3525 dbgs() << "MainOp: ";
3526 if (MainOp)
3527 dbgs() << *MainOp << "\n";
3528 else
3529 dbgs() << "NULL\n";
3530 dbgs() << "AltOp: ";
3531 if (AltOp)
3532 dbgs() << *AltOp << "\n";
3533 else
3534 dbgs() << "NULL\n";
3535 dbgs() << "VectorizedValue: ";
3536 if (VectorizedValue)
3537 dbgs() << *VectorizedValue << "\n";
3538 else
3539 dbgs() << "NULL\n";
3540 dbgs() << "ReuseShuffleIndices: ";
3541 if (ReuseShuffleIndices.empty())
3542 dbgs() << "Empty";
3543 else
3544 for (int ReuseIdx : ReuseShuffleIndices)
3545 dbgs() << ReuseIdx << ", ";
3546 dbgs() << "\n";
3547 dbgs() << "ReorderIndices: ";
3548 for (unsigned ReorderIdx : ReorderIndices)
3549 dbgs() << ReorderIdx << ", ";
3550 dbgs() << "\n";
3551 dbgs() << "UserTreeIndices: ";
3552 for (const auto &EInfo : UserTreeIndices)
3553 dbgs() << EInfo << ", ";
3554 dbgs() << "\n";
3555 if (!CombinedEntriesWithIndices.empty()) {
3556 dbgs() << "Combined entries: ";
3557 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3558 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3559 });
3560 dbgs() << "\n";
3561 }
3562 }
3563#endif
3564 };
3565
3566#ifndef NDEBUG
3567 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3568 InstructionCost VecCost, InstructionCost ScalarCost,
3569 StringRef Banner) const {
3570 dbgs() << "SLP: " << Banner << ":\n";
3571 E->dump();
3572 dbgs() << "SLP: Costs:\n";
3573 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3574 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3575 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3576 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3577 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3578 }
3579#endif
3580
3581 /// Create a new VectorizableTree entry.
3582 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3583 std::optional<ScheduleData *> Bundle,
3584 const InstructionsState &S,
3585 const EdgeInfo &UserTreeIdx,
3586 ArrayRef<int> ReuseShuffleIndices = {},
3587 ArrayRef<unsigned> ReorderIndices = {},
3588 unsigned InterleaveFactor = 0) {
3589 TreeEntry::EntryState EntryState =
3590 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3591 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3592 ReuseShuffleIndices, ReorderIndices);
3593 if (E && InterleaveFactor > 0)
3594 E->setInterleave(InterleaveFactor);
3595 return E;
3596 }
3597
3598 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3599 TreeEntry::EntryState EntryState,
3600 std::optional<ScheduleData *> Bundle,
3601 const InstructionsState &S,
3602 const EdgeInfo &UserTreeIdx,
3603 ArrayRef<int> ReuseShuffleIndices = {},
3604 ArrayRef<unsigned> ReorderIndices = {}) {
3605 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3606 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3607 "Need to vectorize gather entry?");
3608 // Gathered loads still gathered? Do not create entry, use the original one.
3609 if (GatheredLoadsEntriesFirst.has_value() &&
3610 EntryState == TreeEntry::NeedToGather && S &&
3611 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3612 !UserTreeIdx.UserTE)
3613 return nullptr;
3614 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3615 TreeEntry *Last = VectorizableTree.back().get();
3616 Last->Idx = VectorizableTree.size() - 1;
3617 Last->State = EntryState;
3618 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3619 // for non-power-of-two vectors.
3620 assert(
3622 ReuseShuffleIndices.empty()) &&
3623 "Reshuffling scalars not yet supported for nodes with padding");
3624 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3625 ReuseShuffleIndices.end());
3626 if (ReorderIndices.empty()) {
3627 Last->Scalars.assign(VL.begin(), VL.end());
3628 if (S)
3629 Last->setOperations(S);
3630 } else {
3631 // Reorder scalars and build final mask.
3632 Last->Scalars.assign(VL.size(), nullptr);
3633 transform(ReorderIndices, Last->Scalars.begin(),
3634 [VL](unsigned Idx) -> Value * {
3635 if (Idx >= VL.size())
3636 return UndefValue::get(VL.front()->getType());
3637 return VL[Idx];
3638 });
3639 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3640 if (S)
3641 Last->setOperations(S);
3642 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3643 }
3644 if (!Last->isGather()) {
3645 for (Value *V : VL) {
3646 const TreeEntry *TE = getTreeEntry(V);
3647 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3648 "Scalar already in tree!");
3649 if (TE) {
3650 if (TE != Last)
3651 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3652 continue;
3653 }
3654 ScalarToTreeEntry[V] = Last;
3655 }
3656 // Update the scheduler bundle to point to this TreeEntry.
3657 ScheduleData *BundleMember = *Bundle;
3658 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3659 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3660 doesNotNeedToSchedule(VL)) &&
3661 "Bundle and VL out of sync");
3662 if (BundleMember) {
3663 for (Value *V : VL) {
3665 continue;
3666 if (!BundleMember)
3667 continue;
3668 BundleMember->TE = Last;
3669 BundleMember = BundleMember->NextInBundle;
3670 }
3671 }
3672 assert(!BundleMember && "Bundle and VL out of sync");
3673 } else {
3674 // Build a map for gathered scalars to the nodes where they are used.
3675 bool AllConstsOrCasts = true;
3676 for (Value *V : VL)
3677 if (!isConstant(V)) {
3678 auto *I = dyn_cast<CastInst>(V);
3679 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3680 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3681 !UserTreeIdx.UserTE->isGather())
3682 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3683 }
3684 if (AllConstsOrCasts)
3685 CastMaxMinBWSizes =
3686 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3687 MustGather.insert(VL.begin(), VL.end());
3688 }
3689
3690 if (UserTreeIdx.UserTE)
3691 Last->UserTreeIndices.push_back(UserTreeIdx);
3692 return Last;
3693 }
3694
3695 /// -- Vectorization State --
3696 /// Holds all of the tree entries.
3697 TreeEntry::VecTreeTy VectorizableTree;
3698
3699#ifndef NDEBUG
3700 /// Debug printer.
3701 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3702 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3703 VectorizableTree[Id]->dump();
3704 dbgs() << "\n";
3705 }
3706 }
3707#endif
3708
3709 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3710
3711 const TreeEntry *getTreeEntry(Value *V) const {
3712 return ScalarToTreeEntry.lookup(V);
3713 }
3714
3715 /// Check that the operand node of alternate node does not generate
3716 /// buildvector sequence. If it is, then probably not worth it to build
3717 /// alternate shuffle, if number of buildvector operands + alternate
3718 /// instruction > than the number of buildvector instructions.
3719 /// \param S the instructions state of the analyzed values.
3720 /// \param VL list of the instructions with alternate opcodes.
3721 bool areAltOperandsProfitable(const InstructionsState &S,
3722 ArrayRef<Value *> VL) const;
3723
3724 /// Checks if the specified list of the instructions/values can be vectorized
3725 /// and fills required data before actual scheduling of the instructions.
3726 TreeEntry::EntryState
3727 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3728 bool IsScatterVectorizeUserTE,
3729 OrdersType &CurrentOrder,
3730 SmallVectorImpl<Value *> &PointerOps);
3731
3732 /// Maps a specific scalar to its tree entry.
3733 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3734
3735 /// List of scalars, used in several vectorize nodes, and the list of the
3736 /// nodes.
3738
3739 /// Maps a value to the proposed vectorizable size.
3740 SmallDenseMap<Value *, unsigned> InstrElementSize;
3741
3742 /// A list of scalars that we found that we need to keep as scalars.
3743 ValueSet MustGather;
3744
3745 /// A set of first non-schedulable values.
3746 ValueSet NonScheduledFirst;
3747
3748 /// A map between the vectorized entries and the last instructions in the
3749 /// bundles. The bundles are built in use order, not in the def order of the
3750 /// instructions. So, we cannot rely directly on the last instruction in the
3751 /// bundle being the last instruction in the program order during
3752 /// vectorization process since the basic blocks are affected, need to
3753 /// pre-gather them before.
3754 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3755
3756 /// List of gather nodes, depending on other gather/vector nodes, which should
3757 /// be emitted after the vector instruction emission process to correctly
3758 /// handle order of the vector instructions and shuffles.
3759 SetVector<const TreeEntry *> PostponedGathers;
3760
3761 using ValueToGatherNodesMap =
3763 ValueToGatherNodesMap ValueToGatherNodes;
3764
3765 /// A list of the load entries (node indices), which can be vectorized using
3766 /// strided or masked gather approach, but attempted to be represented as
3767 /// contiguous loads.
3768 SetVector<unsigned> LoadEntriesToVectorize;
3769
3770 /// true if graph nodes transforming mode is on.
3771 bool IsGraphTransformMode = false;
3772
3773 /// The index of the first gathered load entry in the VectorizeTree.
3774 std::optional<unsigned> GatheredLoadsEntriesFirst;
3775
3776 /// This POD struct describes one external user in the vectorized tree.
3777 struct ExternalUser {
3778 ExternalUser(Value *S, llvm::User *U, int L)
3779 : Scalar(S), User(U), Lane(L) {}
3780
3781 // Which scalar in our function.
3782 Value *Scalar;
3783
3784 // Which user that uses the scalar.
3786
3787 // Which lane does the scalar belong to.
3788 int Lane;
3789 };
3790 using UserList = SmallVector<ExternalUser, 16>;
3791
3792 /// Checks if two instructions may access the same memory.
3793 ///
3794 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3795 /// is invariant in the calling loop.
3796 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3797 Instruction *Inst2) {
3798 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3799 return true;
3800 // First check if the result is already in the cache.
3801 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3802 auto It = AliasCache.find(Key);
3803 if (It != AliasCache.end())
3804 return It->second;
3805 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3806 // Store the result in the cache.
3807 AliasCache.try_emplace(Key, Aliased);
3808 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3809 return Aliased;
3810 }
3811
3812 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3813
3814 /// Cache for alias results.
3815 /// TODO: consider moving this to the AliasAnalysis itself.
3817
3818 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3819 // globally through SLP because we don't perform any action which
3820 // invalidates capture results.
3821 BatchAAResults BatchAA;
3822
3823 /// Temporary store for deleted instructions. Instructions will be deleted
3824 /// eventually when the BoUpSLP is destructed. The deferral is required to
3825 /// ensure that there are no incorrect collisions in the AliasCache, which
3826 /// can happen if a new instruction is allocated at the same address as a
3827 /// previously deleted instruction.
3828 DenseSet<Instruction *> DeletedInstructions;
3829
3830 /// Set of the instruction, being analyzed already for reductions.
3831 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3832
3833 /// Set of hashes for the list of reduction values already being analyzed.
3834 DenseSet<size_t> AnalyzedReductionVals;
3835
3836 /// Values, already been analyzed for mininmal bitwidth and found to be
3837 /// non-profitable.
3838 DenseSet<Value *> AnalyzedMinBWVals;
3839
3840 /// A list of values that need to extracted out of the tree.
3841 /// This list holds pairs of (Internal Scalar : External User). External User
3842 /// can be nullptr, it means that this Internal Scalar will be used later,
3843 /// after vectorization.
3844 UserList ExternalUses;
3845
3846 /// A list of GEPs which can be reaplced by scalar GEPs instead of
3847 /// extractelement instructions.
3848 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3849
3850 /// Values used only by @llvm.assume calls.
3852
3853 /// Holds all of the instructions that we gathered, shuffle instructions and
3854 /// extractelements.
3855 SetVector<Instruction *> GatherShuffleExtractSeq;
3856
3857 /// A list of blocks that we are going to CSE.
3858 DenseSet<BasicBlock *> CSEBlocks;
3859
3860 /// List of hashes of vector of loads, which are known to be non vectorizable.
3861 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3862
3863 /// Contains all scheduling relevant data for an instruction.
3864 /// A ScheduleData either represents a single instruction or a member of an
3865 /// instruction bundle (= a group of instructions which is combined into a
3866 /// vector instruction).
3867 struct ScheduleData {
3868 // The initial value for the dependency counters. It means that the
3869 // dependencies are not calculated yet.
3870 enum { InvalidDeps = -1 };
3871
3872 ScheduleData() = default;
3873
3874 void init(int BlockSchedulingRegionID, Instruction *I) {
3875 FirstInBundle = this;
3876 NextInBundle = nullptr;
3877 NextLoadStore = nullptr;
3878 IsScheduled = false;
3879 SchedulingRegionID = BlockSchedulingRegionID;
3880 clearDependencies();
3881 Inst = I;
3882 TE = nullptr;
3883 }
3884
3885 /// Verify basic self consistency properties
3886 void verify() {
3887 if (hasValidDependencies()) {
3888 assert(UnscheduledDeps <= Dependencies && "invariant");
3889 } else {
3890 assert(UnscheduledDeps == Dependencies && "invariant");
3891 }
3892
3893 if (IsScheduled) {
3894 assert(isSchedulingEntity() &&
3895 "unexpected scheduled state");
3896 for (const ScheduleData *BundleMember = this; BundleMember;
3897 BundleMember = BundleMember->NextInBundle) {
3898 assert(BundleMember->hasValidDependencies() &&
3899 BundleMember->UnscheduledDeps == 0 &&
3900 "unexpected scheduled state");
3901 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3902 "only bundle is marked scheduled");
3903 }
3904 }
3905
3906 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3907 "all bundle members must be in same basic block");
3908 }
3909
3910 /// Returns true if the dependency information has been calculated.
3911 /// Note that depenendency validity can vary between instructions within
3912 /// a single bundle.
3913 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3914
3915 /// Returns true for single instructions and for bundle representatives
3916 /// (= the head of a bundle).
3917 bool isSchedulingEntity() const { return FirstInBundle == this; }
3918
3919 /// Returns true if it represents an instruction bundle and not only a
3920 /// single instruction.
3921 bool isPartOfBundle() const {
3922 return NextInBundle != nullptr || FirstInBundle != this || TE;
3923 }
3924
3925 /// Returns true if it is ready for scheduling, i.e. it has no more
3926 /// unscheduled depending instructions/bundles.
3927 bool isReady() const {
3928 assert(isSchedulingEntity() &&
3929 "can't consider non-scheduling entity for ready list");
3930 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3931 }
3932
3933 /// Modifies the number of unscheduled dependencies for this instruction,
3934 /// and returns the number of remaining dependencies for the containing
3935 /// bundle.
3936 int incrementUnscheduledDeps(int Incr) {
3937 assert(hasValidDependencies() &&
3938 "increment of unscheduled deps would be meaningless");
3939 UnscheduledDeps += Incr;
3940 return FirstInBundle->unscheduledDepsInBundle();
3941 }
3942
3943 /// Sets the number of unscheduled dependencies to the number of
3944 /// dependencies.
3945 void resetUnscheduledDeps() {
3946 UnscheduledDeps = Dependencies;
3947 }
3948
3949 /// Clears all dependency information.
3950 void clearDependencies() {
3951 Dependencies = InvalidDeps;
3952 resetUnscheduledDeps();
3953 MemoryDependencies.clear();
3954 ControlDependencies.clear();
3955 }
3956
3957 int unscheduledDepsInBundle() const {
3958 assert(isSchedulingEntity() && "only meaningful on the bundle");
3959 int Sum = 0;
3960 for (const ScheduleData *BundleMember = this; BundleMember;
3961 BundleMember = BundleMember->NextInBundle) {
3962 if (BundleMember->UnscheduledDeps == InvalidDeps)
3963 return InvalidDeps;
3964 Sum += BundleMember->UnscheduledDeps;
3965 }
3966 return Sum;
3967 }
3968
3969 void dump(raw_ostream &os) const {
3970 if (!isSchedulingEntity()) {
3971 os << "/ " << *Inst;
3972 } else if (NextInBundle) {
3973 os << '[' << *Inst;
3974 ScheduleData *SD = NextInBundle;
3975 while (SD) {
3976 os << ';' << *SD->Inst;
3977 SD = SD->NextInBundle;
3978 }
3979 os << ']';
3980 } else {
3981 os << *Inst;
3982 }
3983 }
3984
3985 Instruction *Inst = nullptr;
3986
3987 /// The TreeEntry that this instruction corresponds to.
3988 TreeEntry *TE = nullptr;
3989
3990 /// Points to the head in an instruction bundle (and always to this for
3991 /// single instructions).
3992 ScheduleData *FirstInBundle = nullptr;
3993
3994 /// Single linked list of all instructions in a bundle. Null if it is a
3995 /// single instruction.
3996 ScheduleData *NextInBundle = nullptr;
3997
3998 /// Single linked list of all memory instructions (e.g. load, store, call)
3999 /// in the block - until the end of the scheduling region.
4000 ScheduleData *NextLoadStore = nullptr;
4001
4002 /// The dependent memory instructions.
4003 /// This list is derived on demand in calculateDependencies().
4004 SmallVector<ScheduleData *, 4> MemoryDependencies;
4005
4006 /// List of instructions which this instruction could be control dependent
4007 /// on. Allowing such nodes to be scheduled below this one could introduce
4008 /// a runtime fault which didn't exist in the original program.
4009 /// ex: this is a load or udiv following a readonly call which inf loops
4010 SmallVector<ScheduleData *, 4> ControlDependencies;
4011
4012 /// This ScheduleData is in the current scheduling region if this matches
4013 /// the current SchedulingRegionID of BlockScheduling.
4014 int SchedulingRegionID = 0;
4015
4016 /// Used for getting a "good" final ordering of instructions.
4017 int SchedulingPriority = 0;
4018
4019 /// The number of dependencies. Constitutes of the number of users of the
4020 /// instruction plus the number of dependent memory instructions (if any).
4021 /// This value is calculated on demand.
4022 /// If InvalidDeps, the number of dependencies is not calculated yet.
4023 int Dependencies = InvalidDeps;
4024
4025 /// The number of dependencies minus the number of dependencies of scheduled
4026 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4027 /// for scheduling.
4028 /// Note that this is negative as long as Dependencies is not calculated.
4029 int UnscheduledDeps = InvalidDeps;
4030
4031 /// True if this instruction is scheduled (or considered as scheduled in the
4032 /// dry-run).
4033 bool IsScheduled = false;
4034 };
4035
4036#ifndef NDEBUG
4038 const BoUpSLP::ScheduleData &SD) {
4039 SD.dump(os);
4040 return os;
4041 }
4042#endif
4043
4044 friend struct GraphTraits<BoUpSLP *>;
4045 friend struct DOTGraphTraits<BoUpSLP *>;
4046
4047 /// Contains all scheduling data for a basic block.
4048 /// It does not schedules instructions, which are not memory read/write
4049 /// instructions and their operands are either constants, or arguments, or
4050 /// phis, or instructions from others blocks, or their users are phis or from
4051 /// the other blocks. The resulting vector instructions can be placed at the
4052 /// beginning of the basic block without scheduling (if operands does not need
4053 /// to be scheduled) or at the end of the block (if users are outside of the
4054 /// block). It allows to save some compile time and memory used by the
4055 /// compiler.
4056 /// ScheduleData is assigned for each instruction in between the boundaries of
4057 /// the tree entry, even for those, which are not part of the graph. It is
4058 /// required to correctly follow the dependencies between the instructions and
4059 /// their correct scheduling. The ScheduleData is not allocated for the
4060 /// instructions, which do not require scheduling, like phis, nodes with
4061 /// extractelements/insertelements only or nodes with instructions, with
4062 /// uses/operands outside of the block.
4063 struct BlockScheduling {
4064 BlockScheduling(BasicBlock *BB)
4065 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4066
4067 void clear() {
4068 ReadyInsts.clear();
4069 ScheduleStart = nullptr;
4070 ScheduleEnd = nullptr;
4071 FirstLoadStoreInRegion = nullptr;
4072 LastLoadStoreInRegion = nullptr;
4073 RegionHasStackSave = false;
4074
4075 // Reduce the maximum schedule region size by the size of the
4076 // previous scheduling run.
4077 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4078 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4079 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4080 ScheduleRegionSize = 0;
4081
4082 // Make a new scheduling region, i.e. all existing ScheduleData is not
4083 // in the new region yet.
4084 ++SchedulingRegionID;
4085 }
4086
4087 ScheduleData *getScheduleData(Instruction *I) {
4088 if (BB != I->getParent())
4089 // Avoid lookup if can't possibly be in map.
4090 return nullptr;
4091 ScheduleData *SD = ScheduleDataMap.lookup(I);
4092 if (SD && isInSchedulingRegion(SD))
4093 return SD;
4094 return nullptr;
4095 }
4096
4097 ScheduleData *getScheduleData(Value *V) {
4098 if (auto *I = dyn_cast<Instruction>(V))
4099 return getScheduleData(I);
4100 return nullptr;
4101 }
4102
4103 bool isInSchedulingRegion(ScheduleData *SD) const {
4104 return SD->SchedulingRegionID == SchedulingRegionID;
4105 }
4106
4107 /// Marks an instruction as scheduled and puts all dependent ready
4108 /// instructions into the ready-list.
4109 template <typename ReadyListType>
4110 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4111 SD->IsScheduled = true;
4112 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4113
4114 for (ScheduleData *BundleMember = SD; BundleMember;
4115 BundleMember = BundleMember->NextInBundle) {
4116
4117 // Handle the def-use chain dependencies.
4118
4119 // Decrement the unscheduled counter and insert to ready list if ready.
4120 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4121 ScheduleData *OpDef = getScheduleData(I);
4122 if (OpDef && OpDef->hasValidDependencies() &&
4123 OpDef->incrementUnscheduledDeps(-1) == 0) {
4124 // There are no more unscheduled dependencies after
4125 // decrementing, so we can put the dependent instruction
4126 // into the ready list.
4127 ScheduleData *DepBundle = OpDef->FirstInBundle;
4128 assert(!DepBundle->IsScheduled &&
4129 "already scheduled bundle gets ready");
4130 ReadyList.insert(DepBundle);
4132 << "SLP: gets ready (def): " << *DepBundle << "\n");
4133 }
4134 };
4135
4136 // If BundleMember is a vector bundle, its operands may have been
4137 // reordered during buildTree(). We therefore need to get its operands
4138 // through the TreeEntry.
4139 if (TreeEntry *TE = BundleMember->TE) {
4140 // Need to search for the lane since the tree entry can be reordered.
4141 int Lane = std::distance(TE->Scalars.begin(),
4142 find(TE->Scalars, BundleMember->Inst));
4143 assert(Lane >= 0 && "Lane not set");
4144
4145 // Since vectorization tree is being built recursively this assertion
4146 // ensures that the tree entry has all operands set before reaching
4147 // this code. Couple of exceptions known at the moment are extracts
4148 // where their second (immediate) operand is not added. Since
4149 // immediates do not affect scheduler behavior this is considered
4150 // okay.
4151 auto *In = BundleMember->Inst;
4152 assert(
4153 In &&
4154 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4155 In->getNumOperands() == TE->getNumOperands()) &&
4156 "Missed TreeEntry operands?");
4157 (void)In; // fake use to avoid build failure when assertions disabled
4158
4159 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4160 OpIdx != NumOperands; ++OpIdx)
4161 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4162 DecrUnsched(I);
4163 } else {
4164 // If BundleMember is a stand-alone instruction, no operand reordering
4165 // has taken place, so we directly access its operands.
4166 for (Use &U : BundleMember->Inst->operands())
4167 if (auto *I = dyn_cast<Instruction>(U.get()))
4168 DecrUnsched(I);
4169 }
4170 // Handle the memory dependencies.
4171 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4172 if (MemoryDepSD->hasValidDependencies() &&
4173 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4174 // There are no more unscheduled dependencies after decrementing,
4175 // so we can put the dependent instruction into the ready list.
4176 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4177 assert(!DepBundle->IsScheduled &&
4178 "already scheduled bundle gets ready");
4179 ReadyList.insert(DepBundle);
4181 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4182 }
4183 }
4184 // Handle the control dependencies.
4185 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4186 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4187 // There are no more unscheduled dependencies after decrementing,
4188 // so we can put the dependent instruction into the ready list.
4189 ScheduleData *DepBundle = DepSD->FirstInBundle;
4190 assert(!DepBundle->IsScheduled &&
4191 "already scheduled bundle gets ready");
4192 ReadyList.insert(DepBundle);
4194 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4195 }
4196 }
4197 }
4198 }
4199
4200 /// Verify basic self consistency properties of the data structure.
4201 void verify() {
4202 if (!ScheduleStart)
4203 return;
4204
4205 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4206 ScheduleStart->comesBefore(ScheduleEnd) &&
4207 "Not a valid scheduling region?");
4208
4209 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4210 auto *SD = getScheduleData(I);
4211 if (!SD)
4212 continue;
4213 assert(isInSchedulingRegion(SD) &&
4214 "primary schedule data not in window?");
4215 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4216 "entire bundle in window!");
4217 SD->verify();
4218 }
4219
4220 for (auto *SD : ReadyInsts) {
4221 assert(SD->isSchedulingEntity() && SD->isReady() &&
4222 "item in ready list not ready?");
4223 (void)SD;
4224 }
4225 }
4226
4227 /// Put all instructions into the ReadyList which are ready for scheduling.
4228 template <typename ReadyListType>
4229 void initialFillReadyList(ReadyListType &ReadyList) {
4230 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4231 ScheduleData *SD = getScheduleData(I);
4232 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4233 SD->isReady()) {
4234 ReadyList.insert(SD);
4236 << "SLP: initially in ready list: " << *SD << "\n");
4237 }
4238 }
4239 }
4240
4241 /// Build a bundle from the ScheduleData nodes corresponding to the
4242 /// scalar instruction for each lane.
4243 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4244
4245 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4246 /// cyclic dependencies. This is only a dry-run, no instructions are
4247 /// actually moved at this stage.
4248 /// \returns the scheduling bundle. The returned Optional value is not
4249 /// std::nullopt if \p VL is allowed to be scheduled.
4250 std::optional<ScheduleData *>
4251 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4252 const InstructionsState &S);
4253
4254 /// Un-bundles a group of instructions.
4255 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4256
4257 /// Allocates schedule data chunk.
4258 ScheduleData *allocateScheduleDataChunks();
4259
4260 /// Extends the scheduling region so that V is inside the region.
4261 /// \returns true if the region size is within the limit.
4262 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4263
4264 /// Initialize the ScheduleData structures for new instructions in the
4265 /// scheduling region.
4266 void initScheduleData(Instruction *FromI, Instruction *ToI,
4267 ScheduleData *PrevLoadStore,
4268 ScheduleData *NextLoadStore);
4269
4270 /// Updates the dependency information of a bundle and of all instructions/
4271 /// bundles which depend on the original bundle.
4272 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4273 BoUpSLP *SLP);
4274
4275 /// Sets all instruction in the scheduling region to un-scheduled.
4276 void resetSchedule();
4277
4278 BasicBlock *BB;
4279
4280 /// Simple memory allocation for ScheduleData.
4282
4283 /// The size of a ScheduleData array in ScheduleDataChunks.
4284 int ChunkSize;
4285
4286 /// The allocator position in the current chunk, which is the last entry
4287 /// of ScheduleDataChunks.
4288 int ChunkPos;
4289
4290 /// Attaches ScheduleData to Instruction.
4291 /// Note that the mapping survives during all vectorization iterations, i.e.
4292 /// ScheduleData structures are recycled.
4294
4295 /// The ready-list for scheduling (only used for the dry-run).
4296 SetVector<ScheduleData *> ReadyInsts;
4297
4298 /// The first instruction of the scheduling region.
4299 Instruction *ScheduleStart = nullptr;
4300
4301 /// The first instruction _after_ the scheduling region.
4302 Instruction *ScheduleEnd = nullptr;
4303
4304 /// The first memory accessing instruction in the scheduling region
4305 /// (can be null).
4306 ScheduleData *FirstLoadStoreInRegion = nullptr;
4307
4308 /// The last memory accessing instruction in the scheduling region
4309 /// (can be null).
4310 ScheduleData *LastLoadStoreInRegion = nullptr;
4311
4312 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4313 /// region? Used to optimize the dependence calculation for the
4314 /// common case where there isn't.
4315 bool RegionHasStackSave = false;
4316
4317 /// The current size of the scheduling region.
4318 int ScheduleRegionSize = 0;
4319
4320 /// The maximum size allowed for the scheduling region.
4321 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4322
4323 /// The ID of the scheduling region. For a new vectorization iteration this
4324 /// is incremented which "removes" all ScheduleData from the region.
4325 /// Make sure that the initial SchedulingRegionID is greater than the
4326 /// initial SchedulingRegionID in ScheduleData (which is 0).
4327 int SchedulingRegionID = 1;
4328 };
4329
4330 /// Attaches the BlockScheduling structures to basic blocks.
4332
4333 /// Performs the "real" scheduling. Done before vectorization is actually
4334 /// performed in a basic block.
4335 void scheduleBlock(BlockScheduling *BS);
4336
4337 /// List of users to ignore during scheduling and that don't need extracting.
4338 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4339
4340 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4341 /// sorted SmallVectors of unsigned.
4342 struct OrdersTypeDenseMapInfo {
4343 static OrdersType getEmptyKey() {
4344 OrdersType V;
4345 V.push_back(~1U);
4346 return V;
4347 }
4348
4349 static OrdersType getTombstoneKey() {
4350 OrdersType V;
4351 V.push_back(~2U);
4352 return V;
4353 }
4354
4355 static unsigned getHashValue(const OrdersType &V) {
4356 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4357 }
4358
4359 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4360 return LHS == RHS;
4361 }
4362 };
4363
4364 // Analysis and block reference.
4365 Function *F;
4366 ScalarEvolution *SE;
4368 TargetLibraryInfo *TLI;
4369 LoopInfo *LI;
4370 DominatorTree *DT;
4371 AssumptionCache *AC;
4372 DemandedBits *DB;
4373 const DataLayout *DL;
4375
4376 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4377 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4378
4379 /// Instruction builder to construct the vectorized tree.
4381
4382 /// A map of scalar integer values to the smallest bit width with which they
4383 /// can legally be represented. The values map to (width, signed) pairs,
4384 /// where "width" indicates the minimum bit width and "signed" is True if the
4385 /// value must be signed-extended, rather than zero-extended, back to its
4386 /// original width.
4388
4389 /// Final size of the reduced vector, if the current graph represents the
4390 /// input for the reduction and it was possible to narrow the size of the
4391 /// reduction.
4392 unsigned ReductionBitWidth = 0;
4393
4394 /// Canonical graph size before the transformations.
4395 unsigned BaseGraphSize = 1;
4396
4397 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4398 /// type sizes, used in the tree.
4399 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4400
4401 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4402 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4403 DenseSet<unsigned> ExtraBitWidthNodes;
4404};
4405
4406} // end namespace slpvectorizer
4407
4408template <> struct GraphTraits<BoUpSLP *> {
4409 using TreeEntry = BoUpSLP::TreeEntry;
4410
4411 /// NodeRef has to be a pointer per the GraphWriter.
4413
4415
4416 /// Add the VectorizableTree to the index iterator to be able to return
4417 /// TreeEntry pointers.
4418 struct ChildIteratorType
4419 : public iterator_adaptor_base<
4420 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4422
4424 ContainerTy &VT)
4425 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4426
4427 NodeRef operator*() { return I->UserTE; }
4428 };
4429
4431 return R.VectorizableTree[0].get();
4432 }
4433
4434 static ChildIteratorType child_begin(NodeRef N) {
4435 return {N->UserTreeIndices.begin(), N->Container};
4436 }
4437
4438 static ChildIteratorType child_end(NodeRef N) {
4439 return {N->UserTreeIndices.end(), N->Container};
4440 }
4441
4442 /// For the node iterator we just need to turn the TreeEntry iterator into a
4443 /// TreeEntry* iterator so that it dereferences to NodeRef.
4444 class nodes_iterator {
4446 ItTy It;
4447
4448 public:
4449 nodes_iterator(const ItTy &It2) : It(It2) {}
4450 NodeRef operator*() { return It->get(); }
4451 nodes_iterator operator++() {
4452 ++It;
4453 return *this;
4454 }
4455 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4456 };
4457
4458 static nodes_iterator nodes_begin(BoUpSLP *R) {
4459 return nodes_iterator(R->VectorizableTree.begin());
4460 }
4461
4462 static nodes_iterator nodes_end(BoUpSLP *R) {
4463 return nodes_iterator(R->VectorizableTree.end());
4464 }
4465
4466 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4467};
4468
4469template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4470 using TreeEntry = BoUpSLP::TreeEntry;
4471
4472 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4473
4474 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4475 std::string Str;
4477 OS << Entry->Idx << ".\n";
4478 if (isSplat(Entry->Scalars))
4479 OS << "<splat> ";
4480 for (auto *V : Entry->Scalars) {
4481 OS << *V;
4482 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4483 return EU.Scalar == V;
4484 }))
4485 OS << " <extract>";
4486 OS << "\n";
4487 }
4488 return Str;
4489 }
4490
4491 static std::string getNodeAttributes(const TreeEntry *Entry,
4492 const BoUpSLP *) {
4493 if (Entry->isGather())
4494 return "color=red";
4495 if (Entry->State == TreeEntry::ScatterVectorize ||
4496 Entry->State == TreeEntry::StridedVectorize)
4497 return "color=blue";
4498 return "";
4499 }
4500};
4501
4502} // end namespace llvm
4503
4506 for (auto *I : DeletedInstructions) {
4507 if (!I->getParent()) {
4508 // Temporarily insert instruction back to erase them from parent and
4509 // memory later.
4510 if (isa<PHINode>(I))
4511 // Phi nodes must be the very first instructions in the block.
4512 I->insertBefore(F->getEntryBlock(),
4513 F->getEntryBlock().getFirstNonPHIIt());
4514 else
4515 I->insertBefore(F->getEntryBlock().getTerminator());
4516 continue;
4517 }
4518 for (Use &U : I->operands()) {
4519 auto *Op = dyn_cast<Instruction>(U.get());
4520 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4522 DeadInsts.emplace_back(Op);
4523 }
4524 I->dropAllReferences();
4525 }
4526 for (auto *I : DeletedInstructions) {
4527 assert(I->use_empty() &&
4528 "trying to erase instruction with users.");
4529 I->eraseFromParent();
4530 }
4531
4532 // Cleanup any dead scalar code feeding the vectorized instructions
4534
4535#ifdef EXPENSIVE_CHECKS
4536 // If we could guarantee that this call is not extremely slow, we could
4537 // remove the ifdef limitation (see PR47712).
4538 assert(!verifyFunction(*F, &dbgs()));
4539#endif
4540}
4541
4542/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4543/// contains original mask for the scalars reused in the node. Procedure
4544/// transform this mask in accordance with the given \p Mask.
4546 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4547 "Expected non-empty mask.");
4548 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4549 Prev.swap(Reuses);
4550 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4551 if (Mask[I] != PoisonMaskElem)
4552 Reuses[Mask[I]] = Prev[I];
4553}
4554
4555/// Reorders the given \p Order according to the given \p Mask. \p Order - is
4556/// the original order of the scalars. Procedure transforms the provided order
4557/// in accordance with the given \p Mask. If the resulting \p Order is just an
4558/// identity order, \p Order is cleared.
4560 bool BottomOrder = false) {
4561 assert(!Mask.empty() && "Expected non-empty mask.");
4562 unsigned Sz = Mask.size();
4563 if (BottomOrder) {
4564 SmallVector<unsigned> PrevOrder;
4565 if (Order.empty()) {
4566 PrevOrder.resize(Sz);
4567 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4568 } else {
4569 PrevOrder.swap(Order);
4570 }
4571 Order.assign(Sz, Sz);
4572 for (unsigned I = 0; I < Sz; ++I)
4573 if (Mask[I] != PoisonMaskElem)
4574 Order[I] = PrevOrder[Mask[I]];
4575 if (all_of(enumerate(Order), [&](const auto &Data) {
4576 return Data.value() == Sz || Data.index() == Data.value();
4577 })) {
4578 Order.clear();
4579 return;
4580 }
4581 fixupOrderingIndices(Order);
4582 return;
4583 }
4584 SmallVector<int> MaskOrder;
4585 if (Order.empty()) {
4586 MaskOrder.resize(Sz);
4587 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4588 } else {
4589 inversePermutation(Order, MaskOrder);
4590 }
4591 reorderReuses(MaskOrder, Mask);
4592 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4593 Order.clear();
4594 return;
4595 }
4596 Order.assign(Sz, Sz);
4597 for (unsigned I = 0; I < Sz; ++I)
4598 if (MaskOrder[I] != PoisonMaskElem)
4599 Order[MaskOrder[I]] = I;
4600 fixupOrderingIndices(Order);
4601}
4602
4603std::optional<BoUpSLP::OrdersType>
4604BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4605 assert(TE.isGather() && "Expected gather node only.");
4606 // Try to find subvector extract/insert patterns and reorder only such
4607 // patterns.
4608 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4609 Type *ScalarTy = GatheredScalars.front()->getType();
4610 int NumScalars = GatheredScalars.size();
4611 if (!isValidElementType(ScalarTy))
4612 return std::nullopt;
4613 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4614 int NumParts = TTI->getNumberOfParts(VecTy);
4615 if (NumParts == 0 || NumParts >= NumScalars ||
4616 VecTy->getNumElements() % NumParts != 0 ||
4617 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4618 VecTy->getNumElements() / NumParts))
4619 NumParts = 1;
4620 SmallVector<int> ExtractMask;
4621 SmallVector<int> Mask;
4624 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4626 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4627 /*ForOrder=*/true);
4628 // No shuffled operands - ignore.
4629 if (GatherShuffles.empty() && ExtractShuffles.empty())
4630 return std::nullopt;
4631 OrdersType CurrentOrder(NumScalars, NumScalars);
4632 if (GatherShuffles.size() == 1 &&
4633 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4634 Entries.front().front()->isSame(TE.Scalars)) {
4635 // Perfect match in the graph, will reuse the previously vectorized
4636 // node. Cost is 0.
4637 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4638 return CurrentOrder;
4639 }
4640 auto IsSplatMask = [](ArrayRef<int> Mask) {
4641 int SingleElt = PoisonMaskElem;
4642 return all_of(Mask, [&](int I) {
4643 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4644 SingleElt = I;
4645 return I == PoisonMaskElem || I == SingleElt;
4646 });
4647 };
4648 // Exclusive broadcast mask - ignore.
4649 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4650 (Entries.size() != 1 ||
4651 Entries.front().front()->ReorderIndices.empty())) ||
4652 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4653 return std::nullopt;
4654 SmallBitVector ShuffledSubMasks(NumParts);
4655 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4656 ArrayRef<int> Mask, int PartSz, int NumParts,
4657 function_ref<unsigned(unsigned)> GetVF) {
4658 for (int I : seq<int>(0, NumParts)) {
4659 if (ShuffledSubMasks.test(I))
4660 continue;
4661 const int VF = GetVF(I);
4662 if (VF == 0)
4663 continue;
4664 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4665 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4666 // Shuffle of at least 2 vectors - ignore.
4667 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4668 std::fill(Slice.begin(), Slice.end(), NumScalars);
4669 ShuffledSubMasks.set(I);
4670 continue;
4671 }
4672 // Try to include as much elements from the mask as possible.
4673 int FirstMin = INT_MAX;
4674 int SecondVecFound = false;
4675 for (int K : seq<int>(Limit)) {
4676 int Idx = Mask[I * PartSz + K];
4677 if (Idx == PoisonMaskElem) {
4678 Value *V = GatheredScalars[I * PartSz + K];
4679 if (isConstant(V) && !isa<PoisonValue>(V)) {
4680 SecondVecFound = true;
4681 break;
4682 }
4683 continue;
4684 }
4685 if (Idx < VF) {
4686 if (FirstMin > Idx)
4687 FirstMin = Idx;
4688 } else {
4689 SecondVecFound = true;
4690 break;
4691 }
4692 }
4693 FirstMin = (FirstMin / PartSz) * PartSz;
4694 // Shuffle of at least 2 vectors - ignore.
4695 if (SecondVecFound) {
4696 std::fill(Slice.begin(), Slice.end(), NumScalars);
4697 ShuffledSubMasks.set(I);
4698 continue;
4699 }
4700 for (int K : seq<int>(Limit)) {
4701 int Idx = Mask[I * PartSz + K];
4702 if (Idx == PoisonMaskElem)
4703 continue;
4704 Idx -= FirstMin;
4705 if (Idx >= PartSz) {
4706 SecondVecFound = true;
4707 break;
4708 }
4709 if (CurrentOrder[I * PartSz + Idx] >
4710 static_cast<unsigned>(I * PartSz + K) &&
4711 CurrentOrder[I * PartSz + Idx] !=
4712 static_cast<unsigned>(I * PartSz + Idx))
4713 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4714 }
4715 // Shuffle of at least 2 vectors - ignore.
4716 if (SecondVecFound) {
4717 std::fill(Slice.begin(), Slice.end(), NumScalars);
4718 ShuffledSubMasks.set(I);
4719 continue;
4720 }
4721 }
4722 };
4723 int PartSz = getPartNumElems(NumScalars, NumParts);
4724 if (!ExtractShuffles.empty())
4725 TransformMaskToOrder(
4726 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4727 if (!ExtractShuffles[I])
4728 return 0U;
4729 unsigned VF = 0;
4730 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4731 for (unsigned Idx : seq<unsigned>(Sz)) {
4732 int K = I * PartSz + Idx;
4733 if (ExtractMask[K] == PoisonMaskElem)
4734 continue;
4735 if (!TE.ReuseShuffleIndices.empty())
4736 K = TE.ReuseShuffleIndices[K];
4737 if (K == PoisonMaskElem)
4738 continue;
4739 if (!TE.ReorderIndices.empty())
4740 K = std::distance(TE.ReorderIndices.begin(),
4741 find(TE.ReorderIndices, K));
4742 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4743 if (!EI)
4744 continue;
4745 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4746 ->getElementCount()
4747 .getKnownMinValue());
4748 }
4749 return VF;
4750 });
4751 // Check special corner case - single shuffle of the same entry.
4752 if (GatherShuffles.size() == 1 && NumParts != 1) {
4753 if (ShuffledSubMasks.any())
4754 return std::nullopt;
4755 PartSz = NumScalars;
4756 NumParts = 1;
4757 }
4758 if (!Entries.empty())
4759 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4760 if (!GatherShuffles[I])
4761 return 0U;
4762 return std::max(Entries[I].front()->getVectorFactor(),
4763 Entries[I].back()->getVectorFactor());
4764 });
4765 int NumUndefs =
4766 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4767 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4768 return std::nullopt;
4769 return std::move(CurrentOrder);
4770}
4771
4772static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4773 const TargetLibraryInfo &TLI,
4774 bool CompareOpcodes = true) {
4777 return false;
4778 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4779 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4780 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4781 (!GEP2 || GEP2->getNumOperands() == 2) &&
4782 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4783 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4784 !CompareOpcodes ||
4785 (GEP1 && GEP2 &&
4786 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4787}
4788
4789/// Calculates minimal alignment as a common alignment.
4790template <typename T>
4792 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4793 for (Value *V : VL.drop_front())
4794 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4795 return CommonAlignment;
4796}
4797
4798/// Check if \p Order represents reverse order.
4800 assert(!Order.empty() &&
4801 "Order is empty. Please check it before using isReverseOrder.");
4802 unsigned Sz = Order.size();
4803 return all_of(enumerate(Order), [&](const auto &Pair) {
4804 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4805 });
4806}
4807
4808/// Checks if the provided list of pointers \p Pointers represents the strided
4809/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4810/// Otherwise, if \p Inst is not specified, just initialized optional value is
4811/// returned to show that the pointers represent strided pointers. If \p Inst
4812/// specified, the runtime stride is materialized before the given \p Inst.
4813/// \returns std::nullopt if the pointers are not pointers with the runtime
4814/// stride, nullptr or actual stride value, otherwise.
4815static std::optional<Value *>
4817 const DataLayout &DL, ScalarEvolution &SE,
4818 SmallVectorImpl<unsigned> &SortedIndices,
4819 Instruction *Inst = nullptr) {
4821 const SCEV *PtrSCEVLowest = nullptr;
4822 const SCEV *PtrSCEVHighest = nullptr;
4823 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4824 // addresses).
4825 for (Value *Ptr : PointerOps) {
4826 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4827 if (!PtrSCEV)
4828 return std::nullopt;
4829 SCEVs.push_back(PtrSCEV);
4830 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4831 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4832 continue;
4833 }
4834 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4835 if (isa<SCEVCouldNotCompute>(Diff))
4836 return std::nullopt;
4837 if (Diff->isNonConstantNegative()) {
4838 PtrSCEVLowest = PtrSCEV;
4839 continue;
4840 }
4841 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4842 if (isa<SCEVCouldNotCompute>(Diff1))
4843 return std::nullopt;
4844 if (Diff1->isNonConstantNegative()) {
4845 PtrSCEVHighest = PtrSCEV;
4846 continue;
4847 }
4848 }
4849 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4850 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4851 if (isa<SCEVCouldNotCompute>(Dist))
4852 return std::nullopt;
4853 int Size = DL.getTypeStoreSize(ElemTy);
4854 auto TryGetStride = [&](const SCEV *Dist,
4855 const SCEV *Multiplier) -> const SCEV * {
4856 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4857 if (M->getOperand(0) == Multiplier)
4858 return M->getOperand(1);
4859 if (M->getOperand(1) == Multiplier)
4860 return M->getOperand(0);
4861 return nullptr;
4862 }
4863 if (Multiplier == Dist)
4864 return SE.getConstant(Dist->getType(), 1);
4865 return SE.getUDivExactExpr(Dist, Multiplier);
4866 };
4867 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4868 const SCEV *Stride = nullptr;
4869 if (Size != 1 || SCEVs.size() > 2) {
4870 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4871 Stride = TryGetStride(Dist, Sz);
4872 if (!Stride)
4873 return std::nullopt;
4874 }
4875 if (!Stride || isa<SCEVConstant>(Stride))
4876 return std::nullopt;
4877 // Iterate through all pointers and check if all distances are
4878 // unique multiple of Stride.
4879 using DistOrdPair = std::pair<int64_t, int>;
4880 auto Compare = llvm::less_first();
4881 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4882 int Cnt = 0;
4883 bool IsConsecutive = true;
4884 for (const SCEV *PtrSCEV : SCEVs) {
4885 unsigned Dist = 0;
4886 if (PtrSCEV != PtrSCEVLowest) {
4887 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4888 const SCEV *Coeff = TryGetStride(Diff, Stride);
4889 if (!Coeff)
4890 return std::nullopt;
4891 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4892 if (!SC || isa<SCEVCouldNotCompute>(SC))
4893 return std::nullopt;
4894 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4895 SE.getMulExpr(Stride, SC)))
4896 ->isZero())
4897 return std::nullopt;
4898 Dist = SC->getAPInt().getZExtValue();
4899 }
4900 // If the strides are not the same or repeated, we can't vectorize.
4901 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4902 return std::nullopt;
4903 auto Res = Offsets.emplace(Dist, Cnt);
4904 if (!Res.second)
4905 return std::nullopt;
4906 // Consecutive order if the inserted element is the last one.
4907 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4908 ++Cnt;
4909 }
4910 if (Offsets.size() != SCEVs.size())
4911 return std::nullopt;
4912 SortedIndices.clear();
4913 if (!IsConsecutive) {
4914 // Fill SortedIndices array only if it is non-consecutive.
4915 SortedIndices.resize(PointerOps.size());
4916 Cnt = 0;
4917 for (const std::pair<int64_t, int> &Pair : Offsets) {
4918 SortedIndices[Cnt] = Pair.second;
4919 ++Cnt;
4920 }
4921 }
4922 if (!Inst)
4923 return nullptr;
4924 SCEVExpander Expander(SE, DL, "strided-load-vec");
4925 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4926}
4927
4928static std::pair<InstructionCost, InstructionCost>
4930 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4931 Type *ScalarTy, VectorType *VecTy);
4932
4933/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4934/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
4935/// subvector pattern.
4936static InstructionCost
4938 VectorType *Tp, ArrayRef<int> Mask = {},
4940 int Index = 0, VectorType *SubTp = nullptr,
4942 if (Kind != TTI::SK_PermuteTwoSrc)
4943 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4944 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4945 int NumSubElts;
4947 Mask, NumSrcElts, NumSubElts, Index)) {
4948 if (Index + NumSubElts > NumSrcElts &&
4949 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4950 return TTI.getShuffleCost(
4952 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4954 }
4955 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4956}
4957
4961 SmallVectorImpl<Value *> &PointerOps,
4962 unsigned *BestVF, bool TryRecursiveCheck) const {
4963 // Check that a vectorized load would load the same memory as a scalar
4964 // load. For example, we don't want to vectorize loads that are smaller
4965 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4966 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4967 // from such a struct, we read/write packed bits disagreeing with the
4968 // unvectorized version.
4969 if (BestVF)
4970 *BestVF = 0;
4972 return LoadsState::Gather;
4973 Type *ScalarTy = VL0->getType();
4974
4975 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4976 return LoadsState::Gather;
4977
4978 // Make sure all loads in the bundle are simple - we can't vectorize
4979 // atomic or volatile loads.
4980 PointerOps.clear();
4981 const unsigned Sz = VL.size();
4982 PointerOps.resize(Sz);
4983 auto *POIter = PointerOps.begin();
4984 for (Value *V : VL) {
4985 auto *L = dyn_cast<LoadInst>(V);
4986 if (!L || !L->isSimple())
4987 return LoadsState::Gather;
4988 *POIter = L->getPointerOperand();
4989 ++POIter;
4990 }
4991
4992 Order.clear();
4993 // Check the order of pointer operands or that all pointers are the same.
4994 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4995
4996 auto *VecTy = getWidenedType(ScalarTy, Sz);
4997 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4998 if (!IsSorted) {
4999 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5000 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5001 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5003 }
5004
5005 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5006 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5007 return LoadsState::Gather;
5008
5009 if (!all_of(PointerOps, [&](Value *P) {
5010 return arePointersCompatible(P, PointerOps.front(), *TLI);
5011 }))
5012 return LoadsState::Gather;
5013
5014 } else {
5015 Value *Ptr0;
5016 Value *PtrN;
5017 if (Order.empty()) {
5018 Ptr0 = PointerOps.front();
5019 PtrN = PointerOps.back();
5020 } else {
5021 Ptr0 = PointerOps[Order.front()];
5022 PtrN = PointerOps[Order.back()];
5023 }
5024 std::optional<int> Diff =
5025 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5026 // Check that the sorted loads are consecutive.
5027 if (static_cast<unsigned>(*Diff) == Sz - 1)
5028 return LoadsState::Vectorize;
5029 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5030 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5031 return LoadsState::Gather;
5032 // Simple check if not a strided access - clear order.
5033 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5034 // Try to generate strided load node if:
5035 // 1. Target with strided load support is detected.
5036 // 2. The number of loads is greater than MinProfitableStridedLoads,
5037 // or the potential stride <= MaxProfitableLoadStride and the
5038 // potential stride is power-of-2 (to avoid perf regressions for the very
5039 // small number of loads) and max distance > number of loads, or potential
5040 // stride is -1.
5041 // 3. The loads are ordered, or number of unordered loads <=
5042 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5043 // (this check is to avoid extra costs for very expensive shuffles).
5044 // 4. Any pointer operand is an instruction with the users outside of the
5045 // current graph (for masked gathers extra extractelement instructions
5046 // might be required).
5047 auto IsAnyPointerUsedOutGraph =
5048 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5049 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5050 return !getTreeEntry(U) && !MustGather.contains(U);
5051 });
5052 });
5053 const unsigned AbsoluteDiff = std::abs(*Diff);
5054 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5056 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5057 has_single_bit(AbsoluteDiff))) &&
5058 AbsoluteDiff > Sz) ||
5059 *Diff == -(static_cast<int>(Sz) - 1))) {
5060 int Stride = *Diff / static_cast<int>(Sz - 1);
5061 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5062 Align Alignment =
5063 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5064 ->getAlign();
5065 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5066 // Iterate through all pointers and check if all distances are
5067 // unique multiple of Dist.
5068 SmallSet<int, 4> Dists;
5069 for (Value *Ptr : PointerOps) {
5070 int Dist = 0;
5071 if (Ptr == PtrN)
5072 Dist = *Diff;
5073 else if (Ptr != Ptr0)
5074 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5075 // If the strides are not the same or repeated, we can't
5076 // vectorize.
5077 if (((Dist / Stride) * Stride) != Dist ||
5078 !Dists.insert(Dist).second)
5079 break;
5080 }
5081 if (Dists.size() == Sz)
5083 }
5084 }
5085 }
5086 }
5087 // Correctly identify compare the cost of loads + shuffles rather than
5088 // strided/masked gather loads. Returns true if vectorized + shuffles
5089 // representation is better than just gather.
5090 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5091 unsigned *BestVF,
5092 bool ProfitableGatherPointers) {
5093 if (BestVF)
5094 *BestVF = 0;
5095 // Compare masked gather cost and loads + insert subvector costs.
5097 auto [ScalarGEPCost, VectorGEPCost] =
5098 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5099 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5100 // Estimate the cost of masked gather GEP. If not a splat, roughly
5101 // estimate as a buildvector, otherwise estimate as splat.
5102 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5103 VectorType *PtrVecTy =
5104 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5105 VecTy->getNumElements());
5106 if (static_cast<unsigned>(count_if(
5107 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5108 any_of(PointerOps, [&](Value *V) {
5109 return getUnderlyingObject(V) !=
5110 getUnderlyingObject(PointerOps.front());
5111 }))
5112 VectorGEPCost += TTI.getScalarizationOverhead(
5113 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5114 else
5115 VectorGEPCost +=
5117 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5118 /*Insert=*/true, /*Extract=*/false, CostKind) +
5120 // The cost of scalar loads.
5121 InstructionCost ScalarLoadsCost =
5122 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5123 [&](InstructionCost C, Value *V) {
5124 return C + TTI.getInstructionCost(
5125 cast<Instruction>(V), CostKind);
5126 }) +
5127 ScalarGEPCost;
5128 // The cost of masked gather.
5129 InstructionCost MaskedGatherCost =
5131 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5132 /*VariableMask=*/false, CommonAlignment, CostKind) +
5133 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5134 InstructionCost GatherCost =
5135 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5136 /*Extract=*/false, CostKind) +
5137 ScalarLoadsCost;
5138 // The list of loads is small or perform partial check already - directly
5139 // compare masked gather cost and gather cost.
5140 constexpr unsigned ListLimit = 4;
5141 if (!TryRecursiveCheck || VL.size() < ListLimit)
5142 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5143
5144 // FIXME: The following code has not been updated for non-power-of-2
5145 // vectors. The splitting logic here does not cover the original
5146 // vector if the vector factor is not a power of two. FIXME
5147 if (!has_single_bit(VL.size()))
5148 return false;
5149
5150 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5151 unsigned MinVF = getMinVF(2 * Sz);
5152 DemandedElts.clearAllBits();
5153 // Iterate through possible vectorization factors and check if vectorized +
5154 // shuffles is better than just gather.
5155 for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5157 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5158 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5160 SmallVector<Value *> PointerOps;
5161 LoadsState LS =
5162 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5163 /*TryRecursiveCheck=*/false);
5164 // Check that the sorted loads are consecutive.
5165 if (LS == LoadsState::Gather) {
5166 if (BestVF) {
5167 DemandedElts.setAllBits();
5168 break;
5169 }
5170 DemandedElts.setBits(Cnt, Cnt + VF);
5171 continue;
5172 }
5173 // If need the reorder - consider as high-cost masked gather for now.
5174 if ((LS == LoadsState::Vectorize ||
5176 !Order.empty() && !isReverseOrder(Order))
5178 States.push_back(LS);
5179 }
5180 if (DemandedElts.isAllOnes())
5181 // All loads gathered - try smaller VF.
5182 continue;
5183 // Can be vectorized later as a serie of loads/insertelements.
5184 InstructionCost VecLdCost = 0;
5185 if (!DemandedElts.isZero()) {
5186 VecLdCost =
5187 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5188 /*Extract=*/false, CostKind) +
5189 ScalarGEPCost;
5190 for (unsigned Idx : seq<unsigned>(VL.size()))
5191 if (DemandedElts[Idx])
5192 VecLdCost +=
5193 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5194 }
5195 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5196 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5197 for (auto [I, LS] : enumerate(States)) {
5198 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5199 InstructionCost VectorGEPCost =
5200 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5201 ? 0
5202 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5203 LI0->getPointerOperand(),
5204 Instruction::GetElementPtr, CostKind, ScalarTy,
5205 SubVecTy)
5206 .second;
5207 if (LS == LoadsState::ScatterVectorize) {
5208 if (static_cast<unsigned>(
5209 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5210 PointerOps.size() - 1 ||
5211 any_of(PointerOps, [&](Value *V) {
5212 return getUnderlyingObject(V) !=
5213 getUnderlyingObject(PointerOps.front());
5214 }))
5215 VectorGEPCost += TTI.getScalarizationOverhead(
5216 SubVecTy, APInt::getAllOnes(VF),
5217 /*Insert=*/true, /*Extract=*/false, CostKind);
5218 else
5219 VectorGEPCost +=
5221 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5222 /*Insert=*/true, /*Extract=*/false, CostKind) +
5223 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5224 CostKind);
5225 }
5226 switch (LS) {
5228 VecLdCost +=
5229 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5230 LI0->getPointerAddressSpace(), CostKind,
5232 VectorGEPCost;
5233 break;
5235 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5236 LI0->getPointerOperand(),
5237 /*VariableMask=*/false,
5238 CommonAlignment, CostKind) +
5239 VectorGEPCost;
5240 break;
5242 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5243 LI0->getPointerOperand(),
5244 /*VariableMask=*/false,
5245 CommonAlignment, CostKind) +
5246 VectorGEPCost;
5247 break;
5248 case LoadsState::Gather:
5249 // Gathers are already calculated - ignore.
5250 continue;
5251 }
5252 SmallVector<int> ShuffleMask(VL.size());
5253 for (int Idx : seq<int>(0, VL.size()))
5254 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5255 if (I > 0)
5256 VecLdCost +=
5257 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5258 CostKind, I * VF, SubVecTy);
5259 }
5260 // If masked gather cost is higher - better to vectorize, so
5261 // consider it as a gather node. It will be better estimated
5262 // later.
5263 if (MaskedGatherCost >= VecLdCost &&
5264 VecLdCost - GatherCost < -SLPCostThreshold) {
5265 if (BestVF)
5266 *BestVF = VF;
5267 return true;
5268 }
5269 }
5270 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5271 };
5272 // TODO: need to improve analysis of the pointers, if not all of them are
5273 // GEPs or have > 2 operands, we end up with a gather node, which just
5274 // increases the cost.
5275 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5276 bool ProfitableGatherPointers =
5277 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5278 return L->isLoopInvariant(V);
5279 })) <= Sz / 2;
5280 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5281 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5282 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5283 (GEP && GEP->getNumOperands() == 2 &&
5284 isa<Constant, Instruction>(GEP->getOperand(1)));
5285 })) {
5286 // Check if potential masked gather can be represented as series
5287 // of loads + insertsubvectors.
5288 // If masked gather cost is higher - better to vectorize, so
5289 // consider it as a gather node. It will be better estimated
5290 // later.
5291 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5292 ProfitableGatherPointers))
5294 }
5295
5296 return LoadsState::Gather;
5297}
5298
5300 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5301 const DataLayout &DL, ScalarEvolution &SE,
5302 SmallVectorImpl<unsigned> &SortedIndices) {
5303 assert(
5304 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5305 "Expected list of pointer operands.");
5306 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
5307 // Ptr into, sort and return the sorted indices with values next to one
5308 // another.
5311 Bases;
5312 Bases
5313 .try_emplace(std::make_pair(
5315 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5316
5317 SortedIndices.clear();
5318 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5319 auto Key = std::make_pair(BBs[Cnt + 1],
5321 bool Found = any_of(Bases.try_emplace(Key).first->second,
5322 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5323 std::optional<int> Diff = getPointersDiff(
5324 ElemTy, std::get<0>(Base.front()), ElemTy,
5325 Ptr, DL, SE,
5326 /*StrictCheck=*/true);
5327 if (!Diff)
5328 return false;
5329
5330 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5331 return true;
5332 });
5333
5334 if (!Found) {
5335 // If we haven't found enough to usefully cluster, return early.
5336 if (Bases.size() > VL.size() / 2 - 1)
5337 return false;
5338
5339 // Not found already - add a new Base
5340 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5341 }
5342 }
5343
5344 if (Bases.size() == VL.size())
5345 return false;
5346
5347 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5348 Bases.front().second.size() == VL.size()))
5349 return false;
5350
5351 // For each of the bases sort the pointers by Offset and check if any of the
5352 // base become consecutively allocated.
5353 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5354 SmallPtrSet<Value *, 13> FirstPointers;
5355 SmallPtrSet<Value *, 13> SecondPointers;
5356 Value *P1 = Ptr1;
5357 Value *P2 = Ptr2;
5358 if (P1 == P2)
5359 return false;
5360 unsigned Depth = 0;
5361 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) &&
5363 FirstPointers.insert(P1);
5364 SecondPointers.insert(P2);
5365 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5366 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5367 ++Depth;
5368 }
5369 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5370 "Unable to find matching root.");
5371 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5372 };
5373 for (auto &Base : Bases) {
5374 for (auto &Vec : Base.second) {
5375 if (Vec.size() > 1) {
5376 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5377 const std::tuple<Value *, int, unsigned> &Y) {
5378 return std::get<1>(X) < std::get<1>(Y);
5379 });
5380 int InitialOffset = std::get<1>(Vec[0]);
5381 bool AnyConsecutive =
5382 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5383 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5384 });
5385 // Fill SortedIndices array only if it looks worth-while to sort the
5386 // ptrs.
5387 if (!AnyConsecutive)
5388 return false;
5389 }
5390 }
5391 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5392 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5393 });
5394 }
5395
5396 for (auto &T : Bases)
5397 for (const auto &Vec : T.second)
5398 for (const auto &P : Vec)
5399 SortedIndices.push_back(std::get<2>(P));
5400
5401 assert(SortedIndices.size() == VL.size() &&
5402 "Expected SortedIndices to be the size of VL");
5403 return true;
5404}
5405
5406std::optional<BoUpSLP::OrdersType>
5407BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5408 assert(TE.isGather() && "Expected gather node only.");
5409 Type *ScalarTy = TE.Scalars[0]->getType();
5410
5412 Ptrs.reserve(TE.Scalars.size());
5414 BBs.reserve(TE.Scalars.size());
5415 for (Value *V : TE.Scalars) {
5416 auto *L = dyn_cast<LoadInst>(V);
5417 if (!L || !L->isSimple())
5418 return std::nullopt;
5419 Ptrs.push_back(L->getPointerOperand());
5420 BBs.push_back(L->getParent());
5421 }
5422
5423 BoUpSLP::OrdersType Order;
5424 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5425 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5426 return std::move(Order);
5427 return std::nullopt;
5428}
5429
5430/// Check if two insertelement instructions are from the same buildvector.
5433 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5434 // Instructions must be from the same basic blocks.
5435 if (VU->getParent() != V->getParent())
5436 return false;
5437 // Checks if 2 insertelements are from the same buildvector.
5438 if (VU->getType() != V->getType())
5439 return false;
5440 // Multiple used inserts are separate nodes.
5441 if (!VU->hasOneUse() && !V->hasOneUse())
5442 return false;
5443 auto *IE1 = VU;
5444 auto *IE2 = V;
5445 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5446 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5447 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5448 return false;
5449 // Go through the vector operand of insertelement instructions trying to find
5450 // either VU as the original vector for IE2 or V as the original vector for
5451 // IE1.
5452 SmallBitVector ReusedIdx(
5453 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5454 bool IsReusedIdx = false;
5455 do {
5456 if (IE2 == VU && !IE1)
5457 return VU->hasOneUse();
5458 if (IE1 == V && !IE2)
5459 return V->hasOneUse();
5460 if (IE1 && IE1 != V) {
5461 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5462 IsReusedIdx |= ReusedIdx.test(Idx1);
5463 ReusedIdx.set(Idx1);
5464 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5465 IE1 = nullptr;
5466 else
5467 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5468 }
5469 if (IE2 && IE2 != VU) {
5470 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5471 IsReusedIdx |= ReusedIdx.test(Idx2);
5472 ReusedIdx.set(Idx2);
5473 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5474 IE2 = nullptr;
5475 else
5476 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5477 }
5478 } while (!IsReusedIdx && (IE1 || IE2));
5479 return false;
5480}
5481
5482std::optional<BoUpSLP::OrdersType>
5483BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5484 // No need to reorder if need to shuffle reuses, still need to shuffle the
5485 // node.
5486 if (!TE.ReuseShuffleIndices.empty()) {
5487 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5488 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5489 "Reshuffling scalars not yet supported for nodes with padding");
5490
5491 if (isSplat(TE.Scalars))
5492 return std::nullopt;
5493 // Check if reuse shuffle indices can be improved by reordering.
5494 // For this, check that reuse mask is "clustered", i.e. each scalar values
5495 // is used once in each submask of size <number_of_scalars>.
5496 // Example: 4 scalar values.
5497 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5498 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5499 // element 3 is used twice in the second submask.
5500 unsigned Sz = TE.Scalars.size();
5501 if (TE.isGather()) {
5502 if (std::optional<OrdersType> CurrentOrder =
5504 SmallVector<int> Mask;
5505 fixupOrderingIndices(*CurrentOrder);
5506 inversePermutation(*CurrentOrder, Mask);
5507 ::addMask(Mask, TE.ReuseShuffleIndices);
5508 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5509 unsigned Sz = TE.Scalars.size();
5510 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5511 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5512 if (Idx != PoisonMaskElem)
5513 Res[Idx + K * Sz] = I + K * Sz;
5514 }
5515 return std::move(Res);
5516 }
5517 }
5518 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5519 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5520 2 * TE.getVectorFactor())) == 1)
5521 return std::nullopt;
5522 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5523 Sz)) {
5524 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5525 if (TE.ReorderIndices.empty())
5526 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5527 else
5528 inversePermutation(TE.ReorderIndices, ReorderMask);
5529 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5530 unsigned VF = ReorderMask.size();
5531 OrdersType ResOrder(VF, VF);
5532 unsigned NumParts = divideCeil(VF, Sz);
5533 SmallBitVector UsedVals(NumParts);
5534 for (unsigned I = 0; I < VF; I += Sz) {
5535 int Val = PoisonMaskElem;
5536 unsigned UndefCnt = 0;
5537 unsigned Limit = std::min(Sz, VF - I);
5538 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5539 [&](int Idx) {
5540 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5541 Val = Idx;
5542 if (Idx == PoisonMaskElem)
5543 ++UndefCnt;
5544 return Idx != PoisonMaskElem && Idx != Val;
5545 }) ||
5546 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5547 UndefCnt > Sz / 2)
5548 return std::nullopt;
5549 UsedVals.set(Val);
5550 for (unsigned K = 0; K < NumParts; ++K) {
5551 unsigned Idx = Val + Sz * K;
5552 if (Idx < VF)
5553 ResOrder[Idx] = I + K;
5554 }
5555 }
5556 return std::move(ResOrder);
5557 }
5558 unsigned VF = TE.getVectorFactor();
5559 // Try build correct order for extractelement instructions.
5560 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5561 TE.ReuseShuffleIndices.end());
5562 if (TE.getOpcode() == Instruction::ExtractElement &&
5563 all_of(TE.Scalars, [Sz](Value *V) {
5564 if (isa<PoisonValue>(V))
5565 return true;
5566 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5567 return Idx && *Idx < Sz;
5568 })) {
5569 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5570 "by BinaryOperator and CastInst.");
5571 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5572 if (TE.ReorderIndices.empty())
5573 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5574 else
5575 inversePermutation(TE.ReorderIndices, ReorderMask);
5576 for (unsigned I = 0; I < VF; ++I) {
5577 int &Idx = ReusedMask[I];
5578 if (Idx == PoisonMaskElem)
5579 continue;
5580 Value *V = TE.Scalars[ReorderMask[Idx]];
5581 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5582 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5583 }
5584 }
5585 // Build the order of the VF size, need to reorder reuses shuffles, they are
5586 // always of VF size.
5587 OrdersType ResOrder(VF);
5588 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5589 auto *It = ResOrder.begin();
5590 for (unsigned K = 0; K < VF; K += Sz) {
5591 OrdersType CurrentOrder(TE.ReorderIndices);
5592 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5593 if (SubMask.front() == PoisonMaskElem)
5594 std::iota(SubMask.begin(), SubMask.end(), 0);
5595 reorderOrder(CurrentOrder, SubMask);
5596 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5597 std::advance(It, Sz);
5598 }
5599 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5600 return Data.index() == Data.value();
5601 }))
5602 return std::nullopt; // No need to reorder.
5603 return std::move(ResOrder);
5604 }
5605 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5606 any_of(TE.UserTreeIndices,
5607 [](const EdgeInfo &EI) {
5608 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5609 }) &&
5610 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5611 return std::nullopt;
5612 if ((TE.State == TreeEntry::Vectorize ||
5613 TE.State == TreeEntry::StridedVectorize) &&
5614 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5615 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5616 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5617 "BinaryOperator and CastInst.");
5618 return TE.ReorderIndices;
5619 }
5620 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5621 if (!TE.ReorderIndices.empty())
5622 return TE.ReorderIndices;
5623
5624 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5625 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5626 if (!V->hasNUsesOrMore(1))
5627 continue;
5628 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5629 if (!II)
5630 continue;
5631 Instruction *BVHead = nullptr;
5632 BasicBlock *BB = II->getParent();
5633 while (II && II->hasOneUse() && II->getParent() == BB) {
5634 BVHead = II;
5635 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5636 }
5637 I = BVHead;
5638 }
5639
5640 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5641 assert(BB1 != BB2 && "Expected different basic blocks.");
5642 auto *NodeA = DT->getNode(BB1);
5643 auto *NodeB = DT->getNode(BB2);
5644 assert(NodeA && "Should only process reachable instructions");
5645 assert(NodeB && "Should only process reachable instructions");
5646 assert((NodeA == NodeB) ==
5647 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5648 "Different nodes should have different DFS numbers");
5649 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5650 };
5651 auto PHICompare = [&](unsigned I1, unsigned I2) {
5652 Value *V1 = TE.Scalars[I1];
5653 Value *V2 = TE.Scalars[I2];
5654 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5655 return false;
5656 if (isa<PoisonValue>(V1))
5657 return true;
5658 if (isa<PoisonValue>(V2))
5659 return false;
5660 if (V1->getNumUses() < V2->getNumUses())
5661 return true;
5662 if (V1->getNumUses() > V2->getNumUses())
5663 return false;
5664 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5665 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5666 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5667 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5668 FirstUserOfPhi2->getParent());
5669 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5670 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5671 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5672 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5673 if (IE1 && !IE2)
5674 return true;
5675 if (!IE1 && IE2)
5676 return false;
5677 if (IE1 && IE2) {
5678 if (UserBVHead[I1] && !UserBVHead[I2])
5679 return true;
5680 if (!UserBVHead[I1])
5681 return false;
5682 if (UserBVHead[I1] == UserBVHead[I2])
5683 return getElementIndex(IE1) < getElementIndex(IE2);
5684 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5685 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5686 UserBVHead[I2]->getParent());
5687 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5688 }
5689 if (EE1 && !EE2)
5690 return true;
5691 if (!EE1 && EE2)
5692 return false;
5693 if (EE1 && EE2) {
5694 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5695 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5696 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5697 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5698 if (!Inst2 && !P2)
5699 return Inst1 || P1;
5700 if (EE1->getOperand(0) == EE2->getOperand(0))
5701 return getElementIndex(EE1) < getElementIndex(EE2);
5702 if (!Inst1 && Inst2)
5703 return false;
5704 if (Inst1 && Inst2) {
5705 if (Inst1->getParent() != Inst2->getParent())
5706 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5707 return Inst1->comesBefore(Inst2);
5708 }
5709 if (!P1 && P2)
5710 return false;
5711 assert(P1 && P2 &&
5712 "Expected either instructions or arguments vector operands.");
5713 return P1->getArgNo() < P2->getArgNo();
5714 }
5715 return false;
5716 };
5717 OrdersType Phis(TE.Scalars.size());
5718 std::iota(Phis.begin(), Phis.end(), 0);
5719 stable_sort(Phis, PHICompare);
5720 if (isIdentityOrder(Phis))
5721 return std::nullopt; // No need to reorder.
5722 return std::move(Phis);
5723 }
5724 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5725 // TODO: add analysis of other gather nodes with extractelement
5726 // instructions and other values/instructions, not only undefs.
5727 if ((TE.getOpcode() == Instruction::ExtractElement ||
5728 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5729 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5730 all_of(TE.Scalars, [](Value *V) {
5731 auto *EE = dyn_cast<ExtractElementInst>(V);
5732 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5733 })) {
5734 // Check that gather of extractelements can be represented as
5735 // just a shuffle of a single vector.
5736 OrdersType CurrentOrder;
5737 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5738 /*ResizeAllowed=*/true);
5739 if (Reuse || !CurrentOrder.empty())
5740 return std::move(CurrentOrder);
5741 }
5742 // If the gather node is <undef, v, .., poison> and
5743 // insertelement poison, v, 0 [+ permute]
5744 // is cheaper than
5745 // insertelement poison, v, n - try to reorder.
5746 // If rotating the whole graph, exclude the permute cost, the whole graph
5747 // might be transformed.
5748 int Sz = TE.Scalars.size();
5749 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5750 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5751 const auto *It =
5752 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5753 if (It == TE.Scalars.begin())
5754 return OrdersType();
5755 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5756 if (It != TE.Scalars.end()) {
5757 OrdersType Order(Sz, Sz);
5758 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5759 Order[Idx] = 0;
5760 fixupOrderingIndices(Order);
5761 SmallVector<int> Mask;
5762 inversePermutation(Order, Mask);
5763 InstructionCost PermuteCost =
5764 TopToBottom
5765 ? 0
5767 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5768 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5769 PoisonValue::get(Ty), *It);
5770 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5771 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5772 PoisonValue::get(Ty), *It);
5773 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5774 OrdersType Order(Sz, Sz);
5775 Order[Idx] = 0;
5776 return std::move(Order);
5777 }
5778 }
5779 }
5780 if (isSplat(TE.Scalars))
5781 return std::nullopt;
5782 if (TE.Scalars.size() >= 3)
5783 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5784 return Order;
5785 // Check if can include the order of vectorized loads. For masked gathers do
5786 // extra analysis later, so include such nodes into a special list.
5787 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5788 SmallVector<Value *> PointerOps;
5789 OrdersType CurrentOrder;
5790 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5791 CurrentOrder, PointerOps);
5793 return std::move(CurrentOrder);
5794 }
5795 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5796 // has been auditted for correctness with non-power-of-two vectors.
5797 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5798 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5799 return CurrentOrder;
5800 }
5801 return std::nullopt;
5802}
5803
5804/// Checks if the given mask is a "clustered" mask with the same clusters of
5805/// size \p Sz, which are not identity submasks.
5807 unsigned Sz) {
5808 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5809 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5810 return false;
5811 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5812 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5813 if (Cluster != FirstCluster)
5814 return false;
5815 }
5816 return true;
5817}
5818
5819void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5820 // Reorder reuses mask.
5821 reorderReuses(TE.ReuseShuffleIndices, Mask);
5822 const unsigned Sz = TE.Scalars.size();
5823 // For vectorized and non-clustered reused no need to do anything else.
5824 if (!TE.isGather() ||
5826 Sz) ||
5827 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5828 return;
5829 SmallVector<int> NewMask;
5830 inversePermutation(TE.ReorderIndices, NewMask);
5831 addMask(NewMask, TE.ReuseShuffleIndices);
5832 // Clear reorder since it is going to be applied to the new mask.
5833 TE.ReorderIndices.clear();
5834 // Try to improve gathered nodes with clustered reuses, if possible.
5835 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5836 SmallVector<unsigned> NewOrder(Slice);
5837 inversePermutation(NewOrder, NewMask);
5838 reorderScalars(TE.Scalars, NewMask);
5839 // Fill the reuses mask with the identity submasks.
5840 for (auto *It = TE.ReuseShuffleIndices.begin(),
5841 *End = TE.ReuseShuffleIndices.end();
5842 It != End; std::advance(It, Sz))
5843 std::iota(It, std::next(It, Sz), 0);
5844}
5845
5847 ArrayRef<unsigned> SecondaryOrder) {
5848 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5849 "Expected same size of orders");
5850 unsigned Sz = Order.size();
5851 SmallBitVector UsedIndices(Sz);
5852 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5853 if (Order[Idx] != Sz)
5854 UsedIndices.set(Order[Idx]);
5855 }
5856 if (SecondaryOrder.empty()) {
5857 for (unsigned Idx : seq<unsigned>(0, Sz))
5858 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5859 Order[Idx] = Idx;
5860 } else {
5861 for (unsigned Idx : seq<unsigned>(0, Sz))
5862 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5863 !UsedIndices.test(SecondaryOrder[Idx]))
5864 Order[Idx] = SecondaryOrder[Idx];
5865 }
5866}
5867
5869 // Maps VF to the graph nodes.
5871 // ExtractElement gather nodes which can be vectorized and need to handle
5872 // their ordering.
5874
5875 // Phi nodes can have preferred ordering based on their result users
5877
5878 // AltShuffles can also have a preferred ordering that leads to fewer
5879 // instructions, e.g., the addsub instruction in x86.
5880 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5881
5882 // Maps a TreeEntry to the reorder indices of external users.
5884 ExternalUserReorderMap;
5885 // Find all reorderable nodes with the given VF.
5886 // Currently the are vectorized stores,loads,extracts + some gathering of
5887 // extracts.
5888 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5889 const std::unique_ptr<TreeEntry> &TE) {
5890 // Look for external users that will probably be vectorized.
5891 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5892 findExternalStoreUsersReorderIndices(TE.get());
5893 if (!ExternalUserReorderIndices.empty()) {
5894 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5895 ExternalUserReorderMap.try_emplace(TE.get(),
5896 std::move(ExternalUserReorderIndices));
5897 }
5898
5899 // Patterns like [fadd,fsub] can be combined into a single instruction in
5900 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5901 // to take into account their order when looking for the most used order.
5902 if (TE->isAltShuffle()) {
5903 VectorType *VecTy =
5904 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5905 unsigned Opcode0 = TE->getOpcode();
5906 unsigned Opcode1 = TE->getAltOpcode();
5907 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5908 // If this pattern is supported by the target then we consider the order.
5909 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5910 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5911 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5912 }
5913 // TODO: Check the reverse order too.
5914 }
5915
5916 if (std::optional<OrdersType> CurrentOrder =
5917 getReorderingData(*TE, /*TopToBottom=*/true)) {
5918 // Do not include ordering for nodes used in the alt opcode vectorization,
5919 // better to reorder them during bottom-to-top stage. If follow the order
5920 // here, it causes reordering of the whole graph though actually it is
5921 // profitable just to reorder the subgraph that starts from the alternate
5922 // opcode vectorization node. Such nodes already end-up with the shuffle
5923 // instruction and it is just enough to change this shuffle rather than
5924 // rotate the scalars for the whole graph.
5925 unsigned Cnt = 0;
5926 const TreeEntry *UserTE = TE.get();
5927 while (UserTE && Cnt < RecursionMaxDepth) {
5928 if (UserTE->UserTreeIndices.size() != 1)
5929 break;
5930 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5931 return EI.UserTE->State == TreeEntry::Vectorize &&
5932 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5933 }))
5934 return;
5935 UserTE = UserTE->UserTreeIndices.back().UserTE;
5936 ++Cnt;
5937 }
5938 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5939 if (!(TE->State == TreeEntry::Vectorize ||
5940 TE->State == TreeEntry::StridedVectorize) ||
5941 !TE->ReuseShuffleIndices.empty())
5942 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5943 if (TE->State == TreeEntry::Vectorize &&
5944 TE->getOpcode() == Instruction::PHI)
5945 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5946 }
5947 });
5948
5949 // Reorder the graph nodes according to their vectorization factor.
5950 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5951 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5952 auto It = VFToOrderedEntries.find(VF);
5953 if (It == VFToOrderedEntries.end())
5954 continue;
5955 // Try to find the most profitable order. We just are looking for the most
5956 // used order and reorder scalar elements in the nodes according to this
5957 // mostly used order.
5958 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5959 // Delete VF entry upon exit.
5960 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5961
5962 // All operands are reordered and used only in this node - propagate the
5963 // most used order to the user node.
5966 OrdersUses;
5968 for (const TreeEntry *OpTE : OrderedEntries) {
5969 // No need to reorder this nodes, still need to extend and to use shuffle,
5970 // just need to merge reordering shuffle and the reuse shuffle.
5971 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5972 continue;
5973 // Count number of orders uses.
5974 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5975 &PhisToOrders]() -> const OrdersType & {
5976 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5977 auto It = GathersToOrders.find(OpTE);
5978 if (It != GathersToOrders.end())
5979 return It->second;
5980 }
5981 if (OpTE->isAltShuffle()) {
5982 auto It = AltShufflesToOrders.find(OpTE);
5983 if (It != AltShufflesToOrders.end())
5984 return It->second;
5985 }
5986 if (OpTE->State == TreeEntry::Vectorize &&
5987 OpTE->getOpcode() == Instruction::PHI) {
5988 auto It = PhisToOrders.find(OpTE);
5989 if (It != PhisToOrders.end())
5990 return It->second;
5991 }
5992 return OpTE->ReorderIndices;
5993 }();
5994 // First consider the order of the external scalar users.
5995 auto It = ExternalUserReorderMap.find(OpTE);
5996 if (It != ExternalUserReorderMap.end()) {
5997 const auto &ExternalUserReorderIndices = It->second;
5998 // If the OpTE vector factor != number of scalars - use natural order,
5999 // it is an attempt to reorder node with reused scalars but with
6000 // external uses.
6001 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6002 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6003 ExternalUserReorderIndices.size();
6004 } else {
6005 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6006 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6007 }
6008 // No other useful reorder data in this entry.
6009 if (Order.empty())
6010 continue;
6011 }
6012 // Stores actually store the mask, not the order, need to invert.
6013 if (OpTE->State == TreeEntry::Vectorize &&
6014 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6015 assert(!OpTE->isAltShuffle() &&
6016 "Alternate instructions are only supported by BinaryOperator "
6017 "and CastInst.");
6018 SmallVector<int> Mask;
6019 inversePermutation(Order, Mask);
6020 unsigned E = Order.size();
6021 OrdersType CurrentOrder(E, E);
6022 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6023 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6024 });
6025 fixupOrderingIndices(CurrentOrder);
6026 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6027 } else {
6028 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6029 }
6030 }
6031 if (OrdersUses.empty())
6032 continue;
6033 // Choose the most used order.
6034 unsigned IdentityCnt = 0;
6035 unsigned FilledIdentityCnt = 0;
6036 OrdersType IdentityOrder(VF, VF);
6037 for (auto &Pair : OrdersUses) {
6038 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6039 if (!Pair.first.empty())
6040 FilledIdentityCnt += Pair.second;
6041 IdentityCnt += Pair.second;
6042 combineOrders(IdentityOrder, Pair.first);
6043 }
6044 }
6045 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6046 unsigned Cnt = IdentityCnt;
6047 for (auto &Pair : OrdersUses) {
6048 // Prefer identity order. But, if filled identity found (non-empty order)
6049 // with same number of uses, as the new candidate order, we can choose
6050 // this candidate order.
6051 if (Cnt < Pair.second ||
6052 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6053 Cnt == Pair.second && !BestOrder.empty() &&
6054 isIdentityOrder(BestOrder))) {
6055 combineOrders(Pair.first, BestOrder);
6056 BestOrder = Pair.first;
6057 Cnt = Pair.second;
6058 } else {
6059 combineOrders(BestOrder, Pair.first);
6060 }
6061 }
6062 // Set order of the user node.
6063 if (isIdentityOrder(BestOrder))
6064 continue;
6065 fixupOrderingIndices(BestOrder);
6066 SmallVector<int> Mask;
6067 inversePermutation(BestOrder, Mask);
6068 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6069 unsigned E = BestOrder.size();
6070 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6071 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6072 });
6073 // Do an actual reordering, if profitable.
6074 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6075 // Just do the reordering for the nodes with the given VF.
6076 if (TE->Scalars.size() != VF) {
6077 if (TE->ReuseShuffleIndices.size() == VF) {
6078 // Need to reorder the reuses masks of the operands with smaller VF to
6079 // be able to find the match between the graph nodes and scalar
6080 // operands of the given node during vectorization/cost estimation.
6081 assert(all_of(TE->UserTreeIndices,
6082 [VF, &TE](const EdgeInfo &EI) {
6083 return EI.UserTE->Scalars.size() == VF ||
6084 EI.UserTE->Scalars.size() ==
6085 TE->Scalars.size();
6086 }) &&
6087 "All users must be of VF size.");
6088 if (SLPReVec) {
6089 assert(SLPReVec && "Only supported by REVEC.");
6090 // ShuffleVectorInst does not do reorderOperands (and it should not
6091 // because ShuffleVectorInst supports only a limited set of
6092 // patterns). Only do reorderNodeWithReuses if all of the users are
6093 // not ShuffleVectorInst.
6094 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6095 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6096 }))
6097 continue;
6098 assert(none_of(TE->UserTreeIndices,
6099 [&](const EdgeInfo &EI) {
6100 return isa<ShuffleVectorInst>(
6101 EI.UserTE->getMainOp());
6102 }) &&
6103 "Does not know how to reorder.");
6104 }
6105 // Update ordering of the operands with the smaller VF than the given
6106 // one.
6107 reorderNodeWithReuses(*TE, Mask);
6108 }
6109 continue;
6110 }
6111 if ((TE->State == TreeEntry::Vectorize ||
6112 TE->State == TreeEntry::StridedVectorize) &&
6114 InsertElementInst>(TE->getMainOp()) ||
6115 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6116 assert(!TE->isAltShuffle() &&
6117 "Alternate instructions are only supported by BinaryOperator "
6118 "and CastInst.");
6119 // Build correct orders for extract{element,value}, loads and
6120 // stores.
6121 reorderOrder(TE->ReorderIndices, Mask);
6122 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6123 TE->reorderOperands(Mask);
6124 } else {
6125 // Reorder the node and its operands.
6126 TE->reorderOperands(Mask);
6127 assert(TE->ReorderIndices.empty() &&
6128 "Expected empty reorder sequence.");
6129 reorderScalars(TE->Scalars, Mask);
6130 }
6131 if (!TE->ReuseShuffleIndices.empty()) {
6132 // Apply reversed order to keep the original ordering of the reused
6133 // elements to avoid extra reorder indices shuffling.
6134 OrdersType CurrentOrder;
6135 reorderOrder(CurrentOrder, MaskOrder);
6136 SmallVector<int> NewReuses;
6137 inversePermutation(CurrentOrder, NewReuses);
6138 addMask(NewReuses, TE->ReuseShuffleIndices);
6139 TE->ReuseShuffleIndices.swap(NewReuses);
6140 }
6141 }
6142 }
6143}
6144
6145bool BoUpSLP::canReorderOperands(
6146 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6147 ArrayRef<TreeEntry *> ReorderableGathers,
6148 SmallVectorImpl<TreeEntry *> &GatherOps) {
6149 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6150 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6151 return OpData.first == I &&
6152 (OpData.second->State == TreeEntry::Vectorize ||
6153 OpData.second->State == TreeEntry::StridedVectorize);
6154 }))
6155 continue;
6156 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6157 // Do not reorder if operand node is used by many user nodes.
6158 if (any_of(TE->UserTreeIndices,
6159 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6160 return false;
6161 // Add the node to the list of the ordered nodes with the identity
6162 // order.
6163 Edges.emplace_back(I, TE);
6164 // Add ScatterVectorize nodes to the list of operands, where just
6165 // reordering of the scalars is required. Similar to the gathers, so
6166 // simply add to the list of gathered ops.
6167 // If there are reused scalars, process this node as a regular vectorize
6168 // node, just reorder reuses mask.
6169 if (TE->State != TreeEntry::Vectorize &&
6170 TE->State != TreeEntry::StridedVectorize &&
6171 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6172 GatherOps.push_back(TE);
6173 continue;
6174 }
6175 TreeEntry *Gather = nullptr;
6176 if (count_if(ReorderableGathers,
6177 [&Gather, UserTE, I](TreeEntry *TE) {
6178 assert(TE->State != TreeEntry::Vectorize &&
6179 TE->State != TreeEntry::StridedVectorize &&
6180 "Only non-vectorized nodes are expected.");
6181 if (any_of(TE->UserTreeIndices,
6182 [UserTE, I](const EdgeInfo &EI) {
6183 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6184 })) {
6185 assert(TE->isSame(UserTE->getOperand(I)) &&
6186 "Operand entry does not match operands.");
6187 Gather = TE;
6188 return true;
6189 }
6190 return false;
6191 }) > 1 &&
6192 !allConstant(UserTE->getOperand(I)))
6193 return false;
6194 if (Gather)
6195 GatherOps.push_back(Gather);
6196 }
6197 return true;
6198}
6199
6200void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6201 SetVector<TreeEntry *> OrderedEntries;
6202 DenseSet<const TreeEntry *> GathersToOrders;
6203 // Find all reorderable leaf nodes with the given VF.
6204 // Currently the are vectorized loads,extracts without alternate operands +
6205 // some gathering of extracts.
6206 SmallVector<TreeEntry *> NonVectorized;
6207 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6208 if (TE->State != TreeEntry::Vectorize &&
6209 TE->State != TreeEntry::StridedVectorize)
6210 NonVectorized.push_back(TE.get());
6211 if (std::optional<OrdersType> CurrentOrder =
6212 getReorderingData(*TE, /*TopToBottom=*/false)) {
6213 OrderedEntries.insert(TE.get());
6214 if (!(TE->State == TreeEntry::Vectorize ||
6215 TE->State == TreeEntry::StridedVectorize) ||
6216 !TE->ReuseShuffleIndices.empty())
6217 GathersToOrders.insert(TE.get());
6218 }
6219 }
6220
6221 // 1. Propagate order to the graph nodes, which use only reordered nodes.
6222 // I.e., if the node has operands, that are reordered, try to make at least
6223 // one operand order in the natural order and reorder others + reorder the
6224 // user node itself.
6226 while (!OrderedEntries.empty()) {
6227 // 1. Filter out only reordered nodes.
6228 // 2. If the entry has multiple uses - skip it and jump to the next node.
6230 SmallVector<TreeEntry *> Filtered;
6231 for (TreeEntry *TE : OrderedEntries) {
6232 if (!(TE->State == TreeEntry::Vectorize ||
6233 TE->State == TreeEntry::StridedVectorize ||
6234 (TE->isGather() && GathersToOrders.contains(TE))) ||
6235 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6236 !all_of(drop_begin(TE->UserTreeIndices),
6237 [TE](const EdgeInfo &EI) {
6238 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6239 }) ||
6240 !Visited.insert(TE).second) {
6241 Filtered.push_back(TE);
6242 continue;
6243 }
6244 // Build a map between user nodes and their operands order to speedup
6245 // search. The graph currently does not provide this dependency directly.
6246 for (EdgeInfo &EI : TE->UserTreeIndices)
6247 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6248 }
6249 // Erase filtered entries.
6250 for (TreeEntry *TE : Filtered)
6251 OrderedEntries.remove(TE);
6253 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6254 UsersVec(Users.begin(), Users.end());
6255 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6256 return Data1.first->Idx > Data2.first->Idx;
6257 });
6258 for (auto &Data : UsersVec) {
6259 // Check that operands are used only in the User node.
6260 SmallVector<TreeEntry *> GatherOps;
6261 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6262 GatherOps)) {
6263 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6264 OrderedEntries.remove(Op.second);
6265 continue;
6266 }
6267 // All operands are reordered and used only in this node - propagate the
6268 // most used order to the user node.
6271 OrdersUses;
6272 // Do the analysis for each tree entry only once, otherwise the order of
6273 // the same node my be considered several times, though might be not
6274 // profitable.
6277 for (const auto &Op : Data.second) {
6278 TreeEntry *OpTE = Op.second;
6279 if (!VisitedOps.insert(OpTE).second)
6280 continue;
6281 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6282 continue;
6283 const auto Order = [&]() -> const OrdersType {
6284 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6285 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6286 .value_or(OrdersType(1));
6287 return OpTE->ReorderIndices;
6288 }();
6289 // The order is partially ordered, skip it in favor of fully non-ordered
6290 // orders.
6291 if (Order.size() == 1)
6292 continue;
6293 unsigned NumOps = count_if(
6294 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6295 return P.second == OpTE;
6296 });
6297 // Stores actually store the mask, not the order, need to invert.
6298 if (OpTE->State == TreeEntry::Vectorize &&
6299 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6300 assert(!OpTE->isAltShuffle() &&
6301 "Alternate instructions are only supported by BinaryOperator "
6302 "and CastInst.");
6303 SmallVector<int> Mask;
6304 inversePermutation(Order, Mask);
6305 unsigned E = Order.size();
6306 OrdersType CurrentOrder(E, E);
6307 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6308 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6309 });
6310 fixupOrderingIndices(CurrentOrder);
6311 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6312 NumOps;
6313 } else {
6314 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6315 }
6316 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6317 const auto AllowsReordering = [&](const TreeEntry *TE) {
6318 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6319 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6320 (IgnoreReorder && TE->Idx == 0))
6321 return true;
6322 if (TE->isGather()) {
6323 if (GathersToOrders.contains(TE))
6324 return !getReorderingData(*TE, /*TopToBottom=*/false)
6325 .value_or(OrdersType(1))
6326 .empty();
6327 return true;
6328 }
6329 return false;
6330 };
6331 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6332 TreeEntry *UserTE = EI.UserTE;
6333 if (!VisitedUsers.insert(UserTE).second)
6334 continue;
6335 // May reorder user node if it requires reordering, has reused
6336 // scalars, is an alternate op vectorize node or its op nodes require
6337 // reordering.
6338 if (AllowsReordering(UserTE))
6339 continue;
6340 // Check if users allow reordering.
6341 // Currently look up just 1 level of operands to avoid increase of
6342 // the compile time.
6343 // Profitable to reorder if definitely more operands allow
6344 // reordering rather than those with natural order.
6346 if (static_cast<unsigned>(count_if(
6347 Ops, [UserTE, &AllowsReordering](
6348 const std::pair<unsigned, TreeEntry *> &Op) {
6349 return AllowsReordering(Op.second) &&
6350 all_of(Op.second->UserTreeIndices,
6351 [UserTE](const EdgeInfo &EI) {
6352 return EI.UserTE == UserTE;
6353 });
6354 })) <= Ops.size() / 2)
6355 ++Res.first->second;
6356 }
6357 }
6358 if (OrdersUses.empty()) {
6359 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6360 OrderedEntries.remove(Op.second);
6361 continue;
6362 }
6363 // Choose the most used order.
6364 unsigned IdentityCnt = 0;
6365 unsigned VF = Data.second.front().second->getVectorFactor();
6366 OrdersType IdentityOrder(VF, VF);
6367 for (auto &Pair : OrdersUses) {
6368 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6369 IdentityCnt += Pair.second;
6370 combineOrders(IdentityOrder, Pair.first);
6371 }
6372 }
6373 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6374 unsigned Cnt = IdentityCnt;
6375 for (auto &Pair : OrdersUses) {
6376 // Prefer identity order. But, if filled identity found (non-empty
6377 // order) with same number of uses, as the new candidate order, we can
6378 // choose this candidate order.
6379 if (Cnt < Pair.second) {
6380 combineOrders(Pair.first, BestOrder);
6381 BestOrder = Pair.first;
6382 Cnt = Pair.second;
6383 } else {
6384 combineOrders(BestOrder, Pair.first);
6385 }
6386 }
6387 // Set order of the user node.
6388 if (isIdentityOrder(BestOrder)) {
6389 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6390 OrderedEntries.remove(Op.second);
6391 continue;
6392 }
6393 fixupOrderingIndices(BestOrder);
6394 // Erase operands from OrderedEntries list and adjust their orders.
6395 VisitedOps.clear();
6396 SmallVector<int> Mask;
6397 inversePermutation(BestOrder, Mask);
6398 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6399 unsigned E = BestOrder.size();
6400 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6401 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6402 });
6403 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6404 TreeEntry *TE = Op.second;
6405 OrderedEntries.remove(TE);
6406 if (!VisitedOps.insert(TE).second)
6407 continue;
6408 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6409 reorderNodeWithReuses(*TE, Mask);
6410 continue;
6411 }
6412 // Gathers are processed separately.
6413 if (TE->State != TreeEntry::Vectorize &&
6414 TE->State != TreeEntry::StridedVectorize &&
6415 (TE->State != TreeEntry::ScatterVectorize ||
6416 TE->ReorderIndices.empty()))
6417 continue;
6418 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6419 TE->ReorderIndices.empty()) &&
6420 "Non-matching sizes of user/operand entries.");
6421 reorderOrder(TE->ReorderIndices, Mask);
6422 if (IgnoreReorder && TE == VectorizableTree.front().get())
6423 IgnoreReorder = false;
6424 }
6425 // For gathers just need to reorder its scalars.
6426 for (TreeEntry *Gather : GatherOps) {
6427 assert(Gather->ReorderIndices.empty() &&
6428 "Unexpected reordering of gathers.");
6429 if (!Gather->ReuseShuffleIndices.empty()) {
6430 // Just reorder reuses indices.
6431 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6432 continue;
6433 }
6434 reorderScalars(Gather->Scalars, Mask);
6435 OrderedEntries.remove(Gather);
6436 }
6437 // Reorder operands of the user node and set the ordering for the user
6438 // node itself.
6439 if (Data.first->State != TreeEntry::Vectorize ||
6440 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6441 Data.first->getMainOp()) ||
6442 Data.first->isAltShuffle())
6443 Data.first->reorderOperands(Mask);
6444 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6445 Data.first->isAltShuffle() ||
6446 Data.first->State == TreeEntry::StridedVectorize) {
6447 reorderScalars(Data.first->Scalars, Mask);
6448 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6449 /*BottomOrder=*/true);
6450 if (Data.first->ReuseShuffleIndices.empty() &&
6451 !Data.first->ReorderIndices.empty() &&
6452 !Data.first->isAltShuffle()) {
6453 // Insert user node to the list to try to sink reordering deeper in
6454 // the graph.
6455 OrderedEntries.insert(Data.first);
6456 }
6457 } else {
6458 reorderOrder(Data.first->ReorderIndices, Mask);
6459 }
6460 }
6461 }
6462 // If the reordering is unnecessary, just remove the reorder.
6463 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6464 VectorizableTree.front()->ReuseShuffleIndices.empty())
6465 VectorizableTree.front()->ReorderIndices.clear();
6466}
6467
6468Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6469 if ((Entry.getOpcode() == Instruction::Store ||
6470 Entry.getOpcode() == Instruction::Load) &&
6471 Entry.State == TreeEntry::StridedVectorize &&
6472 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6473 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6474 return dyn_cast<Instruction>(Entry.Scalars.front());
6475}
6476
6478 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6479 DenseMap<Value *, unsigned> ScalarToExtUses;
6480 // Collect the values that we need to extract from the tree.
6481 for (auto &TEPtr : VectorizableTree) {
6482 TreeEntry *Entry = TEPtr.get();
6483
6484 // No need to handle users of gathered values.
6485 if (Entry->isGather())
6486 continue;
6487
6488 // For each lane:
6489 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6490 Value *Scalar = Entry->Scalars[Lane];
6491 if (!isa<Instruction>(Scalar))
6492 continue;
6493 // All uses must be replaced already? No need to do it again.
6494 auto It = ScalarToExtUses.find(Scalar);
6495 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6496 continue;
6497
6498 // Check if the scalar is externally used as an extra arg.
6499 const auto ExtI = ExternallyUsedValues.find(Scalar);
6500 if (ExtI != ExternallyUsedValues.end()) {
6501 int FoundLane = Entry->findLaneForValue(Scalar);
6502 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6503 << FoundLane << " from " << *Scalar << ".\n");
6504 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6505 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6506 continue;
6507 }
6508 for (User *U : Scalar->users()) {
6509 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6510
6511 Instruction *UserInst = dyn_cast<Instruction>(U);
6512 if (!UserInst || isDeleted(UserInst))
6513 continue;
6514
6515 // Ignore users in the user ignore list.
6516 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6517 continue;
6518
6519 // Skip in-tree scalars that become vectors
6520 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6521 // Some in-tree scalars will remain as scalar in vectorized
6522 // instructions. If that is the case, the one in FoundLane will
6523 // be used.
6524 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6526 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6527 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6528 << ".\n");
6529 assert(!UseEntry->isGather() && "Bad state");
6530 continue;
6531 }
6532 U = nullptr;
6533 if (It != ScalarToExtUses.end()) {
6534 ExternalUses[It->second].User = nullptr;
6535 break;
6536 }
6537 }
6538
6539 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6540 U = nullptr;
6541 int FoundLane = Entry->findLaneForValue(Scalar);
6542 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6543 << " from lane " << FoundLane << " from " << *Scalar
6544 << ".\n");
6545 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6546 ExternalUses.emplace_back(Scalar, U, FoundLane);
6547 if (!U)
6548 break;
6549 }
6550 }
6551 }
6552}
6553
6555BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6558 PtrToStoresMap;
6559 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6560 Value *V = TE->Scalars[Lane];
6561 // Don't iterate over the users of constant data.
6562 if (!isa<Instruction>(V))
6563 continue;
6564 // To save compilation time we don't visit if we have too many users.
6565 if (V->hasNUsesOrMore(UsesLimit))
6566 break;
6567
6568 // Collect stores per pointer object.
6569 for (User *U : V->users()) {
6570 auto *SI = dyn_cast<StoreInst>(U);
6571 // Test whether we can handle the store. V might be a global, which could
6572 // be used in a different function.
6573 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6574 !isValidElementType(SI->getValueOperand()->getType()))
6575 continue;
6576 // Skip entry if already
6577 if (getTreeEntry(U))
6578 continue;
6579
6580 Value *Ptr =
6581 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6582 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6583 SI->getValueOperand()->getType(), Ptr}];
6584 // For now just keep one store per pointer object per lane.
6585 // TODO: Extend this to support multiple stores per pointer per lane
6586 if (StoresVec.size() > Lane)
6587 continue;
6588 if (!StoresVec.empty()) {
6589 std::optional<int> Diff = getPointersDiff(
6590 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6591 SI->getValueOperand()->getType(),
6592 StoresVec.front()->getPointerOperand(), *DL, *SE,
6593 /*StrictCheck=*/true);
6594 // We failed to compare the pointers so just abandon this store.
6595 if (!Diff)
6596 continue;
6597 }
6598 StoresVec.push_back(SI);
6599 }
6600 }
6601 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6602 unsigned I = 0;
6603 for (auto &P : PtrToStoresMap) {
6604 Res[I].swap(P.second);
6605 ++I;
6606 }
6607 return Res;
6608}
6609
6610bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6611 OrdersType &ReorderIndices) const {
6612 // We check whether the stores in StoreVec can form a vector by sorting them
6613 // and checking whether they are consecutive.
6614
6615 // To avoid calling getPointersDiff() while sorting we create a vector of
6616 // pairs {store, offset from first} and sort this instead.
6618 StoreInst *S0 = StoresVec[0];
6619 StoreOffsetVec.emplace_back(0, 0);
6620 Type *S0Ty = S0->getValueOperand()->getType();
6621 Value *S0Ptr = S0->getPointerOperand();
6622 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6623 StoreInst *SI = StoresVec[Idx];
6624 std::optional<int> Diff =
6625 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6626 SI->getPointerOperand(), *DL, *SE,
6627 /*StrictCheck=*/true);
6628 StoreOffsetVec.emplace_back(*Diff, Idx);
6629 }
6630
6631 // Check if the stores are consecutive by checking if their difference is 1.
6632 if (StoreOffsetVec.size() != StoresVec.size())
6633 return false;
6634 sort(StoreOffsetVec,
6635 [](const std::pair<int, unsigned> &L,
6636 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6637 unsigned Idx = 0;
6638 int PrevDist = 0;
6639 for (const auto &P : StoreOffsetVec) {
6640 if (Idx > 0 && P.first != PrevDist + 1)
6641 return false;
6642 PrevDist = P.first;
6643 ++Idx;
6644 }
6645
6646 // Calculate the shuffle indices according to their offset against the sorted
6647 // StoreOffsetVec.
6648 ReorderIndices.assign(StoresVec.size(), 0);
6649 bool IsIdentity = true;
6650 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6651 ReorderIndices[P.second] = I;
6652 IsIdentity &= P.second == I;
6653 }
6654 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6655 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6656 // same convention here.
6657 if (IsIdentity)
6658 ReorderIndices.clear();
6659
6660 return true;
6661}
6662
6663#ifndef NDEBUG
6665 for (unsigned Idx : Order)
6666 dbgs() << Idx << ", ";
6667 dbgs() << "\n";
6668}
6669#endif
6670
6672BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6673 unsigned NumLanes = TE->Scalars.size();
6674
6675 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6676
6677 // Holds the reorder indices for each candidate store vector that is a user of
6678 // the current TreeEntry.
6679 SmallVector<OrdersType, 1> ExternalReorderIndices;
6680
6681 // Now inspect the stores collected per pointer and look for vectorization
6682 // candidates. For each candidate calculate the reorder index vector and push
6683 // it into `ExternalReorderIndices`
6684 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6685 // If we have fewer than NumLanes stores, then we can't form a vector.
6686 if (StoresVec.size() != NumLanes)
6687 continue;
6688
6689 // If the stores are not consecutive then abandon this StoresVec.
6690 OrdersType ReorderIndices;
6691 if (!canFormVector(StoresVec, ReorderIndices))
6692 continue;
6693
6694 // We now know that the scalars in StoresVec can form a vector instruction,
6695 // so set the reorder indices.
6696 ExternalReorderIndices.push_back(ReorderIndices);
6697 }
6698 return ExternalReorderIndices;
6699}
6700
6702 const SmallDenseSet<Value *> &UserIgnoreLst) {
6703 deleteTree();
6704 UserIgnoreList = &UserIgnoreLst;
6705 if (!allSameType(Roots))
6706 return;
6707 buildTree_rec(Roots, 0, EdgeInfo());
6708}
6709
6711 deleteTree();
6712 if (!allSameType(Roots))
6713 return;
6714 buildTree_rec(Roots, 0, EdgeInfo());
6715}
6716
6717/// Tries to find subvector of loads and builds new vector of only loads if can
6718/// be profitable.
6720 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6722 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6723 bool AddNew = true) {
6724 if (VL.empty())
6725 return;
6726 Type *ScalarTy = getValueType(VL.front());
6727 if (!isValidElementType(ScalarTy))
6728 return;
6730 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6731 for (Value *V : VL) {
6732 auto *LI = dyn_cast<LoadInst>(V);
6733 if (!LI)
6734 continue;
6735 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6736 continue;
6737 bool IsFound = false;
6738 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6739 assert(LI->getParent() == Data.front().first->getParent() &&
6740 LI->getType() == Data.front().first->getType() &&
6741 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6742 getUnderlyingObject(Data.front().first->getPointerOperand(),
6744 "Expected loads with the same type, same parent and same "
6745 "underlying pointer.");
6746 std::optional<int> Dist = getPointersDiff(
6747 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6748 Data.front().first->getPointerOperand(), DL, SE,
6749 /*StrictCheck=*/true);
6750 if (!Dist)
6751 continue;
6752 auto It = Map.find(*Dist);
6753 if (It != Map.end() && It->second != LI)
6754 continue;
6755 if (It == Map.end()) {
6756 Data.emplace_back(LI, *Dist);
6757 Map.try_emplace(*Dist, LI);
6758 }
6759 IsFound = true;
6760 break;
6761 }
6762 if (!IsFound) {
6763 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6764 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6765 }
6766 }
6767 auto FindMatchingLoads =
6770 &GatheredLoads,
6771 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6772 int &Offset, unsigned &Start) {
6773 if (Loads.empty())
6774 return GatheredLoads.end();
6776 LoadInst *LI = Loads.front().first;
6777 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6778 if (Idx < Start)
6779 continue;
6780 ToAdd.clear();
6781 if (LI->getParent() != Data.front().first->getParent() ||
6782 LI->getType() != Data.front().first->getType())
6783 continue;
6784 std::optional<int> Dist =
6786 Data.front().first->getType(),
6787 Data.front().first->getPointerOperand(), DL, SE,
6788 /*StrictCheck=*/true);
6789 if (!Dist)
6790 continue;
6791 SmallSet<int, 4> DataDists;
6793 for (std::pair<LoadInst *, int> P : Data) {
6794 DataDists.insert(P.second);
6795 DataLoads.insert(P.first);
6796 }
6797 // Found matching gathered loads - check if all loads are unique or
6798 // can be effectively vectorized.
6799 unsigned NumUniques = 0;
6800 for (auto [Cnt, Pair] : enumerate(Loads)) {
6801 bool Used = DataLoads.contains(Pair.first);
6802 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6803 ++NumUniques;
6804 ToAdd.insert(Cnt);
6805 } else if (Used) {
6806 Repeated.insert(Cnt);
6807 }
6808 }
6809 if (NumUniques > 0 &&
6810 (Loads.size() == NumUniques ||
6811 (Loads.size() - NumUniques >= 2 &&
6812 Loads.size() - NumUniques >= Loads.size() / 2 &&
6813 (has_single_bit(Data.size() + NumUniques) ||
6814 bit_ceil(Data.size()) <
6815 bit_ceil(Data.size() + NumUniques))))) {
6816 Offset = *Dist;
6817 Start = Idx + 1;
6818 return std::next(GatheredLoads.begin(), Idx);
6819 }
6820 }
6821 ToAdd.clear();
6822 return GatheredLoads.end();
6823 };
6824 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6825 unsigned Start = 0;
6826 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6827 int Offset = 0;
6828 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6829 Offset, Start);
6830 while (It != GatheredLoads.end()) {
6831 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6832 for (unsigned Idx : LocalToAdd)
6833 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6834 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6835 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6836 Start);
6837 }
6838 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6839 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6840 })) {
6841 auto AddNewLoads =
6843 for (unsigned Idx : seq<unsigned>(Data.size())) {
6844 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6845 continue;
6846 Loads.push_back(Data[Idx]);
6847 }
6848 };
6849 if (!AddNew) {
6850 LoadInst *LI = Data.front().first;
6851 It = find_if(
6852 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6853 return PD.front().first->getParent() == LI->getParent() &&
6854 PD.front().first->getType() == LI->getType();
6855 });
6856 while (It != GatheredLoads.end()) {
6857 AddNewLoads(*It);
6858 It = std::find_if(
6859 std::next(It), GatheredLoads.end(),
6860 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6861 return PD.front().first->getParent() == LI->getParent() &&
6862 PD.front().first->getType() == LI->getType();
6863 });
6864 }
6865 }
6866 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6867 AddNewLoads(GatheredLoads.emplace_back());
6868 }
6869 }
6870}
6871
6872void BoUpSLP::tryToVectorizeGatheredLoads(
6873 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6874 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6875 8> &GatheredLoads) {
6876 GatheredLoadsEntriesFirst = VectorizableTree.size();
6877
6878 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6879 LoadEntriesToVectorize.size());
6880 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6881 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6882 VectorizableTree[Idx]->Scalars.end());
6883
6884 // Sort loads by distance.
6885 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6886 const std::pair<LoadInst *, int> &L2) {
6887 return L1.second > L2.second;
6888 };
6889
6890 auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
6891 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6892 Loads.size());
6893 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6894 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6895 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6896 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6897 };
6898
6899 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6900 BoUpSLP::ValueSet &VectorizedLoads,
6901 SmallVectorImpl<LoadInst *> &NonVectorized,
6902 bool Final, unsigned MaxVF) {
6904 unsigned StartIdx = 0;
6905 SmallVector<int> CandidateVFs;
6906 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6907 CandidateVFs.push_back(MaxVF);
6908 for (int NumElts = getFloorFullVectorNumberOfElements(
6909 *TTI, Loads.front()->getType(), MaxVF);
6910 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6911 *TTI, Loads.front()->getType(), NumElts - 1)) {
6912 CandidateVFs.push_back(NumElts);
6913 if (VectorizeNonPowerOf2 && NumElts > 2)
6914 CandidateVFs.push_back(NumElts - 1);
6915 }
6916
6917 if (Final && CandidateVFs.empty())
6918 return Results;
6919
6920 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6921 for (unsigned NumElts : CandidateVFs) {
6922 if (Final && NumElts > BestVF)
6923 continue;
6924 SmallVector<unsigned> MaskedGatherVectorized;
6925 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6926 ++Cnt) {
6927 ArrayRef<LoadInst *> Slice =
6928 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6929 if (VectorizedLoads.count(Slice.front()) ||
6930 VectorizedLoads.count(Slice.back()) ||
6932 continue;
6933 // Check if it is profitable to try vectorizing gathered loads. It is
6934 // profitable if we have more than 3 consecutive loads or if we have
6935 // less but all users are vectorized or deleted.
6936 bool AllowToVectorize = false;
6937 // Check if it is profitable to vectorize 2-elements loads.
6938 if (NumElts == 2) {
6939 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6940 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6941 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6942 for (LoadInst *LI : Slice) {
6943 // If single use/user - allow to vectorize.
6944 if (LI->hasOneUse())
6945 continue;
6946 // 1. Check if number of uses equals number of users.
6947 // 2. All users are deleted.
6948 // 3. The load broadcasts are not allowed or the load is not
6949 // broadcasted.
6950 if (static_cast<unsigned int>(std::distance(
6951 LI->user_begin(), LI->user_end())) != LI->getNumUses())
6952 return false;
6953 if (!IsLegalBroadcastLoad)
6954 continue;
6955 if (LI->hasNUsesOrMore(UsesLimit))
6956 return false;
6957 for (User *U : LI->users()) {
6958 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
6959 continue;
6960 if (const TreeEntry *UTE = getTreeEntry(U)) {
6961 for (int I : seq<int>(UTE->getNumOperands())) {
6962 if (all_of(UTE->getOperand(I),
6963 [LI](Value *V) { return V == LI; }))
6964 // Found legal broadcast - do not vectorize.
6965 return false;
6966 }
6967 }
6968 }
6969 }
6970 return true;
6971 };
6972 AllowToVectorize = CheckIfAllowed(Slice);
6973 } else {
6974 AllowToVectorize =
6975 (NumElts >= 3 ||
6976 any_of(ValueToGatherNodes.at(Slice.front()),
6977 [=](const TreeEntry *TE) {
6978 return TE->Scalars.size() == 2 &&
6979 ((TE->Scalars.front() == Slice.front() &&
6980 TE->Scalars.back() == Slice.back()) ||
6981 (TE->Scalars.front() == Slice.back() &&
6982 TE->Scalars.back() == Slice.front()));
6983 })) &&
6984 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
6985 Slice.size());
6986 }
6987 if (AllowToVectorize) {
6988 SmallVector<Value *> PointerOps;
6989 OrdersType CurrentOrder;
6990 // Try to build vector load.
6991 ArrayRef<Value *> Values(
6992 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
6993 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
6994 PointerOps, &BestVF);
6995 if (LS != LoadsState::Gather ||
6996 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
6997 if (LS == LoadsState::ScatterVectorize) {
6998 if (MaskedGatherVectorized.empty() ||
6999 Cnt >= MaskedGatherVectorized.back() + NumElts)
7000 MaskedGatherVectorized.push_back(Cnt);
7001 continue;
7002 }
7003 if (LS != LoadsState::Gather) {
7004 Results.emplace_back(Values, LS);
7005 VectorizedLoads.insert(Slice.begin(), Slice.end());
7006 // If we vectorized initial block, no need to try to vectorize it
7007 // again.
7008 if (Cnt == StartIdx)
7009 StartIdx += NumElts;
7010 }
7011 // Check if the whole array was vectorized already - exit.
7012 if (StartIdx >= Loads.size())
7013 break;
7014 // Erase last masked gather candidate, if another candidate within
7015 // the range is found to be better.
7016 if (!MaskedGatherVectorized.empty() &&
7017 Cnt < MaskedGatherVectorized.back() + NumElts)
7018 MaskedGatherVectorized.pop_back();
7019 Cnt += NumElts - 1;
7020 continue;
7021 }
7022 }
7023 if (!AllowToVectorize || BestVF == 0)
7025 }
7026 // Mark masked gathers candidates as vectorized, if any.
7027 for (unsigned Cnt : MaskedGatherVectorized) {
7028 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7029 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7030 ArrayRef<Value *> Values(
7031 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7032 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7033 VectorizedLoads.insert(Slice.begin(), Slice.end());
7034 // If we vectorized initial block, no need to try to vectorize it again.
7035 if (Cnt == StartIdx)
7036 StartIdx += NumElts;
7037 }
7038 }
7039 for (LoadInst *LI : Loads) {
7040 if (!VectorizedLoads.contains(LI))
7041 NonVectorized.push_back(LI);
7042 }
7043 return Results;
7044 };
7045 auto ProcessGatheredLoads =
7046 [&, &TTI = *TTI](
7048 bool Final = false) {
7049 SmallVector<LoadInst *> NonVectorized;
7050 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7051 if (LoadsDists.size() <= 1) {
7052 NonVectorized.push_back(LoadsDists.back().first);
7053 continue;
7054 }
7055 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7056 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7057 transform(
7058 LoadsDists, OriginalLoads.begin(),
7059 [](const std::pair<LoadInst *, int> &L) { return L.first; });
7060 stable_sort(LocalLoadsDists, LoadSorter);
7062 unsigned MaxConsecutiveDistance = 0;
7063 unsigned CurrentConsecutiveDist = 1;
7064 int LastDist = LocalLoadsDists.front().second;
7065 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7066 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7067 if (getTreeEntry(L.first))
7068 continue;
7069 assert(LastDist >= L.second &&
7070 "Expected first distance always not less than second");
7071 if (static_cast<unsigned>(LastDist - L.second) ==
7072 CurrentConsecutiveDist) {
7073 ++CurrentConsecutiveDist;
7074 MaxConsecutiveDistance =
7075 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7076 Loads.push_back(L.first);
7077 continue;
7078 }
7079 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7080 !Loads.empty())
7081 Loads.pop_back();
7082 CurrentConsecutiveDist = 1;
7083 LastDist = L.second;
7084 Loads.push_back(L.first);
7085 }
7086 if (Loads.size() <= 1)
7087 continue;
7088 if (AllowMaskedGather)
7089 MaxConsecutiveDistance = Loads.size();
7090 else if (MaxConsecutiveDistance < 2)
7091 continue;
7092 BoUpSLP::ValueSet VectorizedLoads;
7093 SmallVector<LoadInst *> SortedNonVectorized;
7095 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7096 Final, MaxConsecutiveDistance);
7097 if (!Results.empty() && !SortedNonVectorized.empty() &&
7098 OriginalLoads.size() == Loads.size() &&
7099 MaxConsecutiveDistance == Loads.size() &&
7101 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7102 return P.second == LoadsState::ScatterVectorize;
7103 })) {
7104 VectorizedLoads.clear();
7105 SmallVector<LoadInst *> UnsortedNonVectorized;
7107 UnsortedResults =
7108 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7109 UnsortedNonVectorized, Final,
7110 OriginalLoads.size());
7111 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7112 SortedNonVectorized.swap(UnsortedNonVectorized);
7113 Results.swap(UnsortedResults);
7114 }
7115 }
7116 for (auto [Slice, _] : Results) {
7117 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7118 << Slice.size() << ")\n");
7119 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7120 for (Value *L : Slice)
7121 if (!getTreeEntry(L))
7122 SortedNonVectorized.push_back(cast<LoadInst>(L));
7123 continue;
7124 }
7125
7126 // Select maximum VF as a maximum of user gathered nodes and
7127 // distance between scalar loads in these nodes.
7128 unsigned MaxVF = Slice.size();
7129 unsigned UserMaxVF = 0;
7130 unsigned InterleaveFactor = 0;
7131 if (MaxVF == 2) {
7132 UserMaxVF = MaxVF;
7133 } else {
7134 // Found distance between segments of the interleaved loads.
7135 std::optional<unsigned> InterleavedLoadsDistance = 0;
7136 unsigned Order = 0;
7137 std::optional<unsigned> CommonVF = 0;
7139 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7140 for (auto [Idx, V] : enumerate(Slice)) {
7141 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7142 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7143 unsigned Pos =
7144 EntryToPosition.try_emplace(E, Idx).first->second;
7145 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7146 if (CommonVF) {
7147 if (*CommonVF == 0) {
7148 CommonVF = E->Scalars.size();
7149 continue;
7150 }
7151 if (*CommonVF != E->Scalars.size())
7152 CommonVF.reset();
7153 }
7154 // Check if the load is the part of the interleaved load.
7155 if (Pos != Idx && InterleavedLoadsDistance) {
7156 if (!DeinterleavedNodes.contains(E) &&
7157 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7158 if (isa<Constant>(V))
7159 return false;
7160 if (getTreeEntry(V))
7161 return true;
7162 const auto &Nodes = ValueToGatherNodes.at(V);
7163 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7164 !is_contained(Slice, V);
7165 })) {
7166 InterleavedLoadsDistance.reset();
7167 continue;
7168 }
7169 DeinterleavedNodes.insert(E);
7170 if (*InterleavedLoadsDistance == 0) {
7171 InterleavedLoadsDistance = Idx - Pos;
7172 continue;
7173 }
7174 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7175 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7176 InterleavedLoadsDistance.reset();
7177 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7178 }
7179 }
7180 }
7181 DeinterleavedNodes.clear();
7182 // Check if the large load represents interleaved load operation.
7183 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7184 CommonVF.value_or(0) != 0) {
7185 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7186 unsigned VF = *CommonVF;
7187 OrdersType Order;
7188 SmallVector<Value *> PointerOps;
7189 // Segmented load detected - vectorize at maximum vector factor.
7190 if (InterleaveFactor <= Slice.size() &&
7192 getWidenedType(Slice.front()->getType(), VF),
7193 InterleaveFactor,
7194 cast<LoadInst>(Slice.front())->getAlign(),
7195 cast<LoadInst>(Slice.front())
7197 canVectorizeLoads(Slice, Slice.front(), Order,
7198 PointerOps) == LoadsState::Vectorize) {
7199 UserMaxVF = InterleaveFactor * VF;
7200 } else {
7201 InterleaveFactor = 0;
7202 }
7203 }
7204 // Cannot represent the loads as consecutive vectorizable nodes -
7205 // just exit.
7206 unsigned ConsecutiveNodesSize = 0;
7207 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7208 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7209 [&, Slice = Slice](const auto &P) {
7210 const auto *It = find_if(Slice, [&](Value *V) {
7211 return std::get<1>(P).contains(V);
7212 });
7213 if (It == Slice.end())
7214 return false;
7216 VectorizableTree[std::get<0>(P)]->Scalars;
7217 ConsecutiveNodesSize += VL.size();
7218 unsigned Start = std::distance(Slice.begin(), It);
7219 unsigned Sz = Slice.size() - Start;
7220 return Sz < VL.size() ||
7221 Slice.slice(std::distance(Slice.begin(), It),
7222 VL.size()) != VL;
7223 }))
7224 continue;
7225 // Try to build long masked gather loads.
7226 UserMaxVF = bit_ceil(UserMaxVF);
7227 if (InterleaveFactor == 0 &&
7228 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7229 [&, Slice = Slice](unsigned Idx) {
7230 OrdersType Order;
7231 SmallVector<Value *> PointerOps;
7232 return canVectorizeLoads(
7233 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7234 Slice[Idx * UserMaxVF], Order,
7235 PointerOps) ==
7236 LoadsState::ScatterVectorize;
7237 }))
7238 UserMaxVF = MaxVF;
7239 if (Slice.size() != ConsecutiveNodesSize)
7240 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7241 }
7242 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7243 bool IsVectorized = true;
7244 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7245 ArrayRef<Value *> SubSlice =
7246 Slice.slice(I, std::min(VF, E - I));
7247 if (getTreeEntry(SubSlice.front()))
7248 continue;
7249 // Check if the subslice is to be-vectorized entry, which is not
7250 // equal to entry.
7251 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7252 [&](const auto &P) {
7253 return !SubSlice.equals(
7254 VectorizableTree[std::get<0>(P)]
7255 ->Scalars) &&
7256 set_is_subset(SubSlice, std::get<1>(P));
7257 }))
7258 continue;
7259 unsigned Sz = VectorizableTree.size();
7260 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7261 if (Sz == VectorizableTree.size()) {
7262 IsVectorized = false;
7263 // Try non-interleaved vectorization with smaller vector
7264 // factor.
7265 if (InterleaveFactor > 0) {
7266 VF = 2 * (MaxVF / InterleaveFactor);
7267 InterleaveFactor = 0;
7268 }
7269 continue;
7270 }
7271 }
7272 if (IsVectorized)
7273 break;
7274 }
7275 }
7276 NonVectorized.append(SortedNonVectorized);
7277 }
7278 return NonVectorized;
7279 };
7280 for (const auto &GLs : GatheredLoads) {
7281 const auto &Ref = GLs.second;
7282 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7283 if (!Ref.empty() && !NonVectorized.empty() &&
7284 std::accumulate(
7285 Ref.begin(), Ref.end(), 0u,
7286 [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
7287 return S + LoadsDists.size();
7288 }) != NonVectorized.size() &&
7289 IsMaskedGatherSupported(NonVectorized)) {
7291 for (LoadInst *LI : NonVectorized) {
7292 // Reinsert non-vectorized loads to other list of loads with the same
7293 // base pointers.
7294 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7295 FinalGatheredLoads,
7296 /*AddNew=*/false);
7297 }
7298 // Final attempt to vectorize non-vectorized loads.
7299 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7300 }
7301 }
7302 // Try to vectorize postponed load entries, previously marked as gathered.
7303 for (unsigned Idx : LoadEntriesToVectorize) {
7304 const TreeEntry &E = *VectorizableTree[Idx];
7305 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7306 // Avoid reordering, if possible.
7307 if (!E.ReorderIndices.empty()) {
7308 // Build a mask out of the reorder indices and reorder scalars per this
7309 // mask.
7310 SmallVector<int> ReorderMask;
7311 inversePermutation(E.ReorderIndices, ReorderMask);
7312 reorderScalars(GatheredScalars, ReorderMask);
7313 }
7314 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7315 }
7316 // If no new entries created, consider it as no gathered loads entries must be
7317 // handled.
7318 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7319 VectorizableTree.size())
7320 GatheredLoadsEntriesFirst.reset();
7321}
7322
7323/// \return true if the specified list of values has only one instruction that
7324/// requires scheduling, false otherwise.
7325#ifndef NDEBUG
7327 Value *NeedsScheduling = nullptr;
7328 for (Value *V : VL) {
7330 continue;
7331 if (!NeedsScheduling) {
7332 NeedsScheduling = V;
7333 continue;
7334 }
7335 return false;
7336 }
7337 return NeedsScheduling;
7338}
7339#endif
7340
7341/// Generates key/subkey pair for the given value to provide effective sorting
7342/// of the values and better detection of the vectorizable values sequences. The
7343/// keys/subkeys can be used for better sorting of the values themselves (keys)
7344/// and in values subgroups (subkeys).
7345static std::pair<size_t, size_t> generateKeySubkey(
7346 Value *V, const TargetLibraryInfo *TLI,
7347 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7348 bool AllowAlternate) {
7349 hash_code Key = hash_value(V->getValueID() + 2);
7350 hash_code SubKey = hash_value(0);
7351 // Sort the loads by the distance between the pointers.
7352 if (auto *LI = dyn_cast<LoadInst>(V)) {
7353 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7354 if (LI->isSimple())
7355 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7356 else
7357 Key = SubKey = hash_value(LI);
7358 } else if (isVectorLikeInstWithConstOps(V)) {
7359 // Sort extracts by the vector operands.
7360 if (isa<ExtractElementInst, UndefValue>(V))
7361 Key = hash_value(Value::UndefValueVal + 1);
7362 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7363 if (!isUndefVector(EI->getVectorOperand()).all() &&
7364 !isa<UndefValue>(EI->getIndexOperand()))
7365 SubKey = hash_value(EI->getVectorOperand());
7366 }
7367 } else if (auto *I = dyn_cast<Instruction>(V)) {
7368 // Sort other instructions just by the opcodes except for CMPInst.
7369 // For CMP also sort by the predicate kind.
7370 if ((isa<BinaryOperator, CastInst>(I)) &&
7371 isValidForAlternation(I->getOpcode())) {
7372 if (AllowAlternate)
7373 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7374 else
7375 Key = hash_combine(hash_value(I->getOpcode()), Key);
7376 SubKey = hash_combine(
7377 hash_value(I->getOpcode()), hash_value(I->getType()),
7378 hash_value(isa<BinaryOperator>(I)
7379 ? I->getType()
7380 : cast<CastInst>(I)->getOperand(0)->getType()));
7381 // For casts, look through the only operand to improve compile time.
7382 if (isa<CastInst>(I)) {
7383 std::pair<size_t, size_t> OpVals =
7384 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7385 /*AllowAlternate=*/true);
7386 Key = hash_combine(OpVals.first, Key);
7387 SubKey = hash_combine(OpVals.first, SubKey);
7388 }
7389 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7390 CmpInst::Predicate Pred = CI->getPredicate();
7391 if (CI->isCommutative())
7392 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7394 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7395 hash_value(SwapPred),
7396 hash_value(CI->getOperand(0)->getType()));
7397 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7400 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7401 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7402 SubKey = hash_combine(hash_value(I->getOpcode()),
7403 hash_value(Call->getCalledFunction()));
7404 } else {
7405 Key = hash_combine(hash_value(Call), Key);
7406 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7407 }
7408 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7409 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7410 hash_value(Op.Tag), SubKey);
7411 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7412 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7413 SubKey = hash_value(Gep->getPointerOperand());
7414 else
7415 SubKey = hash_value(Gep);
7416 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7417 !isa<ConstantInt>(I->getOperand(1))) {
7418 // Do not try to vectorize instructions with potentially high cost.
7419 SubKey = hash_value(I);
7420 } else {
7421 SubKey = hash_value(I->getOpcode());
7422 }
7423 Key = hash_combine(hash_value(I->getParent()), Key);
7424 }
7425 return std::make_pair(Key, SubKey);
7426}
7427
7428/// Checks if the specified instruction \p I is an alternate operation for
7429/// the given \p MainOp and \p AltOp instructions.
7430static bool isAlternateInstruction(const Instruction *I,
7431 const Instruction *MainOp,
7432 const Instruction *AltOp,
7433 const TargetLibraryInfo &TLI);
7434
7435bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7436 ArrayRef<Value *> VL) const {
7437 unsigned Opcode0 = S.getOpcode();
7438 unsigned Opcode1 = S.getAltOpcode();
7439 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7440 // If this pattern is supported by the target then consider it profitable.
7441 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7442 Opcode0, Opcode1, OpcodeMask))
7443 return true;
7445 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7446 Operands.emplace_back();
7447 // Prepare the operand vector.
7448 for (Value *V : VL) {
7449 if (isa<PoisonValue>(V)) {
7450 Operands.back().push_back(
7451 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7452 continue;
7453 }
7454 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7455 }
7456 }
7457 if (Operands.size() == 2) {
7458 // Try find best operands candidates.
7459 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7461 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7462 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7463 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7464 std::optional<int> Res = findBestRootPair(Candidates);
7465 switch (Res.value_or(0)) {
7466 case 0:
7467 break;
7468 case 1:
7469 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7470 break;
7471 case 2:
7472 std::swap(Operands[0][I], Operands[1][I]);
7473 break;
7474 default:
7475 llvm_unreachable("Unexpected index.");
7476 }
7477 }
7478 }
7479 DenseSet<unsigned> UniqueOpcodes;
7480 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7481 unsigned NonInstCnt = 0;
7482 // Estimate number of instructions, required for the vectorized node and for
7483 // the buildvector node.
7484 unsigned UndefCnt = 0;
7485 // Count the number of extra shuffles, required for vector nodes.
7486 unsigned ExtraShuffleInsts = 0;
7487 // Check that operands do not contain same values and create either perfect
7488 // diamond match or shuffled match.
7489 if (Operands.size() == 2) {
7490 // Do not count same operands twice.
7491 if (Operands.front() == Operands.back()) {
7492 Operands.erase(Operands.begin());
7493 } else if (!allConstant(Operands.front()) &&
7494 all_of(Operands.front(), [&](Value *V) {
7495 return is_contained(Operands.back(), V);
7496 })) {
7497 Operands.erase(Operands.begin());
7498 ++ExtraShuffleInsts;
7499 }
7500 }
7501 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7502 // Vectorize node, if:
7503 // 1. at least single operand is constant or splat.
7504 // 2. Operands have many loop invariants (the instructions are not loop
7505 // invariants).
7506 // 3. At least single unique operands is supposed to vectorized.
7507 return none_of(Operands,
7508 [&](ArrayRef<Value *> Op) {
7509 if (allConstant(Op) ||
7510 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7511 getSameOpcode(Op, *TLI)))
7512 return false;
7514 for (Value *V : Op) {
7515 if (isa<Constant, ExtractElementInst>(V) ||
7516 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7517 if (isa<UndefValue>(V))
7518 ++UndefCnt;
7519 continue;
7520 }
7521 auto Res = Uniques.try_emplace(V, 0);
7522 // Found first duplicate - need to add shuffle.
7523 if (!Res.second && Res.first->second == 1)
7524 ++ExtraShuffleInsts;
7525 ++Res.first->getSecond();
7526 if (auto *I = dyn_cast<Instruction>(V))
7527 UniqueOpcodes.insert(I->getOpcode());
7528 else if (Res.second)
7529 ++NonInstCnt;
7530 }
7531 return none_of(Uniques, [&](const auto &P) {
7532 return P.first->hasNUsesOrMore(P.second + 1) &&
7533 none_of(P.first->users(), [&](User *U) {
7534 return getTreeEntry(U) || Uniques.contains(U);
7535 });
7536 });
7537 }) ||
7538 // Do not vectorize node, if estimated number of vector instructions is
7539 // more than estimated number of buildvector instructions. Number of
7540 // vector operands is number of vector instructions + number of vector
7541 // instructions for operands (buildvectors). Number of buildvector
7542 // instructions is just number_of_operands * number_of_scalars.
7543 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7544 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7545 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7546}
7547
7548BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7549 const InstructionsState &S, ArrayRef<Value *> VL,
7550 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7551 SmallVectorImpl<Value *> &PointerOps) {
7552 assert(S.getMainOp() &&
7553 "Expected instructions with same/alternate opcodes only.");
7554
7555 unsigned ShuffleOrOp =
7556 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7557 Instruction *VL0 = S.getMainOp();
7558 switch (ShuffleOrOp) {
7559 case Instruction::PHI: {
7560 // Too many operands - gather, most probably won't be vectorized.
7561 if (VL0->getNumOperands() > MaxPHINumOperands)
7562 return TreeEntry::NeedToGather;
7563 // Check for terminator values (e.g. invoke).
7564 for (Value *V : VL) {
7565 auto *PHI = dyn_cast<PHINode>(V);
7566 if (!PHI)
7567 continue;
7568 for (Value *Incoming : PHI->incoming_values()) {
7569 Instruction *Term = dyn_cast<Instruction>(Incoming);
7570 if (Term && Term->isTerminator()) {
7572 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7573 return TreeEntry::NeedToGather;
7574 }
7575 }
7576 }
7577
7578 return TreeEntry::Vectorize;
7579 }
7580 case Instruction::ExtractValue:
7581 case Instruction::ExtractElement: {
7582 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7583 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7584 if (!has_single_bit(VL.size()))
7585 return TreeEntry::NeedToGather;
7586 if (Reuse || !CurrentOrder.empty())
7587 return TreeEntry::Vectorize;
7588 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7589 return TreeEntry::NeedToGather;
7590 }
7591 case Instruction::InsertElement: {
7592 // Check that we have a buildvector and not a shuffle of 2 or more
7593 // different vectors.
7594 ValueSet SourceVectors;
7595 for (Value *V : VL) {
7596 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7597 assert(getElementIndex(V) != std::nullopt &&
7598 "Non-constant or undef index?");
7599 }
7600
7601 if (count_if(VL, [&SourceVectors](Value *V) {
7602 return !SourceVectors.contains(V);
7603 }) >= 2) {
7604 // Found 2nd source vector - cancel.
7605 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7606 "different source vectors.\n");
7607 return TreeEntry::NeedToGather;
7608 }
7609
7610 if (any_of(VL, [&SourceVectors](Value *V) {
7611 // The last InsertElement can have multiple uses.
7612 return SourceVectors.contains(V) && !V->hasOneUse();
7613 })) {
7614 assert(SLPReVec && "Only supported by REVEC.");
7615 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7616 "multiple uses.\n");
7617 return TreeEntry::NeedToGather;
7618 }
7619
7620 return TreeEntry::Vectorize;
7621 }
7622 case Instruction::Load: {
7623 // Check that a vectorized load would load the same memory as a scalar
7624 // load. For example, we don't want to vectorize loads that are smaller
7625 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7626 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7627 // from such a struct, we read/write packed bits disagreeing with the
7628 // unvectorized version.
7629 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7631 return TreeEntry::Vectorize;
7633 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7634 // Delay slow vectorized nodes for better vectorization attempts.
7635 LoadEntriesToVectorize.insert(VectorizableTree.size());
7636 return TreeEntry::NeedToGather;
7637 }
7638 return TreeEntry::ScatterVectorize;
7640 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7641 // Delay slow vectorized nodes for better vectorization attempts.
7642 LoadEntriesToVectorize.insert(VectorizableTree.size());
7643 return TreeEntry::NeedToGather;
7644 }
7645 return TreeEntry::StridedVectorize;
7646 case LoadsState::Gather:
7647#ifndef NDEBUG
7648 Type *ScalarTy = VL0->getType();
7649 if (DL->getTypeSizeInBits(ScalarTy) !=
7650 DL->getTypeAllocSizeInBits(ScalarTy))
7651 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7652 else if (any_of(VL, [](Value *V) {
7653 auto *LI = dyn_cast<LoadInst>(V);
7654 return !LI || !LI->isSimple();
7655 }))
7656 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7657 else
7658 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7659#endif // NDEBUG
7661 return TreeEntry::NeedToGather;
7662 }
7663 llvm_unreachable("Unexpected state of loads");
7664 }
7665 case Instruction::ZExt:
7666 case Instruction::SExt:
7667 case Instruction::FPToUI:
7668 case Instruction::FPToSI:
7669 case Instruction::FPExt:
7670 case Instruction::PtrToInt:
7671 case Instruction::IntToPtr:
7672 case Instruction::SIToFP:
7673 case Instruction::UIToFP:
7674 case Instruction::Trunc:
7675 case Instruction::FPTrunc:
7676 case Instruction::BitCast: {
7677 Type *SrcTy = VL0->getOperand(0)->getType();
7678 for (Value *V : VL) {
7679 if (isa<PoisonValue>(V))
7680 continue;
7681 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7682 if (Ty != SrcTy || !isValidElementType(Ty)) {
7683 LLVM_DEBUG(
7684 dbgs() << "SLP: Gathering casts with different src types.\n");
7685 return TreeEntry::NeedToGather;
7686 }
7687 }
7688 return TreeEntry::Vectorize;
7689 }
7690 case Instruction::ICmp:
7691 case Instruction::FCmp: {
7692 // Check that all of the compares have the same predicate.
7693 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7695 Type *ComparedTy = VL0->getOperand(0)->getType();
7696 for (Value *V : VL) {
7697 if (isa<PoisonValue>(V))
7698 continue;
7699 auto *Cmp = cast<CmpInst>(V);
7700 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7701 Cmp->getOperand(0)->getType() != ComparedTy) {
7702 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7703 return TreeEntry::NeedToGather;
7704 }
7705 }
7706 return TreeEntry::Vectorize;
7707 }
7708 case Instruction::Select:
7709 case Instruction::FNeg:
7710 case Instruction::Add:
7711 case Instruction::FAdd:
7712 case Instruction::Sub:
7713 case Instruction::FSub:
7714 case Instruction::Mul:
7715 case Instruction::FMul:
7716 case Instruction::UDiv:
7717 case Instruction::SDiv:
7718 case Instruction::FDiv:
7719 case Instruction::URem:
7720 case Instruction::SRem:
7721 case Instruction::FRem:
7722 case Instruction::Shl:
7723 case Instruction::LShr:
7724 case Instruction::AShr:
7725 case Instruction::And:
7726 case Instruction::Or:
7727 case Instruction::Xor:
7728 case Instruction::Freeze:
7729 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7731 auto *I = dyn_cast<Instruction>(V);
7732 return I && I->isBinaryOp() && !I->isFast();
7733 }))
7734 return TreeEntry::NeedToGather;
7735 return TreeEntry::Vectorize;
7736 case Instruction::GetElementPtr: {
7737 // We don't combine GEPs with complicated (nested) indexing.
7738 for (Value *V : VL) {
7739 auto *I = dyn_cast<GetElementPtrInst>(V);
7740 if (!I)
7741 continue;
7742 if (I->getNumOperands() != 2) {
7743 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7744 return TreeEntry::NeedToGather;
7745 }
7746 }
7747
7748 // We can't combine several GEPs into one vector if they operate on
7749 // different types.
7750 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7751 for (Value *V : VL) {
7752 auto *GEP = dyn_cast<GEPOperator>(V);
7753 if (!GEP)
7754 continue;
7755 Type *CurTy = GEP->getSourceElementType();
7756 if (Ty0 != CurTy) {
7757 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7758 return TreeEntry::NeedToGather;
7759 }
7760 }
7761
7762 // We don't combine GEPs with non-constant indexes.
7763 Type *Ty1 = VL0->getOperand(1)->getType();
7764 for (Value *V : VL) {
7765 auto *I = dyn_cast<GetElementPtrInst>(V);
7766 if (!I)
7767 continue;
7768 auto *Op = I->getOperand(1);
7769 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7770 (Op->getType() != Ty1 &&
7771 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7772 Op->getType()->getScalarSizeInBits() >
7773 DL->getIndexSizeInBits(
7774 V->getType()->getPointerAddressSpace())))) {
7775 LLVM_DEBUG(
7776 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7777 return TreeEntry::NeedToGather;
7778 }
7779 }
7780
7781 return TreeEntry::Vectorize;
7782 }
7783 case Instruction::Store: {
7784 // Check if the stores are consecutive or if we need to swizzle them.
7785 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7786 // Avoid types that are padded when being allocated as scalars, while
7787 // being packed together in a vector (such as i1).
7788 if (DL->getTypeSizeInBits(ScalarTy) !=
7789 DL->getTypeAllocSizeInBits(ScalarTy)) {
7790 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7791 return TreeEntry::NeedToGather;
7792 }
7793 // Make sure all stores in the bundle are simple - we can't vectorize
7794 // atomic or volatile stores.
7795 for (Value *V : VL) {
7796 auto *SI = cast<StoreInst>(V);
7797 if (!SI->isSimple()) {
7798 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7799 return TreeEntry::NeedToGather;
7800 }
7801 PointerOps.push_back(SI->getPointerOperand());
7802 }
7803
7804 // Check the order of pointer operands.
7805 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7806 Value *Ptr0;
7807 Value *PtrN;
7808 if (CurrentOrder.empty()) {
7809 Ptr0 = PointerOps.front();
7810 PtrN = PointerOps.back();
7811 } else {
7812 Ptr0 = PointerOps[CurrentOrder.front()];
7813 PtrN = PointerOps[CurrentOrder.back()];
7814 }
7815 std::optional<int> Dist =
7816 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7817 // Check that the sorted pointer operands are consecutive.
7818 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7819 return TreeEntry::Vectorize;
7820 }
7821
7822 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7823 return TreeEntry::NeedToGather;
7824 }
7825 case Instruction::Call: {
7826 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7828 auto *I = dyn_cast<Instruction>(V);
7829 return I && !I->isFast();
7830 }))
7831 return TreeEntry::NeedToGather;
7832 // Check if the calls are all to the same vectorizable intrinsic or
7833 // library function.
7834 CallInst *CI = cast<CallInst>(VL0);
7836
7837 VFShape Shape = VFShape::get(
7838 CI->getFunctionType(),
7839 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7840 false /*HasGlobalPred*/);
7841 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7842
7843 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7844 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7845 return TreeEntry::NeedToGather;
7846 }
7847 Function *F = CI->getCalledFunction();
7848 unsigned NumArgs = CI->arg_size();
7849 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7850 for (unsigned J = 0; J != NumArgs; ++J)
7852 ScalarArgs[J] = CI->getArgOperand(J);
7853 for (Value *V : VL) {
7854 CallInst *CI2 = dyn_cast<CallInst>(V);
7855 if (!CI2 || CI2->getCalledFunction() != F ||
7856 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7857 (VecFunc &&
7858 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7860 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7861 << "\n");
7862 return TreeEntry::NeedToGather;
7863 }
7864 // Some intrinsics have scalar arguments and should be same in order for
7865 // them to be vectorized.
7866 for (unsigned J = 0; J != NumArgs; ++J) {
7868 Value *A1J = CI2->getArgOperand(J);
7869 if (ScalarArgs[J] != A1J) {
7871 << "SLP: mismatched arguments in call:" << *CI
7872 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7873 return TreeEntry::NeedToGather;
7874 }
7875 }
7876 }
7877 // Verify that the bundle operands are identical between the two calls.
7878 if (CI->hasOperandBundles() &&
7879 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7880 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7881 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7882 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7883 << "!=" << *V << '\n');
7884 return TreeEntry::NeedToGather;
7885 }
7886 }
7887
7888 return TreeEntry::Vectorize;
7889 }
7890 case Instruction::ShuffleVector: {
7891 if (!S.isAltShuffle()) {
7892 // REVEC can support non alternate shuffle.
7894 return TreeEntry::Vectorize;
7895 // If this is not an alternate sequence of opcode like add-sub
7896 // then do not vectorize this instruction.
7897 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7898 return TreeEntry::NeedToGather;
7899 }
7900 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7901 LLVM_DEBUG(
7902 dbgs()
7903 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7904 "the whole alt sequence is not profitable.\n");
7905 return TreeEntry::NeedToGather;
7906 }
7907
7908 return TreeEntry::Vectorize;
7909 }
7910 default:
7911 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7912 return TreeEntry::NeedToGather;
7913 }
7914}
7915
7916namespace {
7917/// Allows to correctly handle operands of the phi nodes based on the \p Main
7918/// PHINode order of incoming basic blocks/values.
7919class PHIHandler {
7920 DominatorTree &DT;
7921 PHINode *Main = nullptr;
7924
7925public:
7926 PHIHandler() = delete;
7927 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7928 : DT(DT), Main(Main), Phis(Phis),
7929 Operands(Main->getNumIncomingValues(),
7930 SmallVector<Value *>(Phis.size(), nullptr)) {}
7931 void buildOperands() {
7932 constexpr unsigned FastLimit = 4;
7933 if (Main->getNumIncomingValues() <= FastLimit) {
7934 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7935 BasicBlock *InBB = Main->getIncomingBlock(I);
7936 if (!DT.isReachableFromEntry(InBB)) {
7937 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7938 continue;
7939 }
7940 // Prepare the operand vector.
7941 for (auto [Idx, V] : enumerate(Phis)) {
7942 auto *P = dyn_cast<PHINode>(V);
7943 if (!P) {
7944 assert(isa<PoisonValue>(V) &&
7945 "Expected isa instruction or poison value.");
7946 Operands[I][Idx] = V;
7947 continue;
7948 }
7949 if (P->getIncomingBlock(I) == InBB)
7950 Operands[I][Idx] = P->getIncomingValue(I);
7951 else
7952 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
7953 }
7954 }
7955 return;
7956 }
7958 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7959 BasicBlock *InBB = Main->getIncomingBlock(I);
7960 if (!DT.isReachableFromEntry(InBB)) {
7961 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7962 continue;
7963 }
7964 Blocks.try_emplace(InBB).first->second.push_back(I);
7965 }
7966 for (auto [Idx, V] : enumerate(Phis)) {
7967 if (isa<PoisonValue>(V)) {
7968 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
7969 Operands[I][Idx] = V;
7970 continue;
7971 }
7972 auto *P = cast<PHINode>(V);
7973 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
7974 BasicBlock *InBB = P->getIncomingBlock(I);
7975 if (InBB == Main->getIncomingBlock(I)) {
7976 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
7977 continue;
7978 Operands[I][Idx] = P->getIncomingValue(I);
7979 continue;
7980 }
7981 auto It = Blocks.find(InBB);
7982 if (It == Blocks.end())
7983 continue;
7984 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
7985 }
7986 }
7987 for (const auto &P : Blocks) {
7988 if (P.getSecond().size() <= 1)
7989 continue;
7990 unsigned BasicI = P.getSecond().front();
7991 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
7993 [&](const auto &Data) {
7994 return !Data.value() ||
7995 Data.value() == Operands[BasicI][Data.index()];
7996 }) &&
7997 "Expected empty operands list.");
7998 Operands[I] = Operands[BasicI];
7999 }
8000 }
8001 }
8002 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8003};
8004} // namespace
8005
8006void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8007 const EdgeInfo &UserTreeIdx,
8008 unsigned InterleaveFactor) {
8009 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8010
8011 SmallVector<int> ReuseShuffleIndices;
8012 SmallVector<Value *> UniqueValues;
8013 SmallVector<Value *> NonUniqueValueVL;
8014 auto TryToFindDuplicates = [&](const InstructionsState &S,
8015 bool DoNotFail = false) {
8016 // Check that every instruction appears once in this bundle.
8017 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8018 for (Value *V : VL) {
8019 if (isConstant(V)) {
8020 ReuseShuffleIndices.emplace_back(
8021 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8022 UniqueValues.emplace_back(V);
8023 continue;
8024 }
8025 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8026 ReuseShuffleIndices.emplace_back(Res.first->second);
8027 if (Res.second)
8028 UniqueValues.emplace_back(V);
8029 }
8030 size_t NumUniqueScalarValues = UniqueValues.size();
8031 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8032 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8033 if (NumUniqueScalarValues == VL.size() &&
8034 (VectorizeNonPowerOf2 || IsFullVectors)) {
8035 ReuseShuffleIndices.clear();
8036 } else {
8037 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
8038 if ((UserTreeIdx.UserTE &&
8039 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8040 !has_single_bit(VL.size())) {
8041 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8042 "for nodes with padding.\n");
8043 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8044 return false;
8045 }
8046 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8047 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8048 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8049 return isa<UndefValue>(V) || !isConstant(V);
8050 }))) {
8051 if (DoNotFail && UniquePositions.size() > 1 &&
8052 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8053 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8054 // Find the number of elements, which forms full vectors.
8055 unsigned PWSz = getFullVectorNumberOfElements(
8056 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8057 if (PWSz == VL.size()) {
8058 ReuseShuffleIndices.clear();
8059 } else {
8060 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8061 NonUniqueValueVL.append(
8062 PWSz - UniqueValues.size(),
8063 PoisonValue::get(UniqueValues.front()->getType()));
8064 VL = NonUniqueValueVL;
8065 }
8066 return true;
8067 }
8068 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8069 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8070 return false;
8071 }
8072 VL = UniqueValues;
8073 }
8074 return true;
8075 };
8076
8077 InstructionsState S = getSameOpcode(VL, *TLI);
8078
8079 // Don't go into catchswitch blocks, which can happen with PHIs.
8080 // Such blocks can only have PHIs and the catchswitch. There is no
8081 // place to insert a shuffle if we need to, so just avoid that issue.
8082 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8083 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8084 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8085 return;
8086 }
8087
8088 // Check if this is a duplicate of another entry.
8089 if (S) {
8090 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8091 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8092 << ".\n");
8093 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8094 auto It = MultiNodeScalars.find(S.getMainOp());
8095 if (It != MultiNodeScalars.end()) {
8096 auto *TEIt = find_if(It->getSecond(),
8097 [&](TreeEntry *ME) { return ME->isSame(VL); });
8098 if (TEIt != It->getSecond().end())
8099 E = *TEIt;
8100 else
8101 E = nullptr;
8102 } else {
8103 E = nullptr;
8104 }
8105 }
8106 if (!E) {
8107 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8108 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8109 if (TryToFindDuplicates(S))
8110 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8111 ReuseShuffleIndices);
8112 return;
8113 }
8115 Nodes.insert(getTreeEntry(S.getMainOp()));
8116 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8117 Nodes.insert(E);
8118 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8119 if (any_of(Nodes, [&](const TreeEntry *E) {
8120 if (all_of(E->Scalars,
8121 [&](Value *V) { return Values.contains(V); }))
8122 return true;
8123 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8124 E->Scalars.end());
8125 return (
8126 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8127 })) {
8128 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8129 if (TryToFindDuplicates(S))
8130 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8131 ReuseShuffleIndices);
8132 return;
8133 }
8134 } else {
8135 // Record the reuse of the tree node. FIXME, currently this is only
8136 // used to properly draw the graph rather than for the actual
8137 // vectorization.
8138 E->UserTreeIndices.push_back(UserTreeIdx);
8139 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8140 << ".\n");
8141 return;
8142 }
8143 }
8144 }
8145
8146 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8147 // a load), in which case peek through to include it in the tree, without
8148 // ballooning over-budget.
8149 if (Depth >= RecursionMaxDepth &&
8150 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8151 (match(S.getMainOp(), m_Load(m_Value())) ||
8152 all_of(VL, [&S](const Value *I) {
8153 return match(I,
8155 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8156 })))) {
8157 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8158 if (TryToFindDuplicates(S))
8159 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8160 ReuseShuffleIndices);
8161 return;
8162 }
8163
8164 // Don't handle scalable vectors
8165 if (S && S.getOpcode() == Instruction::ExtractElement &&
8166 isa<ScalableVectorType>(
8167 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8168 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8169 if (TryToFindDuplicates(S))
8170 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8171 ReuseShuffleIndices);
8172 return;
8173 }
8174
8175 // Don't handle vectors.
8176 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8177 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8178 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8179 return;
8180 }
8181
8182 // If all of the operands are identical or constant we have a simple solution.
8183 // If we deal with insert/extract instructions, they all must have constant
8184 // indices, otherwise we should gather them, not try to vectorize.
8185 // If alternate op node with 2 elements with gathered operands - do not
8186 // vectorize.
8187 auto &&NotProfitableForVectorization = [&S, this,
8189 if (!S || !S.isAltShuffle() || VL.size() > 2)
8190 return false;
8191 if (VectorizableTree.size() < MinTreeSize)
8192 return false;
8193 if (Depth >= RecursionMaxDepth - 1)
8194 return true;
8195 // Check if all operands are extracts, part of vector node or can build a
8196 // regular vectorize node.
8197 SmallVector<unsigned, 8> InstsCount;
8198 for (Value *V : VL) {
8199 auto *I = cast<Instruction>(V);
8200 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8201 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8202 }));
8203 }
8204 bool IsCommutative =
8205 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8206 if ((IsCommutative &&
8207 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8208 (!IsCommutative &&
8209 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8210 return true;
8211 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8213 auto *I1 = cast<Instruction>(VL.front());
8214 auto *I2 = cast<Instruction>(VL.back());
8215 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8216 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8217 I2->getOperand(Op));
8218 if (static_cast<unsigned>(count_if(
8219 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8221 })) >= S.getMainOp()->getNumOperands() / 2)
8222 return false;
8223 if (S.getMainOp()->getNumOperands() > 2)
8224 return true;
8225 if (IsCommutative) {
8226 // Check permuted operands.
8227 Candidates.clear();
8228 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8229 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8230 I2->getOperand((Op + 1) % E));
8231 if (any_of(
8232 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8234 }))
8235 return false;
8236 }
8237 return true;
8238 };
8239 SmallVector<unsigned> SortedIndices;
8240 BasicBlock *BB = nullptr;
8241 bool IsScatterVectorizeUserTE =
8242 UserTreeIdx.UserTE &&
8243 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8244 bool AreAllSameBlock = S && allSameBlock(VL);
8245 bool AreScatterAllGEPSameBlock =
8246 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8247 VL.size() > 2 &&
8248 all_of(VL,
8249 [&BB](Value *V) {
8250 auto *I = dyn_cast<GetElementPtrInst>(V);
8251 if (!I)
8252 return doesNotNeedToBeScheduled(V);
8253 if (!BB)
8254 BB = I->getParent();
8255 return BB == I->getParent() && I->getNumOperands() == 2;
8256 }) &&
8257 BB &&
8258 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8259 SortedIndices));
8260 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8261 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8262 (S &&
8263 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8264 S.getMainOp()) &&
8266 NotProfitableForVectorization(VL)) {
8267 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8268 if (TryToFindDuplicates(S))
8269 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8270 ReuseShuffleIndices);
8271 return;
8272 }
8273
8274 // Don't vectorize ephemeral values.
8275 if (S && !EphValues.empty()) {
8276 for (Value *V : VL) {
8277 if (EphValues.count(V)) {
8278 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8279 << ") is ephemeral.\n");
8280 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8281 return;
8282 }
8283 }
8284 }
8285
8286 // We now know that this is a vector of instructions of the same type from
8287 // the same block.
8288
8289 // Check that none of the instructions in the bundle are already in the tree.
8290 for (Value *V : VL) {
8291 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8293 continue;
8294 if (getTreeEntry(V)) {
8295 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8296 << ") is already in tree.\n");
8297 if (TryToFindDuplicates(S))
8298 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8299 ReuseShuffleIndices);
8300 return;
8301 }
8302 }
8303
8304 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8305 if (UserIgnoreList && !UserIgnoreList->empty()) {
8306 for (Value *V : VL) {
8307 if (UserIgnoreList->contains(V)) {
8308 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8309 if (TryToFindDuplicates(S))
8310 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8311 ReuseShuffleIndices);
8312 return;
8313 }
8314 }
8315 }
8316
8317 // Special processing for sorted pointers for ScatterVectorize node with
8318 // constant indeces only.
8319 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8320 assert(VL.front()->getType()->isPointerTy() &&
8321 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8322 "Expected pointers only.");
8323 // Reset S to make it GetElementPtr kind of node.
8324 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8325 assert(It != VL.end() && "Expected at least one GEP.");
8326 S = getSameOpcode(*It, *TLI);
8327 }
8328
8329 // Check that all of the users of the scalars that we want to vectorize are
8330 // schedulable.
8331 Instruction *VL0 = S.getMainOp();
8332 BB = VL0->getParent();
8333
8334 if (S &&
8335 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8336 !DT->isReachableFromEntry(BB))) {
8337 // Don't go into unreachable blocks. They may contain instructions with
8338 // dependency cycles which confuse the final scheduling.
8339 // Do not vectorize EH and non-returning blocks, not profitable in most
8340 // cases.
8341 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8342 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8343 return;
8344 }
8345
8346 // Check that every instruction appears once in this bundle.
8347 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8348 return;
8349
8350 // Perform specific checks for each particular instruction kind.
8351 OrdersType CurrentOrder;
8352 SmallVector<Value *> PointerOps;
8353 TreeEntry::EntryState State = getScalarsVectorizationState(
8354 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8355 if (State == TreeEntry::NeedToGather) {
8356 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8357 ReuseShuffleIndices);
8358 return;
8359 }
8360
8361 auto &BSRef = BlocksSchedules[BB];
8362 if (!BSRef)
8363 BSRef = std::make_unique<BlockScheduling>(BB);
8364
8365 BlockScheduling &BS = *BSRef;
8366
8367 std::optional<ScheduleData *> Bundle =
8368 BS.tryScheduleBundle(UniqueValues, this, S);
8369#ifdef EXPENSIVE_CHECKS
8370 // Make sure we didn't break any internal invariants
8371 BS.verify();
8372#endif
8373 if (!Bundle) {
8374 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8375 assert((!BS.getScheduleData(VL0) ||
8376 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8377 "tryScheduleBundle should cancelScheduling on failure");
8378 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8380 NonScheduledFirst.insert(VL.front());
8381 if (S.getOpcode() == Instruction::Load &&
8382 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8384 return;
8385 }
8386 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8387
8388 unsigned ShuffleOrOp =
8389 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8390 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8391 // Postpone PHI nodes creation
8392 SmallVector<unsigned> PHIOps;
8393 for (unsigned I : seq<unsigned>(Operands.size())) {
8395 if (Op.empty())
8396 continue;
8397 InstructionsState S = getSameOpcode(Op, *TLI);
8398 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8399 buildTree_rec(Op, Depth + 1, {TE, I});
8400 else
8401 PHIOps.push_back(I);
8402 }
8403 for (unsigned I : PHIOps)
8404 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8405 };
8406 switch (ShuffleOrOp) {
8407 case Instruction::PHI: {
8408 auto *PH = cast<PHINode>(VL0);
8409
8410 TreeEntry *TE =
8411 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8412 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8413 TE->dump());
8414
8415 // Keeps the reordered operands to avoid code duplication.
8416 PHIHandler Handler(*DT, PH, VL);
8417 Handler.buildOperands();
8418 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8419 TE->setOperand(I, Handler.getOperands(I));
8420 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8421 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8422 Operands[I] = Handler.getOperands(I);
8423 CreateOperandNodes(TE, Operands);
8424 return;
8425 }
8426 case Instruction::ExtractValue:
8427 case Instruction::ExtractElement: {
8428 if (CurrentOrder.empty()) {
8429 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8430 } else {
8431 LLVM_DEBUG({
8432 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8433 "with order";
8434 for (unsigned Idx : CurrentOrder)
8435 dbgs() << " " << Idx;
8436 dbgs() << "\n";
8437 });
8438 fixupOrderingIndices(CurrentOrder);
8439 }
8440 // Insert new order with initial value 0, if it does not exist,
8441 // otherwise return the iterator to the existing one.
8442 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8443 ReuseShuffleIndices, CurrentOrder);
8444 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8445 "(ExtractValueInst/ExtractElementInst).\n";
8446 TE->dump());
8447 // This is a special case, as it does not gather, but at the same time
8448 // we are not extending buildTree_rec() towards the operands.
8449 TE->setOperand(*this);
8450 return;
8451 }
8452 case Instruction::InsertElement: {
8453 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8454
8455 auto OrdCompare = [](const std::pair<int, int> &P1,
8456 const std::pair<int, int> &P2) {
8457 return P1.first > P2.first;
8458 };
8460 decltype(OrdCompare)>
8461 Indices(OrdCompare);
8462 for (int I = 0, E = VL.size(); I < E; ++I) {
8463 unsigned Idx = *getElementIndex(VL[I]);
8464 Indices.emplace(Idx, I);
8465 }
8466 OrdersType CurrentOrder(VL.size(), VL.size());
8467 bool IsIdentity = true;
8468 for (int I = 0, E = VL.size(); I < E; ++I) {
8469 CurrentOrder[Indices.top().second] = I;
8470 IsIdentity &= Indices.top().second == I;
8471 Indices.pop();
8472 }
8473 if (IsIdentity)
8474 CurrentOrder.clear();
8475 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8476 {}, CurrentOrder);
8477 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8478 TE->dump());
8479
8480 TE->setOperand(*this);
8481 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8482 return;
8483 }
8484 case Instruction::Load: {
8485 // Check that a vectorized load would load the same memory as a scalar
8486 // load. For example, we don't want to vectorize loads that are smaller
8487 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8488 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8489 // from such a struct, we read/write packed bits disagreeing with the
8490 // unvectorized version.
8491 TreeEntry *TE = nullptr;
8492 fixupOrderingIndices(CurrentOrder);
8493 switch (State) {
8494 case TreeEntry::Vectorize:
8495 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8496 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8497 if (CurrentOrder.empty())
8498 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8499 TE->dump());
8500 else
8502 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8503 TE->dump());
8504 break;
8505 case TreeEntry::StridedVectorize:
8506 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8507 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8508 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8509 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8510 TE->dump());
8511 break;
8512 case TreeEntry::ScatterVectorize:
8513 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8514 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8515 UserTreeIdx, ReuseShuffleIndices);
8516 LLVM_DEBUG(
8517 dbgs()
8518 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8519 TE->dump());
8520 break;
8521 case TreeEntry::CombinedVectorize:
8522 case TreeEntry::NeedToGather:
8523 llvm_unreachable("Unexpected loads state.");
8524 }
8525 TE->setOperand(*this);
8526 if (State == TreeEntry::ScatterVectorize)
8527 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8528 return;
8529 }
8530 case Instruction::ZExt:
8531 case Instruction::SExt:
8532 case Instruction::FPToUI:
8533 case Instruction::FPToSI:
8534 case Instruction::FPExt:
8535 case Instruction::PtrToInt:
8536 case Instruction::IntToPtr:
8537 case Instruction::SIToFP:
8538 case Instruction::UIToFP:
8539 case Instruction::Trunc:
8540 case Instruction::FPTrunc:
8541 case Instruction::BitCast: {
8542 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8543 std::make_pair(std::numeric_limits<unsigned>::min(),
8544 std::numeric_limits<unsigned>::max()));
8545 if (ShuffleOrOp == Instruction::ZExt ||
8546 ShuffleOrOp == Instruction::SExt) {
8547 CastMaxMinBWSizes = std::make_pair(
8548 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8549 PrevMaxBW),
8550 std::min<unsigned>(
8551 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8552 PrevMinBW));
8553 } else if (ShuffleOrOp == Instruction::Trunc) {
8554 CastMaxMinBWSizes = std::make_pair(
8555 std::max<unsigned>(
8556 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8557 PrevMaxBW),
8558 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8559 PrevMinBW));
8560 }
8561 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8562 ReuseShuffleIndices);
8563 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8564 TE->dump());
8565
8566 TE->setOperand(*this);
8567 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8568 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8569 if (ShuffleOrOp == Instruction::Trunc) {
8570 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8571 } else if (ShuffleOrOp == Instruction::SIToFP ||
8572 ShuffleOrOp == Instruction::UIToFP) {
8573 unsigned NumSignBits =
8574 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8575 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8576 APInt Mask = DB->getDemandedBits(OpI);
8577 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8578 }
8579 if (NumSignBits * 2 >=
8580 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8581 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8582 }
8583 return;
8584 }
8585 case Instruction::ICmp:
8586 case Instruction::FCmp: {
8587 // Check that all of the compares have the same predicate.
8588 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8589 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8590 ReuseShuffleIndices);
8591 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8592 TE->dump());
8593
8595 VLOperands Ops(VL, VL0, *this);
8596 if (cast<CmpInst>(VL0)->isCommutative()) {
8597 // Commutative predicate - collect + sort operands of the instructions
8598 // so that each side is more likely to have the same opcode.
8600 "Commutative Predicate mismatch");
8601 Ops.reorder();
8602 Left = Ops.getVL(0);
8603 Right = Ops.getVL(1);
8604 } else {
8605 // Collect operands - commute if it uses the swapped predicate.
8606 for (Value *V : VL) {
8607 if (isa<PoisonValue>(V)) {
8608 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8609 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8610 continue;
8611 }
8612 auto *Cmp = cast<CmpInst>(V);
8613 Value *LHS = Cmp->getOperand(0);
8614 Value *RHS = Cmp->getOperand(1);
8615 if (Cmp->getPredicate() != P0)
8616 std::swap(LHS, RHS);
8617 Left.push_back(LHS);
8618 Right.push_back(RHS);
8619 }
8620 }
8621 TE->setOperand(0, Left);
8622 TE->setOperand(1, Right);
8623 buildTree_rec(Left, Depth + 1, {TE, 0});
8624 buildTree_rec(Right, Depth + 1, {TE, 1});
8625 if (ShuffleOrOp == Instruction::ICmp) {
8626 unsigned NumSignBits0 =
8627 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8628 if (NumSignBits0 * 2 >=
8629 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8630 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8631 unsigned NumSignBits1 =
8632 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8633 if (NumSignBits1 * 2 >=
8634 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8635 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8636 }
8637 return;
8638 }
8639 case Instruction::Select:
8640 case Instruction::FNeg:
8641 case Instruction::Add:
8642 case Instruction::FAdd:
8643 case Instruction::Sub:
8644 case Instruction::FSub:
8645 case Instruction::Mul:
8646 case Instruction::FMul:
8647 case Instruction::UDiv:
8648 case Instruction::SDiv:
8649 case Instruction::FDiv:
8650 case Instruction::URem:
8651 case Instruction::SRem:
8652 case Instruction::FRem:
8653 case Instruction::Shl:
8654 case Instruction::LShr:
8655 case Instruction::AShr:
8656 case Instruction::And:
8657 case Instruction::Or:
8658 case Instruction::Xor:
8659 case Instruction::Freeze: {
8660 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8661 ReuseShuffleIndices);
8662 LLVM_DEBUG(
8663 dbgs() << "SLP: added a new TreeEntry "
8664 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8665 TE->dump());
8666
8667 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8668 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8669 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8670 return;
8671 }
8672 case Instruction::GetElementPtr: {
8673 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8674 ReuseShuffleIndices);
8675 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8676 TE->dump());
8678 // Prepare the operand vector for pointer operands.
8679 for (Value *V : VL) {
8680 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8681 if (!GEP) {
8682 Operands.front().push_back(V);
8683 continue;
8684 }
8685 Operands.front().push_back(GEP->getPointerOperand());
8686 }
8687 TE->setOperand(0, Operands.front());
8688 // Need to cast all indices to the same type before vectorization to
8689 // avoid crash.
8690 // Required to be able to find correct matches between different gather
8691 // nodes and reuse the vectorized values rather than trying to gather them
8692 // again.
8693 int IndexIdx = 1;
8694 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8695 Type *Ty = all_of(VL,
8696 [VL0Ty, IndexIdx](Value *V) {
8697 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8698 if (!GEP)
8699 return true;
8700 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8701 })
8702 ? VL0Ty
8703 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8704 ->getPointerOperandType()
8705 ->getScalarType());
8706 // Prepare the operand vector.
8707 for (Value *V : VL) {
8708 auto *I = dyn_cast<GetElementPtrInst>(V);
8709 if (!I) {
8710 Operands.back().push_back(
8711 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8712 continue;
8713 }
8714 auto *Op = I->getOperand(IndexIdx);
8715 auto *CI = dyn_cast<ConstantInt>(Op);
8716 if (!CI)
8717 Operands.back().push_back(Op);
8718 else
8719 Operands.back().push_back(ConstantFoldIntegerCast(
8720 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8721 }
8722 TE->setOperand(IndexIdx, Operands.back());
8723
8724 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8725 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8726 return;
8727 }
8728 case Instruction::Store: {
8729 bool Consecutive = CurrentOrder.empty();
8730 if (!Consecutive)
8731 fixupOrderingIndices(CurrentOrder);
8732 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8733 ReuseShuffleIndices, CurrentOrder);
8734 if (Consecutive)
8735 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8736 TE->dump());
8737 else
8738 LLVM_DEBUG(
8739 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8740 TE->dump());
8741 TE->setOperand(*this);
8742 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8743 return;
8744 }
8745 case Instruction::Call: {
8746 // Check if the calls are all to the same vectorizable intrinsic or
8747 // library function.
8748 CallInst *CI = cast<CallInst>(VL0);
8750
8751 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8752 ReuseShuffleIndices);
8753 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8754 TE->dump());
8755 TE->setOperand(*this, isCommutative(VL0));
8756 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8757 // For scalar operands no need to create an entry since no need to
8758 // vectorize it.
8760 continue;
8761 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8762 }
8763 return;
8764 }
8765 case Instruction::ShuffleVector: {
8766 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8767 ReuseShuffleIndices);
8768 if (S.isAltShuffle()) {
8769 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8770 TE->dump());
8771 } else {
8772 assert(SLPReVec && "Only supported by REVEC.");
8773 LLVM_DEBUG(
8774 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8775 TE->dump());
8776 }
8777
8778 // Reorder operands if reordering would enable vectorization.
8779 auto *CI = dyn_cast<CmpInst>(VL0);
8780 if (CI && any_of(VL, [](Value *V) {
8781 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8782 })) {
8783 auto *MainCI = cast<CmpInst>(S.getMainOp());
8784 auto *AltCI = cast<CmpInst>(S.getAltOp());
8785 CmpInst::Predicate MainP = MainCI->getPredicate();
8786 CmpInst::Predicate AltP = AltCI->getPredicate();
8787 assert(MainP != AltP &&
8788 "Expected different main/alternate predicates.");
8790 // Collect operands - commute if it uses the swapped predicate or
8791 // alternate operation.
8792 for (Value *V : VL) {
8793 if (isa<PoisonValue>(V)) {
8794 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8795 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8796 continue;
8797 }
8798 auto *Cmp = cast<CmpInst>(V);
8799 Value *LHS = Cmp->getOperand(0);
8800 Value *RHS = Cmp->getOperand(1);
8801
8802 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8803 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8804 std::swap(LHS, RHS);
8805 } else {
8806 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8807 std::swap(LHS, RHS);
8808 }
8809 Left.push_back(LHS);
8810 Right.push_back(RHS);
8811 }
8812 TE->setOperand(0, Left);
8813 TE->setOperand(1, Right);
8814 buildTree_rec(Left, Depth + 1, {TE, 0});
8815 buildTree_rec(Right, Depth + 1, {TE, 1});
8816 return;
8817 }
8818
8819 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8820 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8821 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8822 return;
8823 }
8824 default:
8825 break;
8826 }
8827 llvm_unreachable("Unexpected vectorization of the instructions.");
8828}
8829
8831 unsigned N = 1;
8832 Type *EltTy = T;
8833
8834 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8835 if (EltTy->isEmptyTy())
8836 return 0;
8837 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8838 // Check that struct is homogeneous.
8839 for (const auto *Ty : ST->elements())
8840 if (Ty != *ST->element_begin())
8841 return 0;
8842 N *= ST->getNumElements();
8843 EltTy = *ST->element_begin();
8844 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8845 N *= AT->getNumElements();
8846 EltTy = AT->getElementType();
8847 } else {
8848 auto *VT = cast<FixedVectorType>(EltTy);
8849 N *= VT->getNumElements();
8850 EltTy = VT->getElementType();
8851 }
8852 }
8853
8854 if (!isValidElementType(EltTy))
8855 return 0;
8856 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8857 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8858 VTSize != DL->getTypeStoreSizeInBits(T))
8859 return 0;
8860 return N;
8861}
8862
8863bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
8864 SmallVectorImpl<unsigned> &CurrentOrder,
8865 bool ResizeAllowed) const {
8866 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8867 assert(It != VL.end() && "Expected at least one extract instruction.");
8868 auto *E0 = cast<Instruction>(*It);
8869 assert(
8870 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8871 "Invalid opcode");
8872 // Check if all of the extracts come from the same vector and from the
8873 // correct offset.
8874 Value *Vec = E0->getOperand(0);
8875
8876 CurrentOrder.clear();
8877
8878 // We have to extract from a vector/aggregate with the same number of elements.
8879 unsigned NElts;
8880 if (E0->getOpcode() == Instruction::ExtractValue) {
8881 NElts = canMapToVector(Vec->getType());
8882 if (!NElts)
8883 return false;
8884 // Check if load can be rewritten as load of vector.
8885 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8886 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8887 return false;
8888 } else {
8889 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8890 }
8891
8892 unsigned E = VL.size();
8893 if (!ResizeAllowed && NElts != E)
8894 return false;
8895 SmallVector<int> Indices(E, PoisonMaskElem);
8896 unsigned MinIdx = NElts, MaxIdx = 0;
8897 for (auto [I, V] : enumerate(VL)) {
8898 auto *Inst = dyn_cast<Instruction>(V);
8899 if (!Inst)
8900 continue;
8901 if (Inst->getOperand(0) != Vec)
8902 return false;
8903 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8904 if (isa<UndefValue>(EE->getIndexOperand()))
8905 continue;
8906 std::optional<unsigned> Idx = getExtractIndex(Inst);
8907 if (!Idx)
8908 return false;
8909 const unsigned ExtIdx = *Idx;
8910 if (ExtIdx >= NElts)
8911 continue;
8912 Indices[I] = ExtIdx;
8913 if (MinIdx > ExtIdx)
8914 MinIdx = ExtIdx;
8915 if (MaxIdx < ExtIdx)
8916 MaxIdx = ExtIdx;
8917 }
8918 if (MaxIdx - MinIdx + 1 > E)
8919 return false;
8920 if (MaxIdx + 1 <= E)
8921 MinIdx = 0;
8922
8923 // Check that all of the indices extract from the correct offset.
8924 bool ShouldKeepOrder = true;
8925 // Assign to all items the initial value E + 1 so we can check if the extract
8926 // instruction index was used already.
8927 // Also, later we can check that all the indices are used and we have a
8928 // consecutive access in the extract instructions, by checking that no
8929 // element of CurrentOrder still has value E + 1.
8930 CurrentOrder.assign(E, E);
8931 for (unsigned I = 0; I < E; ++I) {
8932 if (Indices[I] == PoisonMaskElem)
8933 continue;
8934 const unsigned ExtIdx = Indices[I] - MinIdx;
8935 if (CurrentOrder[ExtIdx] != E) {
8936 CurrentOrder.clear();
8937 return false;
8938 }
8939 ShouldKeepOrder &= ExtIdx == I;
8940 CurrentOrder[ExtIdx] = I;
8941 }
8942 if (ShouldKeepOrder)
8943 CurrentOrder.clear();
8944
8945 return ShouldKeepOrder;
8946}
8947
8948bool BoUpSLP::areAllUsersVectorized(
8949 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
8950 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
8951 all_of(I->users(), [this](User *U) {
8952 return ScalarToTreeEntry.contains(U) ||
8953 isVectorLikeInstWithConstOps(U) ||
8954 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8955 });
8956}
8957
8958static std::pair<InstructionCost, InstructionCost>
8961 ArrayRef<Type *> ArgTys) {
8963
8964 // Calculate the cost of the scalar and vector calls.
8965 FastMathFlags FMF;
8966 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
8967 FMF = FPCI->getFastMathFlags();
8969 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
8970 dyn_cast<IntrinsicInst>(CI));
8971 auto IntrinsicCost =
8973
8974 auto Shape = VFShape::get(CI->getFunctionType(),
8976 false /*HasGlobalPred*/);
8977 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
8978 auto LibCost = IntrinsicCost;
8979 if (!CI->isNoBuiltin() && VecFunc) {
8980 // Calculate the cost of the vector library call.
8981 // If the corresponding vector call is cheaper, return its cost.
8982 LibCost =
8983 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
8984 }
8985 return {IntrinsicCost, LibCost};
8986}
8987
8988void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8989 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
8990 SmallVectorImpl<Value *> *OpScalars,
8991 SmallVectorImpl<Value *> *AltScalars) const {
8992 unsigned Sz = Scalars.size();
8993 Mask.assign(Sz, PoisonMaskElem);
8994 SmallVector<int> OrderMask;
8995 if (!ReorderIndices.empty())
8996 inversePermutation(ReorderIndices, OrderMask);
8997 for (unsigned I = 0; I < Sz; ++I) {
8998 unsigned Idx = I;
8999 if (!ReorderIndices.empty())
9000 Idx = OrderMask[I];
9001 if (isa<PoisonValue>(Scalars[Idx]))
9002 continue;
9003 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9004 if (IsAltOp(OpInst)) {
9005 Mask[I] = Sz + Idx;
9006 if (AltScalars)
9007 AltScalars->push_back(OpInst);
9008 } else {
9009 Mask[I] = Idx;
9010 if (OpScalars)
9011 OpScalars->push_back(OpInst);
9012 }
9013 }
9014 if (!ReuseShuffleIndices.empty()) {
9015 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9016 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9017 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9018 });
9019 Mask.swap(NewMask);
9020 }
9021}
9022
9024 const Instruction *MainOp,
9025 const Instruction *AltOp,
9026 const TargetLibraryInfo &TLI) {
9027 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9028 auto *AltCI = cast<CmpInst>(AltOp);
9029 CmpInst::Predicate MainP = MainCI->getPredicate();
9030 CmpInst::Predicate AltP = AltCI->getPredicate();
9031 assert(MainP != AltP && "Expected different main/alternate predicates.");
9032 auto *CI = cast<CmpInst>(I);
9033 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9034 return false;
9035 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9036 return true;
9037 CmpInst::Predicate P = CI->getPredicate();
9039
9040 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9041 "CmpInst expected to match either main or alternate predicate or "
9042 "their swap.");
9043 (void)AltP;
9044 return MainP != P && MainP != SwappedP;
9045 }
9046 return I->getOpcode() == AltOp->getOpcode();
9047}
9048
9049TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9050 assert(!Ops.empty());
9051 const auto *Op0 = Ops.front();
9052
9053 const bool IsConstant = all_of(Ops, [](Value *V) {
9054 // TODO: We should allow undef elements here
9055 return isConstant(V) && !isa<UndefValue>(V);
9056 });
9057 const bool IsUniform = all_of(Ops, [=](Value *V) {
9058 // TODO: We should allow undef elements here
9059 return V == Op0;
9060 });
9061 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9062 // TODO: We should allow undef elements here
9063 if (auto *CI = dyn_cast<ConstantInt>(V))
9064 return CI->getValue().isPowerOf2();
9065 return false;
9066 });
9067 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9068 // TODO: We should allow undef elements here
9069 if (auto *CI = dyn_cast<ConstantInt>(V))
9070 return CI->getValue().isNegatedPowerOf2();
9071 return false;
9072 });
9073
9075 if (IsConstant && IsUniform)
9077 else if (IsConstant)
9079 else if (IsUniform)
9081
9083 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9084 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9085
9086 return {VK, VP};
9087}
9088
9089namespace {
9090/// The base class for shuffle instruction emission and shuffle cost estimation.
9091class BaseShuffleAnalysis {
9092protected:
9093 Type *ScalarTy = nullptr;
9094
9095 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9096
9097 /// V is expected to be a vectorized value.
9098 /// When REVEC is disabled, there is no difference between VF and
9099 /// VNumElements.
9100 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9101 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9102 /// of 8.
9103 unsigned getVF(Value *V) const {
9104 assert(V && "V cannot be nullptr");
9105 assert(isa<FixedVectorType>(V->getType()) &&
9106 "V does not have FixedVectorType");
9107 assert(ScalarTy && "ScalarTy cannot be nullptr");
9108 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9109 unsigned VNumElements =
9110 cast<FixedVectorType>(V->getType())->getNumElements();
9111 assert(VNumElements > ScalarTyNumElements &&
9112 "the number of elements of V is not large enough");
9113 assert(VNumElements % ScalarTyNumElements == 0 &&
9114 "the number of elements of V is not a vectorized value");
9115 return VNumElements / ScalarTyNumElements;
9116 }
9117
9118 /// Checks if the mask is an identity mask.
9119 /// \param IsStrict if is true the function returns false if mask size does
9120 /// not match vector size.
9121 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9122 bool IsStrict) {
9123 int Limit = Mask.size();
9124 int VF = VecTy->getNumElements();
9125 int Index = -1;
9126 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9127 return true;
9128 if (!IsStrict) {
9129 // Consider extract subvector starting from index 0.
9131 Index == 0)
9132 return true;
9133 // All VF-size submasks are identity (e.g.
9134 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9135 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9136 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9137 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9139 }))
9140 return true;
9141 }
9142 return false;
9143 }
9144
9145 /// Tries to combine 2 different masks into single one.
9146 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9147 /// change the size of the vector, \p LocalVF is the original size of the
9148 /// shuffled vector.
9149 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9150 ArrayRef<int> ExtMask) {
9151 unsigned VF = Mask.size();
9152 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9153 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9154 if (ExtMask[I] == PoisonMaskElem)
9155 continue;
9156 int MaskedIdx = Mask[ExtMask[I] % VF];
9157 NewMask[I] =
9158 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9159 }
9160 Mask.swap(NewMask);
9161 }
9162
9163 /// Looks through shuffles trying to reduce final number of shuffles in the
9164 /// code. The function looks through the previously emitted shuffle
9165 /// instructions and properly mark indices in mask as undef.
9166 /// For example, given the code
9167 /// \code
9168 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9169 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9170 /// \endcode
9171 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9172 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9173 /// <0, 1, 2, 3> for the shuffle.
9174 /// If 2 operands are of different size, the smallest one will be resized and
9175 /// the mask recalculated properly.
9176 /// For example, given the code
9177 /// \code
9178 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9179 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9180 /// \endcode
9181 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9182 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9183 /// <0, 1, 2, 3> for the shuffle.
9184 /// So, it tries to transform permutations to simple vector merge, if
9185 /// possible.
9186 /// \param V The input vector which must be shuffled using the given \p Mask.
9187 /// If the better candidate is found, \p V is set to this best candidate
9188 /// vector.
9189 /// \param Mask The input mask for the shuffle. If the best candidate is found
9190 /// during looking-through-shuffles attempt, it is updated accordingly.
9191 /// \param SinglePermute true if the shuffle operation is originally a
9192 /// single-value-permutation. In this case the look-through-shuffles procedure
9193 /// may look for resizing shuffles as the best candidates.
9194 /// \return true if the shuffle results in the non-resizing identity shuffle
9195 /// (and thus can be ignored), false - otherwise.
9196 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9197 bool SinglePermute) {
9198 Value *Op = V;
9199 ShuffleVectorInst *IdentityOp = nullptr;
9200 SmallVector<int> IdentityMask;
9201 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9202 // Exit if not a fixed vector type or changing size shuffle.
9203 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9204 if (!SVTy)
9205 break;
9206 // Remember the identity or broadcast mask, if it is not a resizing
9207 // shuffle. If no better candidates are found, this Op and Mask will be
9208 // used in the final shuffle.
9209 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9210 if (!IdentityOp || !SinglePermute ||
9211 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9213 IdentityMask.size()))) {
9214 IdentityOp = SV;
9215 // Store current mask in the IdentityMask so later we did not lost
9216 // this info if IdentityOp is selected as the best candidate for the
9217 // permutation.
9218 IdentityMask.assign(Mask);
9219 }
9220 }
9221 // Remember the broadcast mask. If no better candidates are found, this Op
9222 // and Mask will be used in the final shuffle.
9223 // Zero splat can be used as identity too, since it might be used with
9224 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9225 // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9226 // expensive, the analysis founds out, that the source vector is just a
9227 // broadcast, this original mask can be transformed to identity mask <0,
9228 // 1, 2, 3>.
9229 // \code
9230 // %0 = shuffle %v, poison, zeroinitalizer
9231 // %res = shuffle %0, poison, <3, 1, 2, 0>
9232 // \endcode
9233 // may be transformed to
9234 // \code
9235 // %0 = shuffle %v, poison, zeroinitalizer
9236 // %res = shuffle %0, poison, <0, 1, 2, 3>
9237 // \endcode
9238 if (SV->isZeroEltSplat()) {
9239 IdentityOp = SV;
9240 IdentityMask.assign(Mask);
9241 }
9242 int LocalVF = Mask.size();
9243 if (auto *SVOpTy =
9244 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9245 LocalVF = SVOpTy->getNumElements();
9246 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9247 for (auto [Idx, I] : enumerate(Mask)) {
9248 if (I == PoisonMaskElem ||
9249 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9250 continue;
9251 ExtMask[Idx] = SV->getMaskValue(I);
9252 }
9253 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9254 SV->getOperand(0),
9255 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9256 .all();
9257 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9258 SV->getOperand(1),
9259 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9260 .all();
9261 if (!IsOp1Undef && !IsOp2Undef) {
9262 // Update mask and mark undef elems.
9263 for (int &I : Mask) {
9264 if (I == PoisonMaskElem)
9265 continue;
9266 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9268 I = PoisonMaskElem;
9269 }
9270 break;
9271 }
9272 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9273 combineMasks(LocalVF, ShuffleMask, Mask);
9274 Mask.swap(ShuffleMask);
9275 if (IsOp2Undef)
9276 Op = SV->getOperand(0);
9277 else
9278 Op = SV->getOperand(1);
9279 }
9280 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9281 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9283 if (IdentityOp) {
9284 V = IdentityOp;
9285 assert(Mask.size() == IdentityMask.size() &&
9286 "Expected masks of same sizes.");
9287 // Clear known poison elements.
9288 for (auto [I, Idx] : enumerate(Mask))
9289 if (Idx == PoisonMaskElem)
9290 IdentityMask[I] = PoisonMaskElem;
9291 Mask.swap(IdentityMask);
9292 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9293 return SinglePermute &&
9294 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9295 /*IsStrict=*/true) ||
9296 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9297 Shuffle->isZeroEltSplat() &&
9299 }
9300 V = Op;
9301 return false;
9302 }
9303 V = Op;
9304 return true;
9305 }
9306
9307 /// Smart shuffle instruction emission, walks through shuffles trees and
9308 /// tries to find the best matching vector for the actual shuffle
9309 /// instruction.
9310 template <typename T, typename ShuffleBuilderTy>
9311 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9312 ShuffleBuilderTy &Builder) {
9313 assert(V1 && "Expected at least one vector value.");
9314 if (V2)
9315 Builder.resizeToMatch(V1, V2);
9316 int VF = Mask.size();
9317 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9318 VF = FTy->getNumElements();
9319 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9320 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9321 .all()) {
9322 // Peek through shuffles.
9323 Value *Op1 = V1;
9324 Value *Op2 = V2;
9325 int VF =
9326 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9327 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9328 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9329 for (int I = 0, E = Mask.size(); I < E; ++I) {
9330 if (Mask[I] < VF)
9331 CombinedMask1[I] = Mask[I];
9332 else
9333 CombinedMask2[I] = Mask[I] - VF;
9334 }
9335 Value *PrevOp1;
9336 Value *PrevOp2;
9337 do {
9338 PrevOp1 = Op1;
9339 PrevOp2 = Op2;
9340 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9341 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9342 // Check if we have 2 resizing shuffles - need to peek through operands
9343 // again.
9344 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9345 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9346 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9347 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9348 if (I == PoisonMaskElem)
9349 continue;
9350 ExtMask1[Idx] = SV1->getMaskValue(I);
9351 }
9352 SmallBitVector UseMask1 = buildUseMask(
9353 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9354 ->getNumElements(),
9355 ExtMask1, UseMask::SecondArg);
9356 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9357 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9358 if (I == PoisonMaskElem)
9359 continue;
9360 ExtMask2[Idx] = SV2->getMaskValue(I);
9361 }
9362 SmallBitVector UseMask2 = buildUseMask(
9363 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9364 ->getNumElements(),
9365 ExtMask2, UseMask::SecondArg);
9366 if (SV1->getOperand(0)->getType() ==
9367 SV2->getOperand(0)->getType() &&
9368 SV1->getOperand(0)->getType() != SV1->getType() &&
9369 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9370 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9371 Op1 = SV1->getOperand(0);
9372 Op2 = SV2->getOperand(0);
9373 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9374 int LocalVF = ShuffleMask1.size();
9375 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9376 LocalVF = FTy->getNumElements();
9377 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9378 CombinedMask1.swap(ShuffleMask1);
9379 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9380 LocalVF = ShuffleMask2.size();
9381 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9382 LocalVF = FTy->getNumElements();
9383 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9384 CombinedMask2.swap(ShuffleMask2);
9385 }
9386 }
9387 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9388 Builder.resizeToMatch(Op1, Op2);
9389 VF = std::max(cast<VectorType>(Op1->getType())
9390 ->getElementCount()
9391 .getKnownMinValue(),
9392 cast<VectorType>(Op2->getType())
9393 ->getElementCount()
9394 .getKnownMinValue());
9395 for (int I = 0, E = Mask.size(); I < E; ++I) {
9396 if (CombinedMask2[I] != PoisonMaskElem) {
9397 assert(CombinedMask1[I] == PoisonMaskElem &&
9398 "Expected undefined mask element");
9399 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9400 }
9401 }
9402 if (Op1 == Op2 &&
9403 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9404 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9405 isa<ShuffleVectorInst>(Op1) &&
9406 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9407 ArrayRef(CombinedMask1))))
9408 return Builder.createIdentity(Op1);
9409 return Builder.createShuffleVector(
9410 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9411 CombinedMask1);
9412 }
9413 if (isa<PoisonValue>(V1))
9414 return Builder.createPoison(
9415 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9416 SmallVector<int> NewMask(Mask);
9417 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9418 assert(V1 && "Expected non-null value after looking through shuffles.");
9419
9420 if (!IsIdentity)
9421 return Builder.createShuffleVector(V1, NewMask);
9422 return Builder.createIdentity(V1);
9423 }
9424};
9425} // namespace
9426
9427/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9428static std::pair<InstructionCost, InstructionCost>
9430 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9431 Type *ScalarTy, VectorType *VecTy) {
9432 InstructionCost ScalarCost = 0;
9433 InstructionCost VecCost = 0;
9434 // Here we differentiate two cases: (1) when Ptrs represent a regular
9435 // vectorization tree node (as they are pointer arguments of scattered
9436 // loads) or (2) when Ptrs are the arguments of loads or stores being
9437 // vectorized as plane wide unit-stride load/store since all the
9438 // loads/stores are known to be from/to adjacent locations.
9439 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9440 // Case 2: estimate costs for pointer related costs when vectorizing to
9441 // a wide load/store.
9442 // Scalar cost is estimated as a set of pointers with known relationship
9443 // between them.
9444 // For vector code we will use BasePtr as argument for the wide load/store
9445 // but we also need to account all the instructions which are going to
9446 // stay in vectorized code due to uses outside of these scalar
9447 // loads/stores.
9448 ScalarCost = TTI.getPointersChainCost(
9449 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9450 CostKind);
9451
9452 SmallVector<const Value *> PtrsRetainedInVecCode;
9453 for (Value *V : Ptrs) {
9454 if (V == BasePtr) {
9455 PtrsRetainedInVecCode.push_back(V);
9456 continue;
9457 }
9458 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9459 // For simplicity assume Ptr to stay in vectorized code if it's not a
9460 // GEP instruction. We don't care since it's cost considered free.
9461 // TODO: We should check for any uses outside of vectorizable tree
9462 // rather than just single use.
9463 if (!Ptr || !Ptr->hasOneUse())
9464 PtrsRetainedInVecCode.push_back(V);
9465 }
9466
9467 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9468 // If all pointers stay in vectorized code then we don't have
9469 // any savings on that.
9470 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9471 }
9472 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9473 TTI::PointersChainInfo::getKnownStride(),
9474 VecTy, CostKind);
9475 } else {
9476 // Case 1: Ptrs are the arguments of loads that we are going to transform
9477 // into masked gather load intrinsic.
9478 // All the scalar GEPs will be removed as a result of vectorization.
9479 // For any external uses of some lanes extract element instructions will
9480 // be generated (which cost is estimated separately).
9481 TTI::PointersChainInfo PtrsInfo =
9482 all_of(Ptrs,
9483 [](const Value *V) {
9484 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9485 return Ptr && !Ptr->hasAllConstantIndices();
9486 })
9487 ? TTI::PointersChainInfo::getUnknownStride()
9488 : TTI::PointersChainInfo::getKnownStride();
9489
9490 ScalarCost =
9491 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9492 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9493 if (!BaseGEP) {
9494 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9495 if (It != Ptrs.end())
9496 BaseGEP = cast<GEPOperator>(*It);
9497 }
9498 if (BaseGEP) {
9499 SmallVector<const Value *> Indices(BaseGEP->indices());
9500 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9501 BaseGEP->getPointerOperand(), Indices, VecTy,
9502 CostKind);
9503 }
9504 }
9505
9506 return std::make_pair(ScalarCost, VecCost);
9507}
9508
9509void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9510 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9511 "Expected gather node without reordering.");
9513 SmallSet<size_t, 2> LoadKeyUsed;
9514
9515 // Do not reorder nodes if it small (just 2 elements), all-constant or all
9516 // instructions have same opcode already.
9517 if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
9518 all_of(TE.Scalars, isConstant))
9519 return;
9520
9521 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9522 return VectorizableTree[Idx]->isSame(TE.Scalars);
9523 }))
9524 return;
9525
9526 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9527 Key = hash_combine(hash_value(LI->getParent()), Key);
9528 Value *Ptr =
9530 if (LoadKeyUsed.contains(Key)) {
9531 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9532 if (LIt != LoadsMap.end()) {
9533 for (LoadInst *RLI : LIt->second) {
9534 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9535 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9536 /*StrictCheck=*/true))
9537 return hash_value(RLI->getPointerOperand());
9538 }
9539 for (LoadInst *RLI : LIt->second) {
9541 LI->getPointerOperand(), *TLI)) {
9542 hash_code SubKey = hash_value(RLI->getPointerOperand());
9543 return SubKey;
9544 }
9545 }
9546 if (LIt->second.size() > 2) {
9547 hash_code SubKey =
9548 hash_value(LIt->second.back()->getPointerOperand());
9549 return SubKey;
9550 }
9551 }
9552 }
9553 LoadKeyUsed.insert(Key);
9554 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9555 return hash_value(LI->getPointerOperand());
9556 };
9559 bool IsOrdered = true;
9560 unsigned NumInstructions = 0;
9561 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9562 // nodes.
9563 for (auto [I, V] : enumerate(TE.Scalars)) {
9564 size_t Key = 1, Idx = 1;
9565 if (auto *Inst = dyn_cast<Instruction>(V);
9566 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9567 !isDeleted(Inst) && !isVectorized(V)) {
9568 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9569 /*AllowAlternate=*/false);
9570 ++NumInstructions;
9571 }
9572 auto &Container = SortedValues[Key];
9573 if (IsOrdered && !KeyToIndex.contains(V) &&
9574 !(isa<Constant, ExtractElementInst>(V) ||
9576 ((Container.contains(Idx) &&
9577 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9578 (!Container.empty() && !Container.contains(Idx) &&
9579 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9580 IsOrdered = false;
9581 auto &KTI = KeyToIndex[V];
9582 if (KTI.empty())
9583 Container[Idx].push_back(V);
9584 KTI.push_back(I);
9585 }
9587 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9588 if (!IsOrdered && NumInstructions > 1) {
9589 unsigned Cnt = 0;
9590 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9591 for (const auto &D : SortedValues) {
9592 for (const auto &P : D.second) {
9593 unsigned Sz = 0;
9594 for (Value *V : P.second) {
9595 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9596 for (auto [K, Idx] : enumerate(Indices)) {
9597 TE.ReorderIndices[Cnt + K] = Idx;
9598 TE.Scalars[Cnt + K] = V;
9599 }
9600 Sz += Indices.size();
9601 Cnt += Indices.size();
9602 }
9603 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9604 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9605 *TTI, TE.Scalars.front()->getType(), Sz);
9606 SubVectors.emplace_back(Cnt - Sz, SubVF);
9607 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9608 DemandedElts.clearBit(I);
9609 } else if (!P.second.empty() && isConstant(P.second.front())) {
9610 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9611 DemandedElts.clearBit(I);
9612 }
9613 }
9614 }
9615 }
9616 // Reuses always require shuffles, so consider it as profitable.
9617 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9618 return;
9619 // Do simple cost estimation.
9622 auto *ScalarTy = TE.Scalars.front()->getType();
9623 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9624 for (auto [Idx, Sz] : SubVectors) {
9626 Idx, getWidenedType(ScalarTy, Sz));
9627 }
9628 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9629 assert(SLPReVec && "Only supported by REVEC.");
9630 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9631 // of CreateInsertElement.
9632 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9633 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9634 if (DemandedElts[I])
9635 Cost +=
9636 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9637 CostKind, I * ScalarTyNumElements, FTy);
9638 } else {
9639 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9640 /*Extract=*/false, CostKind);
9641 }
9642 int Sz = TE.Scalars.size();
9643 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9644 TE.ReorderIndices.end());
9645 for (unsigned I : seq<unsigned>(Sz)) {
9646 Value *V = TE.getOrdered(I);
9647 if (isa<PoisonValue>(V)) {
9648 ReorderMask[I] = PoisonMaskElem;
9649 } else if (isConstant(V) || DemandedElts[I]) {
9650 ReorderMask[I] = I + TE.ReorderIndices.size();
9651 }
9652 }
9654 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9657 VecTy, ReorderMask);
9658 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9659 ReorderMask.assign(Sz, PoisonMaskElem);
9660 for (unsigned I : seq<unsigned>(Sz)) {
9661 Value *V = TE.getOrdered(I);
9662 if (isConstant(V)) {
9663 DemandedElts.clearBit(I);
9664 if (!isa<PoisonValue>(V))
9665 ReorderMask[I] = I;
9666 } else {
9667 ReorderMask[I] = I + Sz;
9668 }
9669 }
9671 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9672 if (!DemandedElts.isAllOnes())
9673 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9674 if (Cost >= BVCost) {
9675 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9676 reorderScalars(TE.Scalars, Mask);
9677 TE.ReorderIndices.clear();
9678 }
9679}
9680
9683 BaseGraphSize = VectorizableTree.size();
9684 // Turn graph transforming mode on and off, when done.
9685 class GraphTransformModeRAAI {
9686 bool &SavedIsGraphTransformMode;
9687
9688 public:
9689 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9690 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9691 IsGraphTransformMode = true;
9692 }
9693 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9694 } TransformContext(IsGraphTransformMode);
9695 // Operands are profitable if they are:
9696 // 1. At least one constant
9697 // or
9698 // 2. Splats
9699 // or
9700 // 3. Results in good vectorization opportunity, i.e. may generate vector
9701 // nodes and reduce cost of the graph.
9702 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9703 const InstructionsState &S) {
9705 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9706 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9707 I2->getOperand(Op));
9708 return all_of(
9709 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9710 return all_of(Cand,
9711 [](const std::pair<Value *, Value *> &P) {
9712 return isa<Constant>(P.first) ||
9713 isa<Constant>(P.second) || P.first == P.second;
9714 }) ||
9716 });
9717 };
9718
9719 // Try to reorder gather nodes for better vectorization opportunities.
9720 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9721 TreeEntry &E = *VectorizableTree[Idx];
9722 if (E.isGather())
9723 reorderGatherNode(E);
9724 }
9725
9726 // The tree may grow here, so iterate over nodes, built before.
9727 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9728 TreeEntry &E = *VectorizableTree[Idx];
9729 if (E.isGather()) {
9730 ArrayRef<Value *> VL = E.Scalars;
9731 const unsigned Sz = getVectorElementSize(VL.front());
9732 unsigned MinVF = getMinVF(2 * Sz);
9733 // Do not try partial vectorization for small nodes (<= 2), nodes with the
9734 // same opcode and same parent block or all constants.
9735 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9736 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9737 E.isAltShuffle() || !allSameBlock(VL)) ||
9738 allConstant(VL) || isSplat(VL))
9739 continue;
9740 // Try to find vectorizable sequences and transform them into a series of
9741 // insertvector instructions.
9742 unsigned StartIdx = 0;
9743 unsigned End = VL.size();
9744 for (unsigned VF = getFloorFullVectorNumberOfElements(
9745 *TTI, VL.front()->getType(), VL.size() - 1);
9746 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9747 *TTI, VL.front()->getType(), VF - 1)) {
9748 if (StartIdx + VF > End)
9749 continue;
9751 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9752 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9753 // If any instruction is vectorized already - do not try again.
9754 // Reuse the existing node, if it fully matches the slice.
9755 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9756 SE || getTreeEntry(Slice.back())) {
9757 if (!SE)
9758 continue;
9759 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9760 continue;
9761 }
9762 // Constant already handled effectively - skip.
9763 if (allConstant(Slice))
9764 continue;
9765 // Do not try to vectorize small splats (less than vector register and
9766 // only with the single non-undef element).
9767 bool IsSplat = isSplat(Slice);
9768 if (Slices.empty() || !IsSplat ||
9769 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9770 Slice.front()->getType(), VF)),
9771 1U, VF - 1) !=
9773 Slice.front()->getType(), 2 * VF)),
9774 1U, 2 * VF)) ||
9775 count(Slice, Slice.front()) ==
9776 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9777 : 1)) {
9778 if (IsSplat)
9779 continue;
9780 InstructionsState S = getSameOpcode(Slice, *TLI);
9781 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9782 (S.getOpcode() == Instruction::Load &&
9784 (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9785 continue;
9786 if (VF == 2) {
9787 // Try to vectorize reduced values or if all users are vectorized.
9788 // For expensive instructions extra extracts might be profitable.
9789 if ((!UserIgnoreList || E.Idx != 0) &&
9790 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9792 !all_of(Slice, [&](Value *V) {
9793 if (isa<PoisonValue>(V))
9794 return true;
9795 return areAllUsersVectorized(cast<Instruction>(V),
9796 UserIgnoreList);
9797 }))
9798 continue;
9799 if (S.getOpcode() == Instruction::Load) {
9800 OrdersType Order;
9801 SmallVector<Value *> PointerOps;
9802 LoadsState Res =
9803 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9804 // Do not vectorize gathers.
9805 if (Res == LoadsState::ScatterVectorize ||
9806 Res == LoadsState::Gather) {
9807 if (Res == LoadsState::Gather) {
9809 // If reductions and the scalars from the root node are
9810 // analyzed - mark as non-vectorizable reduction.
9811 if (UserIgnoreList && E.Idx == 0)
9812 analyzedReductionVals(Slice);
9813 }
9814 continue;
9815 }
9816 } else if (S.getOpcode() == Instruction::ExtractElement ||
9817 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9819 !CheckOperandsProfitability(
9820 S.getMainOp(),
9821 cast<Instruction>(*find_if(reverse(Slice),
9822 IsaPred<Instruction>)),
9823 S))) {
9824 // Do not vectorize extractelements (handled effectively
9825 // alread). Do not vectorize non-profitable instructions (with
9826 // low cost and non-vectorizable operands.)
9827 continue;
9828 }
9829 }
9830 }
9831 Slices.emplace_back(Cnt, Slice.size());
9832 }
9833 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9834 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9835 if (StartIdx == Cnt)
9836 StartIdx = Cnt + Sz;
9837 if (End == Cnt + Sz)
9838 End = Cnt;
9839 };
9840 for (auto [Cnt, Sz] : Slices) {
9841 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9842 // If any instruction is vectorized already - do not try again.
9843 if (TreeEntry *SE = getTreeEntry(Slice.front());
9844 SE || getTreeEntry(Slice.back())) {
9845 if (!SE)
9846 continue;
9847 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9848 continue;
9849 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9850 AddCombinedNode(SE->Idx, Cnt, Sz);
9851 continue;
9852 }
9853 unsigned PrevSize = VectorizableTree.size();
9854 [[maybe_unused]] unsigned PrevEntriesSize =
9855 LoadEntriesToVectorize.size();
9856 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9857 if (PrevSize + 1 == VectorizableTree.size() &&
9858 VectorizableTree[PrevSize]->isGather() &&
9859 VectorizableTree[PrevSize]->getOpcode() !=
9860 Instruction::ExtractElement &&
9861 !isSplat(Slice)) {
9862 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9863 analyzedReductionVals(Slice);
9864 VectorizableTree.pop_back();
9865 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9866 "LoadEntriesToVectorize expected to remain the same");
9867 continue;
9868 }
9869 AddCombinedNode(PrevSize, Cnt, Sz);
9870 }
9871 }
9872 // Restore ordering, if no extra vectorization happened.
9873 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9874 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9875 reorderScalars(E.Scalars, Mask);
9876 E.ReorderIndices.clear();
9877 }
9878 }
9879 switch (E.getOpcode()) {
9880 case Instruction::Load: {
9881 // No need to reorder masked gather loads, just reorder the scalar
9882 // operands.
9883 if (E.State != TreeEntry::Vectorize)
9884 break;
9885 Type *ScalarTy = E.getMainOp()->getType();
9886 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9887 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9888 // Check if profitable to represent consecutive load + reverse as strided
9889 // load with stride -1.
9890 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9891 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9892 SmallVector<int> Mask;
9893 inversePermutation(E.ReorderIndices, Mask);
9894 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9895 InstructionCost OriginalVecCost =
9896 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9901 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9902 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9903 if (StridedCost < OriginalVecCost)
9904 // Strided load is more profitable than consecutive load + reverse -
9905 // transform the node to strided load.
9906 E.State = TreeEntry::StridedVectorize;
9907 }
9908 break;
9909 }
9910 case Instruction::Store: {
9911 Type *ScalarTy =
9912 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9913 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9914 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9915 // Check if profitable to represent consecutive load + reverse as strided
9916 // load with stride -1.
9917 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9918 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9919 SmallVector<int> Mask;
9920 inversePermutation(E.ReorderIndices, Mask);
9921 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9922 InstructionCost OriginalVecCost =
9923 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9928 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9929 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
9930 if (StridedCost < OriginalVecCost)
9931 // Strided store is more profitable than reverse + consecutive store -
9932 // transform the node to strided store.
9933 E.State = TreeEntry::StridedVectorize;
9934 } else if (!E.ReorderIndices.empty()) {
9935 // Check for interleaved stores.
9936 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
9937 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9938 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
9939 if (Mask.size() < 4)
9940 return 0u;
9941 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9943 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
9945 VecTy, Factor, BaseSI->getAlign(),
9946 BaseSI->getPointerAddressSpace()))
9947 return Factor;
9948 }
9949
9950 return 0u;
9951 };
9952 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9953 unsigned InterleaveFactor = IsInterleaveMask(Mask);
9954 if (InterleaveFactor != 0)
9955 E.setInterleave(InterleaveFactor);
9956 }
9957 break;
9958 }
9959 case Instruction::Select: {
9960 if (E.State != TreeEntry::Vectorize)
9961 break;
9962 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
9963 if (MinMaxID == Intrinsic::not_intrinsic)
9964 break;
9965 // This node is a minmax node.
9966 E.CombinedOp = TreeEntry::MinMax;
9967 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
9968 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9969 CondEntry->State == TreeEntry::Vectorize) {
9970 // The condition node is part of the combined minmax node.
9971 CondEntry->State = TreeEntry::CombinedVectorize;
9972 }
9973 break;
9974 }
9975 default:
9976 break;
9977 }
9978 }
9979
9980 if (LoadEntriesToVectorize.empty()) {
9981 // Single load node - exit.
9982 if (VectorizableTree.size() <= 1 &&
9983 VectorizableTree.front()->getOpcode() == Instruction::Load)
9984 return;
9985 // Small graph with small VF - exit.
9986 constexpr unsigned SmallTree = 3;
9987 constexpr unsigned SmallVF = 2;
9988 if ((VectorizableTree.size() <= SmallTree &&
9989 VectorizableTree.front()->Scalars.size() == SmallVF) ||
9990 (VectorizableTree.size() <= 2 && UserIgnoreList))
9991 return;
9992
9993 if (VectorizableTree.front()->isNonPowOf2Vec() &&
9994 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
9995 getCanonicalGraphSize() <= SmallTree &&
9996 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
9997 [](const std::unique_ptr<TreeEntry> &TE) {
9998 return TE->isGather() &&
9999 TE->getOpcode() == Instruction::Load &&
10000 !allSameBlock(TE->Scalars);
10001 }) == 1)
10002 return;
10003 }
10004
10005 // A list of loads to be gathered during the vectorization process. We can
10006 // try to vectorize them at the end, if profitable.
10009 GatheredLoads;
10010
10011 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10012 TreeEntry &E = *TE;
10013 if (E.isGather() &&
10014 (E.getOpcode() == Instruction::Load ||
10015 (!E.getOpcode() && any_of(E.Scalars,
10016 [&](Value *V) {
10017 return isa<LoadInst>(V) &&
10018 !isVectorized(V) &&
10019 !isDeleted(cast<Instruction>(V));
10020 }))) &&
10021 !isSplat(E.Scalars)) {
10022 for (Value *V : E.Scalars) {
10023 auto *LI = dyn_cast<LoadInst>(V);
10024 if (!LI)
10025 continue;
10026 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10027 continue;
10029 *this, V, *DL, *SE, *TTI,
10030 GatheredLoads[std::make_tuple(
10031 LI->getParent(),
10033 LI->getType())]);
10034 }
10035 }
10036 }
10037 // Try to vectorize gathered loads if this is not just a gather of loads.
10038 if (!GatheredLoads.empty())
10039 tryToVectorizeGatheredLoads(GatheredLoads);
10040}
10041
10042/// Merges shuffle masks and emits final shuffle instruction, if required. It
10043/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
10044/// when the actual shuffle instruction is generated only if this is actually
10045/// required. Otherwise, the shuffle instruction emission is delayed till the
10046/// end of the process, to reduce the number of emitted instructions and further
10047/// analysis/transformations.
10048class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10049 bool IsFinalized = false;
10050 SmallVector<int> CommonMask;
10052 const TargetTransformInfo &TTI;
10054 SmallDenseSet<Value *> VectorizedVals;
10055 BoUpSLP &R;
10056 SmallPtrSetImpl<Value *> &CheckedExtracts;
10057 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10058 /// While set, still trying to estimate the cost for the same nodes and we
10059 /// can delay actual cost estimation (virtual shuffle instruction emission).
10060 /// May help better estimate the cost if same nodes must be permuted + allows
10061 /// to move most of the long shuffles cost estimation to TTI.
10062 bool SameNodesEstimated = true;
10063
10064 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10065 if (Ty->getScalarType()->isPointerTy()) {
10069 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10070 Ty->getScalarType());
10071 if (auto *VTy = dyn_cast<VectorType>(Ty))
10072 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10073 return Res;
10074 }
10075 return Constant::getAllOnesValue(Ty);
10076 }
10077
10078 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10079 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10080 return TTI::TCC_Free;
10081 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10082 InstructionCost GatherCost = 0;
10083 SmallVector<Value *> Gathers(VL);
10084 if (!Root && isSplat(VL)) {
10085 // Found the broadcasting of the single scalar, calculate the cost as
10086 // the broadcast.
10087 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10088 assert(It != VL.end() && "Expected at least one non-undef value.");
10089 // Add broadcast for non-identity shuffle only.
10090 bool NeedShuffle =
10091 count(VL, *It) > 1 &&
10092 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10093 if (!NeedShuffle) {
10094 if (isa<FixedVectorType>(ScalarTy)) {
10095 assert(SLPReVec && "FixedVectorType is not expected.");
10096 return TTI.getShuffleCost(
10097 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10098 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10099 cast<FixedVectorType>(ScalarTy));
10100 }
10101 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10102 CostKind, std::distance(VL.begin(), It),
10103 PoisonValue::get(VecTy), *It);
10104 }
10105
10106 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10107 transform(VL, ShuffleMask.begin(), [](Value *V) {
10108 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10109 });
10110 InstructionCost InsertCost =
10111 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10112 PoisonValue::get(VecTy), *It);
10113 return InsertCost + ::getShuffleCost(TTI,
10115 VecTy, ShuffleMask, CostKind,
10116 /*Index=*/0, /*SubTp=*/nullptr,
10117 /*Args=*/*It);
10118 }
10119 return GatherCost +
10120 (all_of(Gathers, IsaPred<UndefValue>)
10122 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10123 ScalarTy));
10124 };
10125
10126 /// Compute the cost of creating a vector containing the extracted values from
10127 /// \p VL.
10129 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10130 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10131 unsigned NumParts) {
10132 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10133 unsigned NumElts =
10134 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10135 auto *EE = dyn_cast<ExtractElementInst>(V);
10136 if (!EE)
10137 return Sz;
10138 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10139 if (!VecTy)
10140 return Sz;
10141 return std::max(Sz, VecTy->getNumElements());
10142 });
10143 // FIXME: this must be moved to TTI for better estimation.
10144 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10145 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10147 -> std::optional<TTI::ShuffleKind> {
10148 if (NumElts <= EltsPerVector)
10149 return std::nullopt;
10150 int OffsetReg0 =
10151 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10152 [](int S, int I) {
10153 if (I == PoisonMaskElem)
10154 return S;
10155 return std::min(S, I);
10156 }),
10157 EltsPerVector);
10158 int OffsetReg1 = OffsetReg0;
10159 DenseSet<int> RegIndices;
10160 // Check that if trying to permute same single/2 input vectors.
10162 int FirstRegId = -1;
10163 Indices.assign(1, OffsetReg0);
10164 for (auto [Pos, I] : enumerate(Mask)) {
10165 if (I == PoisonMaskElem)
10166 continue;
10167 int Idx = I - OffsetReg0;
10168 int RegId =
10169 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10170 if (FirstRegId < 0)
10171 FirstRegId = RegId;
10172 RegIndices.insert(RegId);
10173 if (RegIndices.size() > 2)
10174 return std::nullopt;
10175 if (RegIndices.size() == 2) {
10176 ShuffleKind = TTI::SK_PermuteTwoSrc;
10177 if (Indices.size() == 1) {
10178 OffsetReg1 = alignDown(
10179 std::accumulate(
10180 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10181 [&](int S, int I) {
10182 if (I == PoisonMaskElem)
10183 return S;
10184 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10185 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10186 if (RegId == FirstRegId)
10187 return S;
10188 return std::min(S, I);
10189 }),
10190 EltsPerVector);
10191 Indices.push_back(OffsetReg1 % NumElts);
10192 }
10193 Idx = I - OffsetReg1;
10194 }
10195 I = (Idx % NumElts) % EltsPerVector +
10196 (RegId == FirstRegId ? 0 : EltsPerVector);
10197 }
10198 return ShuffleKind;
10199 };
10201
10202 // Process extracts in blocks of EltsPerVector to check if the source vector
10203 // operand can be re-used directly. If not, add the cost of creating a
10204 // shuffle to extract the values into a vector register.
10205 for (unsigned Part : seq<unsigned>(NumParts)) {
10206 if (!ShuffleKinds[Part])
10207 continue;
10208 ArrayRef<int> MaskSlice = Mask.slice(
10209 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10210 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10211 copy(MaskSlice, SubMask.begin());
10213 std::optional<TTI::ShuffleKind> RegShuffleKind =
10214 CheckPerRegistersShuffle(SubMask, Indices);
10215 if (!RegShuffleKind) {
10216 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10218 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10219 Cost +=
10220 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10221 getWidenedType(ScalarTy, NumElts), MaskSlice);
10222 continue;
10223 }
10224 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10225 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10226 Cost +=
10227 ::getShuffleCost(TTI, *RegShuffleKind,
10228 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10229 }
10230 const unsigned BaseVF = getFullVectorNumberOfElements(
10231 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10232 for (unsigned Idx : Indices) {
10233 assert((Idx + EltsPerVector) <= BaseVF &&
10234 "SK_ExtractSubvector index out of range");
10236 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10237 Idx, getWidenedType(ScalarTy, EltsPerVector));
10238 }
10239 // Second attempt to check, if just a permute is better estimated than
10240 // subvector extract.
10241 SubMask.assign(NumElts, PoisonMaskElem);
10242 copy(MaskSlice, SubMask.begin());
10243 InstructionCost OriginalCost = ::getShuffleCost(
10244 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10245 if (OriginalCost < Cost)
10246 Cost = OriginalCost;
10247 }
10248 return Cost;
10249 }
10250 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10251 /// shuffle emission.
10252 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10253 ArrayRef<int> Mask) {
10254 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10255 if (Mask[Idx] != PoisonMaskElem)
10256 CommonMask[Idx] = Idx;
10257 }
10258 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10259 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10260 /// elements.
10261 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10262 ArrayRef<int> Mask, unsigned Part,
10263 unsigned SliceSize) {
10264 if (SameNodesEstimated) {
10265 // Delay the cost estimation if the same nodes are reshuffling.
10266 // If we already requested the cost of reshuffling of E1 and E2 before, no
10267 // need to estimate another cost with the sub-Mask, instead include this
10268 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10269 // estimation.
10270 if ((InVectors.size() == 2 &&
10271 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10272 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10273 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10274 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10275 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10276 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10277 "Expected all poisoned elements.");
10278 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10279 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10280 return;
10281 }
10282 // Found non-matching nodes - need to estimate the cost for the matched
10283 // and transform mask.
10284 Cost += createShuffle(InVectors.front(),
10285 InVectors.size() == 1 ? nullptr : InVectors.back(),
10286 CommonMask);
10287 transformMaskAfterShuffle(CommonMask, CommonMask);
10288 } else if (InVectors.size() == 2) {
10289 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10290 transformMaskAfterShuffle(CommonMask, CommonMask);
10291 }
10292 SameNodesEstimated = false;
10293 if (!E2 && InVectors.size() == 1) {
10294 unsigned VF = E1.getVectorFactor();
10295 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10296 VF = std::max(VF,
10297 cast<FixedVectorType>(V1->getType())->getNumElements());
10298 } else {
10299 const auto *E = cast<const TreeEntry *>(InVectors.front());
10300 VF = std::max(VF, E->getVectorFactor());
10301 }
10302 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10303 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10304 CommonMask[Idx] = Mask[Idx] + VF;
10305 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10306 transformMaskAfterShuffle(CommonMask, CommonMask);
10307 } else {
10308 auto P = InVectors.front();
10309 Cost += createShuffle(&E1, E2, Mask);
10310 unsigned VF = Mask.size();
10311 if (Value *V1 = P.dyn_cast<Value *>()) {
10312 VF = std::max(VF,
10313 getNumElements(V1->getType()));
10314 } else {
10315 const auto *E = cast<const TreeEntry *>(P);
10316 VF = std::max(VF, E->getVectorFactor());
10317 }
10318 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10319 if (Mask[Idx] != PoisonMaskElem)
10320 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10321 Cost += createShuffle(P, InVectors.front(), CommonMask);
10322 transformMaskAfterShuffle(CommonMask, CommonMask);
10323 }
10324 }
10325
10326 class ShuffleCostBuilder {
10327 const TargetTransformInfo &TTI;
10328
10329 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10330 int Index = -1;
10331 return Mask.empty() ||
10332 (VF == Mask.size() &&
10335 Index == 0);
10336 }
10337
10338 public:
10339 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10340 ~ShuffleCostBuilder() = default;
10341 InstructionCost createShuffleVector(Value *V1, Value *,
10342 ArrayRef<int> Mask) const {
10343 // Empty mask or identity mask are free.
10344 unsigned VF =
10345 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10346 if (isEmptyOrIdentity(Mask, VF))
10347 return TTI::TCC_Free;
10348 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10349 cast<VectorType>(V1->getType()), Mask);
10350 }
10351 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10352 // Empty mask or identity mask are free.
10353 unsigned VF =
10354 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10355 if (isEmptyOrIdentity(Mask, VF))
10356 return TTI::TCC_Free;
10357 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10358 cast<VectorType>(V1->getType()), Mask);
10359 }
10360 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10361 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10362 return TTI::TCC_Free;
10363 }
10364 void resizeToMatch(Value *&, Value *&) const {}
10365 };
10366
10367 /// Smart shuffle instruction emission, walks through shuffles trees and
10368 /// tries to find the best matching vector for the actual shuffle
10369 /// instruction.
10371 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10373 ArrayRef<int> Mask) {
10374 ShuffleCostBuilder Builder(TTI);
10375 SmallVector<int> CommonMask(Mask);
10376 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10377 unsigned CommonVF = Mask.size();
10378 InstructionCost ExtraCost = 0;
10379 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10380 unsigned VF) -> InstructionCost {
10381 if (E.isGather() && allConstant(E.Scalars))
10382 return TTI::TCC_Free;
10383 Type *EScalarTy = E.Scalars.front()->getType();
10384 bool IsSigned = true;
10385 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10386 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10387 IsSigned = It->second.second;
10388 }
10389 if (EScalarTy != ScalarTy) {
10390 unsigned CastOpcode = Instruction::Trunc;
10391 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10392 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10393 if (DstSz > SrcSz)
10394 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10395 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10396 getWidenedType(EScalarTy, VF),
10397 TTI::CastContextHint::None, CostKind);
10398 }
10399 return TTI::TCC_Free;
10400 };
10401 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10402 if (isa<Constant>(V))
10403 return TTI::TCC_Free;
10404 auto *VecTy = cast<VectorType>(V->getType());
10405 Type *EScalarTy = VecTy->getElementType();
10406 if (EScalarTy != ScalarTy) {
10407 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10408 unsigned CastOpcode = Instruction::Trunc;
10409 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10410 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10411 if (DstSz > SrcSz)
10412 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10413 return TTI.getCastInstrCost(
10414 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10415 VecTy, TTI::CastContextHint::None, CostKind);
10416 }
10417 return TTI::TCC_Free;
10418 };
10419 if (!V1 && !V2 && !P2.isNull()) {
10420 // Shuffle 2 entry nodes.
10421 const TreeEntry *E = cast<const TreeEntry *>(P1);
10422 unsigned VF = E->getVectorFactor();
10423 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10424 CommonVF = std::max(VF, E2->getVectorFactor());
10425 assert(all_of(Mask,
10426 [=](int Idx) {
10427 return Idx < 2 * static_cast<int>(CommonVF);
10428 }) &&
10429 "All elements in mask must be less than 2 * CommonVF.");
10430 if (E->Scalars.size() == E2->Scalars.size()) {
10431 SmallVector<int> EMask = E->getCommonMask();
10432 SmallVector<int> E2Mask = E2->getCommonMask();
10433 if (!EMask.empty() || !E2Mask.empty()) {
10434 for (int &Idx : CommonMask) {
10435 if (Idx == PoisonMaskElem)
10436 continue;
10437 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10438 Idx = EMask[Idx];
10439 else if (Idx >= static_cast<int>(CommonVF))
10440 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10441 E->Scalars.size();
10442 }
10443 }
10444 CommonVF = E->Scalars.size();
10445 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10446 GetNodeMinBWAffectedCost(*E2, CommonVF);
10447 } else {
10448 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10449 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10450 }
10451 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10452 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10453 } else if (!V1 && P2.isNull()) {
10454 // Shuffle single entry node.
10455 const TreeEntry *E = cast<const TreeEntry *>(P1);
10456 unsigned VF = E->getVectorFactor();
10457 CommonVF = VF;
10458 assert(
10459 all_of(Mask,
10460 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10461 "All elements in mask must be less than CommonVF.");
10462 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10463 SmallVector<int> EMask = E->getCommonMask();
10464 assert(!EMask.empty() && "Expected non-empty common mask.");
10465 for (int &Idx : CommonMask) {
10466 if (Idx != PoisonMaskElem)
10467 Idx = EMask[Idx];
10468 }
10469 CommonVF = E->Scalars.size();
10470 } else if (unsigned Factor = E->getInterleaveFactor();
10471 Factor > 0 && E->Scalars.size() != Mask.size() &&
10473 Factor)) {
10474 // Deinterleaved nodes are free.
10475 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10476 }
10477 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10478 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10479 // Not identity/broadcast? Try to see if the original vector is better.
10480 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10481 CommonVF == CommonMask.size() &&
10482 any_of(enumerate(CommonMask),
10483 [](const auto &&P) {
10484 return P.value() != PoisonMaskElem &&
10485 static_cast<unsigned>(P.value()) != P.index();
10486 }) &&
10487 any_of(CommonMask,
10488 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10489 SmallVector<int> ReorderMask;
10490 inversePermutation(E->ReorderIndices, ReorderMask);
10491 ::addMask(CommonMask, ReorderMask);
10492 }
10493 } else if (V1 && P2.isNull()) {
10494 // Shuffle single vector.
10495 ExtraCost += GetValueMinBWAffectedCost(V1);
10496 CommonVF = getVF(V1);
10497 assert(
10498 all_of(Mask,
10499 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10500 "All elements in mask must be less than CommonVF.");
10501 } else if (V1 && !V2) {
10502 // Shuffle vector and tree node.
10503 unsigned VF = getVF(V1);
10504 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10505 CommonVF = std::max(VF, E2->getVectorFactor());
10506 assert(all_of(Mask,
10507 [=](int Idx) {
10508 return Idx < 2 * static_cast<int>(CommonVF);
10509 }) &&
10510 "All elements in mask must be less than 2 * CommonVF.");
10511 if (E2->Scalars.size() == VF && VF != CommonVF) {
10512 SmallVector<int> E2Mask = E2->getCommonMask();
10513 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10514 for (int &Idx : CommonMask) {
10515 if (Idx == PoisonMaskElem)
10516 continue;
10517 if (Idx >= static_cast<int>(CommonVF))
10518 Idx = E2Mask[Idx - CommonVF] + VF;
10519 }
10520 CommonVF = VF;
10521 }
10522 ExtraCost += GetValueMinBWAffectedCost(V1);
10523 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10524 ExtraCost += GetNodeMinBWAffectedCost(
10525 *E2, std::min(CommonVF, E2->getVectorFactor()));
10526 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10527 } else if (!V1 && V2) {
10528 // Shuffle vector and tree node.
10529 unsigned VF = getVF(V2);
10530 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10531 CommonVF = std::max(VF, E1->getVectorFactor());
10532 assert(all_of(Mask,
10533 [=](int Idx) {
10534 return Idx < 2 * static_cast<int>(CommonVF);
10535 }) &&
10536 "All elements in mask must be less than 2 * CommonVF.");
10537 if (E1->Scalars.size() == VF && VF != CommonVF) {
10538 SmallVector<int> E1Mask = E1->getCommonMask();
10539 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10540 for (int &Idx : CommonMask) {
10541 if (Idx == PoisonMaskElem)
10542 continue;
10543 if (Idx >= static_cast<int>(CommonVF))
10544 Idx = E1Mask[Idx - CommonVF] + VF;
10545 else
10546 Idx = E1Mask[Idx];
10547 }
10548 CommonVF = VF;
10549 }
10550 ExtraCost += GetNodeMinBWAffectedCost(
10551 *E1, std::min(CommonVF, E1->getVectorFactor()));
10552 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10553 ExtraCost += GetValueMinBWAffectedCost(V2);
10554 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10555 } else {
10556 assert(V1 && V2 && "Expected both vectors.");
10557 unsigned VF = getVF(V1);
10558 CommonVF = std::max(VF, getVF(V2));
10559 assert(all_of(Mask,
10560 [=](int Idx) {
10561 return Idx < 2 * static_cast<int>(CommonVF);
10562 }) &&
10563 "All elements in mask must be less than 2 * CommonVF.");
10564 ExtraCost +=
10565 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10566 if (V1->getType() != V2->getType()) {
10567 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10568 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10569 } else {
10570 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10571 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10572 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10573 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10574 }
10575 }
10576 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10577 assert(SLPReVec && "FixedVectorType is not expected.");
10579 CommonMask);
10580 }
10581 InVectors.front() =
10582 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10583 if (InVectors.size() == 2)
10584 InVectors.pop_back();
10585 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10586 V1, V2, CommonMask, Builder);
10587 }
10588
10589public:
10591 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10592 SmallPtrSetImpl<Value *> &CheckedExtracts)
10593 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10594 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10595 CheckedExtracts(CheckedExtracts) {}
10596 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10597 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10598 unsigned NumParts, bool &UseVecBaseAsInput) {
10599 UseVecBaseAsInput = false;
10600 if (Mask.empty())
10601 return nullptr;
10602 Value *VecBase = nullptr;
10603 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10604 if (!E->ReorderIndices.empty()) {
10605 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10606 E->ReorderIndices.end());
10607 reorderScalars(VL, ReorderMask);
10608 }
10609 // Check if it can be considered reused if same extractelements were
10610 // vectorized already.
10611 bool PrevNodeFound = any_of(
10612 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10613 [&](const std::unique_ptr<TreeEntry> &TE) {
10614 return ((!TE->isAltShuffle() &&
10615 TE->getOpcode() == Instruction::ExtractElement) ||
10616 TE->isGather()) &&
10617 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10618 return VL.size() > Data.index() &&
10619 (Mask[Data.index()] == PoisonMaskElem ||
10620 isa<UndefValue>(VL[Data.index()]) ||
10621 Data.value() == VL[Data.index()]);
10622 });
10623 });
10624 SmallPtrSet<Value *, 4> UniqueBases;
10625 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10626 for (unsigned Part : seq<unsigned>(NumParts)) {
10627 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10628 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10629 for (auto [I, V] :
10630 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10631 // Ignore non-extractelement scalars.
10632 if (isa<UndefValue>(V) ||
10633 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10634 continue;
10635 // If all users of instruction are going to be vectorized and this
10636 // instruction itself is not going to be vectorized, consider this
10637 // instruction as dead and remove its cost from the final cost of the
10638 // vectorized tree.
10639 // Also, avoid adjusting the cost for extractelements with multiple uses
10640 // in different graph entries.
10641 auto *EE = cast<ExtractElementInst>(V);
10642 VecBase = EE->getVectorOperand();
10643 UniqueBases.insert(VecBase);
10644 const TreeEntry *VE = R.getTreeEntry(V);
10645 if (!CheckedExtracts.insert(V).second ||
10646 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10647 any_of(EE->users(),
10648 [&](User *U) {
10649 return isa<GetElementPtrInst>(U) &&
10650 !R.areAllUsersVectorized(cast<Instruction>(U),
10651 &VectorizedVals);
10652 }) ||
10653 (VE && VE != E))
10654 continue;
10655 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10656 if (!EEIdx)
10657 continue;
10658 unsigned Idx = *EEIdx;
10659 // Take credit for instruction that will become dead.
10660 if (EE->hasOneUse() || !PrevNodeFound) {
10661 Instruction *Ext = EE->user_back();
10662 if (isa<SExtInst, ZExtInst>(Ext) &&
10663 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10664 // Use getExtractWithExtendCost() to calculate the cost of
10665 // extractelement/ext pair.
10666 Cost -=
10667 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10668 EE->getVectorOperandType(), Idx);
10669 // Add back the cost of s|zext which is subtracted separately.
10671 Ext->getOpcode(), Ext->getType(), EE->getType(),
10672 TTI::getCastContextHint(Ext), CostKind, Ext);
10673 continue;
10674 }
10675 }
10676 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10677 CostKind, Idx);
10678 }
10679 }
10680 // Check that gather of extractelements can be represented as just a
10681 // shuffle of a single/two vectors the scalars are extracted from.
10682 // Found the bunch of extractelement instructions that must be gathered
10683 // into a vector and can be represented as a permutation elements in a
10684 // single input vector or of 2 input vectors.
10685 // Done for reused if same extractelements were vectorized already.
10686 if (!PrevNodeFound)
10687 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10688 InVectors.assign(1, E);
10689 CommonMask.assign(Mask.begin(), Mask.end());
10690 transformMaskAfterShuffle(CommonMask, CommonMask);
10691 SameNodesEstimated = false;
10692 if (NumParts != 1 && UniqueBases.size() != 1) {
10693 UseVecBaseAsInput = true;
10694 VecBase =
10695 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10696 }
10697 return VecBase;
10698 }
10699 /// Checks if the specified entry \p E needs to be delayed because of its
10700 /// dependency nodes.
10701 std::optional<InstructionCost>
10702 needToDelay(const TreeEntry *,
10704 // No need to delay the cost estimation during analysis.
10705 return std::nullopt;
10706 }
10707 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10708 if (&E1 == &E2) {
10709 assert(all_of(Mask,
10710 [&](int Idx) {
10711 return Idx < static_cast<int>(E1.getVectorFactor());
10712 }) &&
10713 "Expected single vector shuffle mask.");
10714 add(E1, Mask);
10715 return;
10716 }
10717 if (InVectors.empty()) {
10718 CommonMask.assign(Mask.begin(), Mask.end());
10719 InVectors.assign({&E1, &E2});
10720 return;
10721 }
10722 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10723 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10724 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10725 if (NumParts == 0 || NumParts >= Mask.size() ||
10726 MaskVecTy->getNumElements() % NumParts != 0 ||
10727 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10728 MaskVecTy->getNumElements() / NumParts))
10729 NumParts = 1;
10730 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10731 const auto *It =
10732 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10733 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10734 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10735 }
10736 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10737 if (InVectors.empty()) {
10738 CommonMask.assign(Mask.begin(), Mask.end());
10739 InVectors.assign(1, &E1);
10740 return;
10741 }
10742 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10743 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10744 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10745 if (NumParts == 0 || NumParts >= Mask.size() ||
10746 MaskVecTy->getNumElements() % NumParts != 0 ||
10747 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10748 MaskVecTy->getNumElements() / NumParts))
10749 NumParts = 1;
10750 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10751 const auto *It =
10752 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10753 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10754 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10755 if (!SameNodesEstimated && InVectors.size() == 1)
10756 InVectors.emplace_back(&E1);
10757 }
10758 /// Adds 2 input vectors and the mask for their shuffling.
10759 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10760 // May come only for shuffling of 2 vectors with extractelements, already
10761 // handled in adjustExtracts.
10762 assert(InVectors.size() == 1 &&
10763 all_of(enumerate(CommonMask),
10764 [&](auto P) {
10765 if (P.value() == PoisonMaskElem)
10766 return Mask[P.index()] == PoisonMaskElem;
10767 auto *EI = cast<ExtractElementInst>(
10768 cast<const TreeEntry *>(InVectors.front())
10769 ->getOrdered(P.index()));
10770 return EI->getVectorOperand() == V1 ||
10771 EI->getVectorOperand() == V2;
10772 }) &&
10773 "Expected extractelement vectors.");
10774 }
10775 /// Adds another one input vector and the mask for the shuffling.
10776 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10777 if (InVectors.empty()) {
10778 assert(CommonMask.empty() && !ForExtracts &&
10779 "Expected empty input mask/vectors.");
10780 CommonMask.assign(Mask.begin(), Mask.end());
10781 InVectors.assign(1, V1);
10782 return;
10783 }
10784 if (ForExtracts) {
10785 // No need to add vectors here, already handled them in adjustExtracts.
10786 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10787 !CommonMask.empty() &&
10788 all_of(enumerate(CommonMask),
10789 [&](auto P) {
10790 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10791 ->getOrdered(P.index());
10792 if (P.value() == PoisonMaskElem)
10793 return P.value() == Mask[P.index()] ||
10794 isa<UndefValue>(Scalar);
10795 if (isa<Constant>(V1))
10796 return true;
10797 auto *EI = cast<ExtractElementInst>(Scalar);
10798 return EI->getVectorOperand() == V1;
10799 }) &&
10800 "Expected only tree entry for extractelement vectors.");
10801 return;
10802 }
10803 assert(!InVectors.empty() && !CommonMask.empty() &&
10804 "Expected only tree entries from extracts/reused buildvectors.");
10805 unsigned VF = getVF(V1);
10806 if (InVectors.size() == 2) {
10807 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10808 transformMaskAfterShuffle(CommonMask, CommonMask);
10809 VF = std::max<unsigned>(VF, CommonMask.size());
10810 } else if (const auto *InTE =
10811 InVectors.front().dyn_cast<const TreeEntry *>()) {
10812 VF = std::max(VF, InTE->getVectorFactor());
10813 } else {
10814 VF = std::max(
10815 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10816 ->getNumElements());
10817 }
10818 InVectors.push_back(V1);
10819 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10820 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10821 CommonMask[Idx] = Mask[Idx] + VF;
10822 }
10823 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10824 Value *Root = nullptr) {
10825 Cost += getBuildVectorCost(VL, Root);
10826 if (!Root) {
10827 // FIXME: Need to find a way to avoid use of getNullValue here.
10829 unsigned VF = VL.size();
10830 if (MaskVF != 0)
10831 VF = std::min(VF, MaskVF);
10832 for (Value *V : VL.take_front(VF)) {
10833 if (isa<UndefValue>(V)) {
10834 Vals.push_back(cast<Constant>(V));
10835 continue;
10836 }
10837 Vals.push_back(Constant::getNullValue(V->getType()));
10838 }
10839 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10840 assert(SLPReVec && "FixedVectorType is not expected.");
10841 // When REVEC is enabled, we need to expand vector types into scalar
10842 // types.
10843 unsigned VecTyNumElements = VecTy->getNumElements();
10844 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10845 for (auto [I, V] : enumerate(Vals)) {
10846 Type *ScalarTy = V->getType()->getScalarType();
10847 Constant *NewVal;
10848 if (isa<PoisonValue>(V))
10849 NewVal = PoisonValue::get(ScalarTy);
10850 else if (isa<UndefValue>(V))
10851 NewVal = UndefValue::get(ScalarTy);
10852 else
10853 NewVal = Constant::getNullValue(ScalarTy);
10854 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10855 NewVal);
10856 }
10857 Vals.swap(NewVals);
10858 }
10859 return ConstantVector::get(Vals);
10860 }
10863 cast<FixedVectorType>(Root->getType())->getNumElements()),
10864 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10865 }
10867 /// Finalize emission of the shuffles.
10870 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10871 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10872 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10873 IsFinalized = true;
10874 if (Action) {
10875 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10876 if (InVectors.size() == 2)
10877 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10878 else
10879 Cost += createShuffle(Vec, nullptr, CommonMask);
10880 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10881 if (CommonMask[Idx] != PoisonMaskElem)
10882 CommonMask[Idx] = Idx;
10883 assert(VF > 0 &&
10884 "Expected vector length for the final value before action.");
10885 Value *V = cast<Value *>(Vec);
10886 Action(V, CommonMask);
10887 InVectors.front() = V;
10888 }
10889 if (!SubVectors.empty()) {
10890 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10891 if (InVectors.size() == 2)
10892 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10893 else
10894 Cost += createShuffle(Vec, nullptr, CommonMask);
10895 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10896 if (CommonMask[Idx] != PoisonMaskElem)
10897 CommonMask[Idx] = Idx;
10898 // Add subvectors permutation cost.
10899 if (!SubVectorsMask.empty()) {
10900 assert(SubVectorsMask.size() <= CommonMask.size() &&
10901 "Expected same size of masks for subvectors and common mask.");
10902 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10903 copy(SubVectorsMask, SVMask.begin());
10904 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10905 if (I2 != PoisonMaskElem) {
10906 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10907 I1 = I2 + CommonMask.size();
10908 }
10909 }
10911 getWidenedType(ScalarTy, CommonMask.size()),
10912 SVMask, CostKind);
10913 }
10914 for (auto [E, Idx] : SubVectors) {
10915 Type *EScalarTy = E->Scalars.front()->getType();
10916 bool IsSigned = true;
10917 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10918 EScalarTy =
10919 IntegerType::get(EScalarTy->getContext(), It->second.first);
10920 IsSigned = It->second.second;
10921 }
10922 if (ScalarTy != EScalarTy) {
10923 unsigned CastOpcode = Instruction::Trunc;
10924 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10925 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10926 if (DstSz > SrcSz)
10927 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10929 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10930 getWidenedType(EScalarTy, E->getVectorFactor()),
10932 }
10935 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10936 getWidenedType(ScalarTy, E->getVectorFactor()));
10937 if (!CommonMask.empty()) {
10938 std::iota(std::next(CommonMask.begin(), Idx),
10939 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10940 Idx);
10941 }
10942 }
10943 }
10944
10945 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
10946 if (CommonMask.empty()) {
10947 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
10948 return Cost;
10949 }
10950 return Cost +
10951 createShuffle(InVectors.front(),
10952 InVectors.size() == 2 ? InVectors.back() : nullptr,
10953 CommonMask);
10954 }
10955
10957 assert((IsFinalized || CommonMask.empty()) &&
10958 "Shuffle construction must be finalized.");
10959 }
10960};
10961
10962const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
10963 unsigned Idx) const {
10964 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
10965 return VE;
10966 const auto *It =
10967 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10968 return TE->isGather() &&
10969 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
10970 return EI.EdgeIdx == Idx && EI.UserTE == E;
10971 }) != TE->UserTreeIndices.end();
10972 });
10973 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
10974 return It->get();
10975}
10976
10977TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
10978 if (TE.State == TreeEntry::ScatterVectorize ||
10979 TE.State == TreeEntry::StridedVectorize)
10981 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10982 !TE.isAltShuffle()) {
10983 if (TE.ReorderIndices.empty())
10985 SmallVector<int> Mask;
10986 inversePermutation(TE.ReorderIndices, Mask);
10987 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
10989 }
10991}
10992
10993/// Builds the arguments types vector for the given call instruction with the
10994/// given \p ID for the specified vector factor.
10997 const unsigned VF, unsigned MinBW,
10998 const TargetTransformInfo *TTI) {
10999 SmallVector<Type *> ArgTys;
11000 for (auto [Idx, Arg] : enumerate(CI->args())) {
11003 ArgTys.push_back(Arg->getType());
11004 continue;
11005 }
11006 if (MinBW > 0) {
11007 ArgTys.push_back(
11008 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11009 continue;
11010 }
11011 }
11012 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11013 }
11014 return ArgTys;
11015}
11016
11018BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11019 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11020 ArrayRef<Value *> VL = E->Scalars;
11021
11022 Type *ScalarTy = getValueType(VL[0]);
11023 if (!isValidElementType(ScalarTy))
11026
11027 // If we have computed a smaller type for the expression, update VecTy so
11028 // that the costs will be accurate.
11029 auto It = MinBWs.find(E);
11030 Type *OrigScalarTy = ScalarTy;
11031 if (It != MinBWs.end()) {
11032 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11033 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11034 if (VecTy)
11035 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11036 }
11037 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11038 unsigned EntryVF = E->getVectorFactor();
11039 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11040
11041 if (E->isGather()) {
11042 if (allConstant(VL))
11043 return 0;
11044 if (isa<InsertElementInst>(VL[0]))
11046 if (isa<CmpInst>(VL.front()))
11047 ScalarTy = VL.front()->getType();
11048 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11049 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11050 }
11051 InstructionCost CommonCost = 0;
11053 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11054 !isReverseOrder(E->ReorderIndices))) {
11055 SmallVector<int> NewMask;
11056 if (E->getOpcode() == Instruction::Store) {
11057 // For stores the order is actually a mask.
11058 NewMask.resize(E->ReorderIndices.size());
11059 copy(E->ReorderIndices, NewMask.begin());
11060 } else {
11061 inversePermutation(E->ReorderIndices, NewMask);
11062 }
11063 ::addMask(Mask, NewMask);
11064 }
11065 if (!E->ReuseShuffleIndices.empty())
11066 ::addMask(Mask, E->ReuseShuffleIndices);
11067 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11068 CommonCost =
11069 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11070 assert((E->State == TreeEntry::Vectorize ||
11071 E->State == TreeEntry::ScatterVectorize ||
11072 E->State == TreeEntry::StridedVectorize) &&
11073 "Unhandled state");
11074 assert(E->getOpcode() &&
11075 ((allSameType(VL) && allSameBlock(VL)) ||
11076 (E->getOpcode() == Instruction::GetElementPtr &&
11077 E->getMainOp()->getType()->isPointerTy())) &&
11078 "Invalid VL");
11079 Instruction *VL0 = E->getMainOp();
11080 unsigned ShuffleOrOp =
11081 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11082 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11083 ShuffleOrOp = E->CombinedOp;
11084 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11085 const unsigned Sz = UniqueValues.size();
11086 SmallBitVector UsedScalars(Sz, false);
11087 for (unsigned I = 0; I < Sz; ++I) {
11088 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11089 continue;
11090 UsedScalars.set(I);
11091 }
11092 auto GetCastContextHint = [&](Value *V) {
11093 if (const TreeEntry *OpTE = getTreeEntry(V))
11094 return getCastContextHint(*OpTE);
11095 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11096 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11097 !SrcState.isAltShuffle())
11100 };
11101 auto GetCostDiff =
11102 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11104 // Calculate the cost of this instruction.
11105 InstructionCost ScalarCost = 0;
11106 if (isa<CastInst, CallInst>(VL0)) {
11107 // For some of the instructions no need to calculate cost for each
11108 // particular instruction, we can use the cost of the single
11109 // instruction x total number of scalar instructions.
11110 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11111 } else {
11112 for (unsigned I = 0; I < Sz; ++I) {
11113 if (UsedScalars.test(I))
11114 continue;
11115 ScalarCost += ScalarEltCost(I);
11116 }
11117 }
11118
11119 InstructionCost VecCost = VectorCost(CommonCost);
11120 // Check if the current node must be resized, if the parent node is not
11121 // resized.
11122 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11123 E->Idx != 0 &&
11124 (E->getOpcode() != Instruction::Load ||
11125 !E->UserTreeIndices.empty())) {
11126 const EdgeInfo &EI =
11127 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11128 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11129 });
11130 if (EI.UserTE->getOpcode() != Instruction::Select ||
11131 EI.EdgeIdx != 0) {
11132 auto UserBWIt = MinBWs.find(EI.UserTE);
11133 Type *UserScalarTy =
11134 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11135 if (UserBWIt != MinBWs.end())
11136 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11137 UserBWIt->second.first);
11138 if (ScalarTy != UserScalarTy) {
11139 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11140 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11141 unsigned VecOpcode;
11142 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11143 if (BWSz > SrcBWSz)
11144 VecOpcode = Instruction::Trunc;
11145 else
11146 VecOpcode =
11147 It->second.second ? Instruction::SExt : Instruction::ZExt;
11148 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11149 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11150 CostKind);
11151 }
11152 }
11153 }
11154 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11155 ScalarCost, "Calculated costs for Tree"));
11156 return VecCost - ScalarCost;
11157 };
11158 // Calculate cost difference from vectorizing set of GEPs.
11159 // Negative value means vectorizing is profitable.
11160 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11161 assert((E->State == TreeEntry::Vectorize ||
11162 E->State == TreeEntry::StridedVectorize) &&
11163 "Entry state expected to be Vectorize or StridedVectorize here.");
11164 InstructionCost ScalarCost = 0;
11165 InstructionCost VecCost = 0;
11166 std::tie(ScalarCost, VecCost) = getGEPCosts(
11167 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11168 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11169 "Calculated GEPs cost for Tree"));
11170
11171 return VecCost - ScalarCost;
11172 };
11173
11174 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11175 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11176 if (MinMaxID == Intrinsic::not_intrinsic)
11178 Type *CanonicalType = Ty;
11179 if (CanonicalType->isPtrOrPtrVectorTy())
11180 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11181 CanonicalType->getContext(),
11182 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11183
11184 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11185 {CanonicalType, CanonicalType});
11186 InstructionCost IntrinsicCost =
11187 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11188 // If the selects are the only uses of the compares, they will be
11189 // dead and we can adjust the cost by removing their cost.
11190 if (VI && SelectOnly) {
11191 assert((!Ty->isVectorTy() || SLPReVec) &&
11192 "Expected only for scalar type.");
11193 auto *CI = cast<CmpInst>(VI->getOperand(0));
11194 IntrinsicCost -= TTI->getCmpSelInstrCost(
11195 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11196 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11197 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11198 }
11199 return IntrinsicCost;
11200 };
11201 switch (ShuffleOrOp) {
11202 case Instruction::PHI: {
11203 // Count reused scalars.
11204 InstructionCost ScalarCost = 0;
11206 for (Value *V : UniqueValues) {
11207 auto *PHI = dyn_cast<PHINode>(V);
11208 if (!PHI)
11209 continue;
11210
11211 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11212 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11213 Value *Op = PHI->getIncomingValue(I);
11214 Operands[I] = Op;
11215 }
11216 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11217 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11218 if (!OpTE->ReuseShuffleIndices.empty())
11219 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11220 OpTE->Scalars.size());
11221 }
11222
11223 return CommonCost - ScalarCost;
11224 }
11225 case Instruction::ExtractValue:
11226 case Instruction::ExtractElement: {
11227 auto GetScalarCost = [&](unsigned Idx) {
11228 if (isa<PoisonValue>(UniqueValues[Idx]))
11230
11231 auto *I = cast<Instruction>(UniqueValues[Idx]);
11232 VectorType *SrcVecTy;
11233 if (ShuffleOrOp == Instruction::ExtractElement) {
11234 auto *EE = cast<ExtractElementInst>(I);
11235 SrcVecTy = EE->getVectorOperandType();
11236 } else {
11237 auto *EV = cast<ExtractValueInst>(I);
11238 Type *AggregateTy = EV->getAggregateOperand()->getType();
11239 unsigned NumElts;
11240 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11241 NumElts = ATy->getNumElements();
11242 else
11243 NumElts = AggregateTy->getStructNumElements();
11244 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11245 }
11246 if (I->hasOneUse()) {
11247 Instruction *Ext = I->user_back();
11248 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11249 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11250 // Use getExtractWithExtendCost() to calculate the cost of
11251 // extractelement/ext pair.
11253 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11254 // Subtract the cost of s|zext which is subtracted separately.
11256 Ext->getOpcode(), Ext->getType(), I->getType(),
11258 return Cost;
11259 }
11260 }
11261 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11263 };
11264 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11265 return GetCostDiff(GetScalarCost, GetVectorCost);
11266 }
11267 case Instruction::InsertElement: {
11268 assert(E->ReuseShuffleIndices.empty() &&
11269 "Unique insertelements only are expected.");
11270 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11271 unsigned const NumElts = SrcVecTy->getNumElements();
11272 unsigned const NumScalars = VL.size();
11273
11274 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11275
11276 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11277 unsigned OffsetBeg = *getElementIndex(VL.front());
11278 unsigned OffsetEnd = OffsetBeg;
11279 InsertMask[OffsetBeg] = 0;
11280 for (auto [I, V] : enumerate(VL.drop_front())) {
11281 unsigned Idx = *getElementIndex(V);
11282 if (OffsetBeg > Idx)
11283 OffsetBeg = Idx;
11284 else if (OffsetEnd < Idx)
11285 OffsetEnd = Idx;
11286 InsertMask[Idx] = I + 1;
11287 }
11288 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11289 if (NumOfParts > 0 && NumOfParts < NumElts)
11290 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11291 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11292 VecScalarsSz;
11293 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11294 unsigned InsertVecSz = std::min<unsigned>(
11295 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11296 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11297 bool IsWholeSubvector =
11298 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11299 // Check if we can safely insert a subvector. If it is not possible, just
11300 // generate a whole-sized vector and shuffle the source vector and the new
11301 // subvector.
11302 if (OffsetBeg + InsertVecSz > VecSz) {
11303 // Align OffsetBeg to generate correct mask.
11304 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11305 InsertVecSz = VecSz;
11306 }
11307
11308 APInt DemandedElts = APInt::getZero(NumElts);
11309 // TODO: Add support for Instruction::InsertValue.
11311 if (!E->ReorderIndices.empty()) {
11312 inversePermutation(E->ReorderIndices, Mask);
11313 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11314 } else {
11315 Mask.assign(VecSz, PoisonMaskElem);
11316 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11317 }
11318 bool IsIdentity = true;
11319 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11320 Mask.swap(PrevMask);
11321 for (unsigned I = 0; I < NumScalars; ++I) {
11322 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11323 DemandedElts.setBit(InsertIdx);
11324 IsIdentity &= InsertIdx - OffsetBeg == I;
11325 Mask[InsertIdx - OffsetBeg] = I;
11326 }
11327 assert(Offset < NumElts && "Failed to find vector index offset");
11328
11330 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11331 /*Insert*/ true, /*Extract*/ false,
11332 CostKind);
11333
11334 // First cost - resize to actual vector size if not identity shuffle or
11335 // need to shift the vector.
11336 // Do not calculate the cost if the actual size is the register size and
11337 // we can merge this shuffle with the following SK_Select.
11338 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11339 if (!IsIdentity)
11341 InsertVecTy, Mask);
11342 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11343 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11344 }));
11345 // Second cost - permutation with subvector, if some elements are from the
11346 // initial vector or inserting a subvector.
11347 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11348 // subvector of ActualVecTy.
11349 SmallBitVector InMask =
11350 isUndefVector(FirstInsert->getOperand(0),
11351 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11352 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11353 if (InsertVecSz != VecSz) {
11354 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11355 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11356 CostKind, OffsetBeg - Offset, InsertVecTy);
11357 } else {
11358 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11359 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11360 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11361 I <= End; ++I)
11362 if (Mask[I] != PoisonMaskElem)
11363 Mask[I] = I + VecSz;
11364 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11365 Mask[I] =
11366 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11367 Cost +=
11368 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11369 }
11370 }
11371 return Cost;
11372 }
11373 case Instruction::ZExt:
11374 case Instruction::SExt:
11375 case Instruction::FPToUI:
11376 case Instruction::FPToSI:
11377 case Instruction::FPExt:
11378 case Instruction::PtrToInt:
11379 case Instruction::IntToPtr:
11380 case Instruction::SIToFP:
11381 case Instruction::UIToFP:
11382 case Instruction::Trunc:
11383 case Instruction::FPTrunc:
11384 case Instruction::BitCast: {
11385 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11386 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11387 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11388 unsigned Opcode = ShuffleOrOp;
11389 unsigned VecOpcode = Opcode;
11390 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11391 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11392 // Check if the values are candidates to demote.
11393 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11394 if (SrcIt != MinBWs.end()) {
11395 SrcBWSz = SrcIt->second.first;
11396 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11397 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11398 SrcVecTy =
11399 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11400 }
11401 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11402 if (BWSz == SrcBWSz) {
11403 VecOpcode = Instruction::BitCast;
11404 } else if (BWSz < SrcBWSz) {
11405 VecOpcode = Instruction::Trunc;
11406 } else if (It != MinBWs.end()) {
11407 assert(BWSz > SrcBWSz && "Invalid cast!");
11408 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11409 } else if (SrcIt != MinBWs.end()) {
11410 assert(BWSz > SrcBWSz && "Invalid cast!");
11411 VecOpcode =
11412 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11413 }
11414 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11415 !SrcIt->second.second) {
11416 VecOpcode = Instruction::UIToFP;
11417 }
11418 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11419 assert(Idx == 0 && "Expected 0 index only");
11420 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11421 VL0->getOperand(0)->getType(),
11423 };
11424 auto GetVectorCost = [=](InstructionCost CommonCost) {
11425 // Do not count cost here if minimum bitwidth is in effect and it is just
11426 // a bitcast (here it is just a noop).
11427 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11428 return CommonCost;
11429 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11430 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11431
11432 bool IsArithmeticExtendedReduction =
11433 E->Idx == 0 && UserIgnoreList &&
11434 all_of(*UserIgnoreList, [](Value *V) {
11435 auto *I = cast<Instruction>(V);
11436 return is_contained({Instruction::Add, Instruction::FAdd,
11437 Instruction::Mul, Instruction::FMul,
11438 Instruction::And, Instruction::Or,
11439 Instruction::Xor},
11440 I->getOpcode());
11441 });
11442 if (IsArithmeticExtendedReduction &&
11443 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11444 return CommonCost;
11445 return CommonCost +
11446 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11447 VecOpcode == Opcode ? VI : nullptr);
11448 };
11449 return GetCostDiff(GetScalarCost, GetVectorCost);
11450 }
11451 case Instruction::FCmp:
11452 case Instruction::ICmp:
11453 case Instruction::Select: {
11454 CmpPredicate VecPred, SwappedVecPred;
11455 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11456 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11457 match(VL0, MatchCmp))
11458 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11459 else
11460 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11463 auto GetScalarCost = [&](unsigned Idx) {
11464 if (isa<PoisonValue>(UniqueValues[Idx]))
11466
11467 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11468 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11471 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11472 // FIXME: Use CmpPredicate::getMatching here.
11473 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11474 !match(VI, MatchCmp)) ||
11475 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11476 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11477 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11480
11482 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11483 CostKind, getOperandInfo(VI->getOperand(0)),
11484 getOperandInfo(VI->getOperand(1)), VI);
11485 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11486 if (IntrinsicCost.isValid())
11487 ScalarCost = IntrinsicCost;
11488
11489 return ScalarCost;
11490 };
11491 auto GetVectorCost = [&](InstructionCost CommonCost) {
11492 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11493
11494 InstructionCost VecCost =
11495 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11496 CostKind, getOperandInfo(E->getOperand(0)),
11497 getOperandInfo(E->getOperand(1)), VL0);
11498 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11499 auto *CondType =
11500 getWidenedType(SI->getCondition()->getType(), VL.size());
11501 unsigned CondNumElements = CondType->getNumElements();
11502 unsigned VecTyNumElements = getNumElements(VecTy);
11503 assert(VecTyNumElements >= CondNumElements &&
11504 VecTyNumElements % CondNumElements == 0 &&
11505 "Cannot vectorize Instruction::Select");
11506 if (CondNumElements != VecTyNumElements) {
11507 // When the return type is i1 but the source is fixed vector type, we
11508 // need to duplicate the condition value.
11509 VecCost += ::getShuffleCost(
11510 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11511 createReplicatedMask(VecTyNumElements / CondNumElements,
11512 CondNumElements));
11513 }
11514 }
11515 return VecCost + CommonCost;
11516 };
11517 return GetCostDiff(GetScalarCost, GetVectorCost);
11518 }
11519 case TreeEntry::MinMax: {
11520 auto GetScalarCost = [&](unsigned Idx) {
11521 return GetMinMaxCost(OrigScalarTy);
11522 };
11523 auto GetVectorCost = [&](InstructionCost CommonCost) {
11524 InstructionCost VecCost = GetMinMaxCost(VecTy);
11525 return VecCost + CommonCost;
11526 };
11527 return GetCostDiff(GetScalarCost, GetVectorCost);
11528 }
11529 case Instruction::FNeg:
11530 case Instruction::Add:
11531 case Instruction::FAdd:
11532 case Instruction::Sub:
11533 case Instruction::FSub:
11534 case Instruction::Mul:
11535 case Instruction::FMul:
11536 case Instruction::UDiv:
11537 case Instruction::SDiv:
11538 case Instruction::FDiv:
11539 case Instruction::URem:
11540 case Instruction::SRem:
11541 case Instruction::FRem:
11542 case Instruction::Shl:
11543 case Instruction::LShr:
11544 case Instruction::AShr:
11545 case Instruction::And:
11546 case Instruction::Or:
11547 case Instruction::Xor: {
11548 auto GetScalarCost = [&](unsigned Idx) {
11549 if (isa<PoisonValue>(UniqueValues[Idx]))
11551
11552 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11553 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11554 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11555 TTI::OperandValueInfo Op2Info =
11556 TTI::getOperandInfo(VI->getOperand(OpIdx));
11557 SmallVector<const Value *> Operands(VI->operand_values());
11558 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11559 Op1Info, Op2Info, Operands, VI);
11560 };
11561 auto GetVectorCost = [=](InstructionCost CommonCost) {
11562 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11563 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11564 ArrayRef<Value *> Ops = E->getOperand(I);
11565 if (all_of(Ops, [&](Value *Op) {
11566 auto *CI = dyn_cast<ConstantInt>(Op);
11567 return CI && CI->getValue().countr_one() >= It->second.first;
11568 }))
11569 return CommonCost;
11570 }
11571 }
11572 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11573 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11574 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11575 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11576 Op2Info, {}, nullptr, TLI) +
11577 CommonCost;
11578 };
11579 return GetCostDiff(GetScalarCost, GetVectorCost);
11580 }
11581 case Instruction::GetElementPtr: {
11582 return CommonCost + GetGEPCostDiff(VL, VL0);
11583 }
11584 case Instruction::Load: {
11585 auto GetScalarCost = [&](unsigned Idx) {
11586 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11587 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11588 VI->getAlign(), VI->getPointerAddressSpace(),
11590 };
11591 auto *LI0 = cast<LoadInst>(VL0);
11592 auto GetVectorCost = [&](InstructionCost CommonCost) {
11593 InstructionCost VecLdCost;
11594 switch (E->State) {
11595 case TreeEntry::Vectorize:
11596 if (unsigned Factor = E->getInterleaveFactor()) {
11597 VecLdCost = TTI->getInterleavedMemoryOpCost(
11598 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11599 LI0->getPointerAddressSpace(), CostKind);
11600
11601 } else {
11602 VecLdCost = TTI->getMemoryOpCost(
11603 Instruction::Load, VecTy, LI0->getAlign(),
11604 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11605 }
11606 break;
11607 case TreeEntry::StridedVectorize: {
11608 Align CommonAlignment =
11609 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11610 VecLdCost = TTI->getStridedMemoryOpCost(
11611 Instruction::Load, VecTy, LI0->getPointerOperand(),
11612 /*VariableMask=*/false, CommonAlignment, CostKind);
11613 break;
11614 }
11615 case TreeEntry::ScatterVectorize: {
11616 Align CommonAlignment =
11617 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11618 VecLdCost = TTI->getGatherScatterOpCost(
11619 Instruction::Load, VecTy, LI0->getPointerOperand(),
11620 /*VariableMask=*/false, CommonAlignment, CostKind);
11621 break;
11622 }
11623 case TreeEntry::CombinedVectorize:
11624 case TreeEntry::NeedToGather:
11625 llvm_unreachable("Unexpected vectorization state.");
11626 }
11627 return VecLdCost + CommonCost;
11628 };
11629
11630 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11631 // If this node generates masked gather load then it is not a terminal node.
11632 // Hence address operand cost is estimated separately.
11633 if (E->State == TreeEntry::ScatterVectorize)
11634 return Cost;
11635
11636 // Estimate cost of GEPs since this tree node is a terminator.
11637 SmallVector<Value *> PointerOps(VL.size());
11638 for (auto [I, V] : enumerate(VL))
11639 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11640 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11641 }
11642 case Instruction::Store: {
11643 bool IsReorder = !E->ReorderIndices.empty();
11644 auto GetScalarCost = [=](unsigned Idx) {
11645 auto *VI = cast<StoreInst>(VL[Idx]);
11646 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11647 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11648 VI->getAlign(), VI->getPointerAddressSpace(),
11649 CostKind, OpInfo, VI);
11650 };
11651 auto *BaseSI =
11652 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11653 auto GetVectorCost = [=](InstructionCost CommonCost) {
11654 // We know that we can merge the stores. Calculate the cost.
11655 InstructionCost VecStCost;
11656 if (E->State == TreeEntry::StridedVectorize) {
11657 Align CommonAlignment =
11658 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11659 VecStCost = TTI->getStridedMemoryOpCost(
11660 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11661 /*VariableMask=*/false, CommonAlignment, CostKind);
11662 } else {
11663 assert(E->State == TreeEntry::Vectorize &&
11664 "Expected either strided or consecutive stores.");
11665 if (unsigned Factor = E->getInterleaveFactor()) {
11666 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11667 "No reused shuffles expected");
11668 CommonCost = 0;
11669 VecStCost = TTI->getInterleavedMemoryOpCost(
11670 Instruction::Store, VecTy, Factor, std::nullopt,
11671 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11672 } else {
11673 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11674 VecStCost = TTI->getMemoryOpCost(
11675 Instruction::Store, VecTy, BaseSI->getAlign(),
11676 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11677 }
11678 }
11679 return VecStCost + CommonCost;
11680 };
11681 SmallVector<Value *> PointerOps(VL.size());
11682 for (auto [I, V] : enumerate(VL)) {
11683 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11684 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11685 }
11686
11687 return GetCostDiff(GetScalarCost, GetVectorCost) +
11688 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11689 }
11690 case Instruction::Call: {
11691 auto GetScalarCost = [&](unsigned Idx) {
11692 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11695 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11696 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11697 }
11700 CI->getFunctionType()->params(), CostKind);
11701 };
11702 auto GetVectorCost = [=](InstructionCost CommonCost) {
11703 auto *CI = cast<CallInst>(VL0);
11706 CI, ID, VecTy->getNumElements(),
11707 It != MinBWs.end() ? It->second.first : 0, TTI);
11708 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11709 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11710 };
11711 return GetCostDiff(GetScalarCost, GetVectorCost);
11712 }
11713 case Instruction::ShuffleVector: {
11714 if (!SLPReVec || E->isAltShuffle())
11715 assert(E->isAltShuffle() &&
11716 ((Instruction::isBinaryOp(E->getOpcode()) &&
11717 Instruction::isBinaryOp(E->getAltOpcode())) ||
11718 (Instruction::isCast(E->getOpcode()) &&
11719 Instruction::isCast(E->getAltOpcode())) ||
11720 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11721 "Invalid Shuffle Vector Operand");
11722 // Try to find the previous shuffle node with the same operands and same
11723 // main/alternate ops.
11724 auto TryFindNodeWithEqualOperands = [=]() {
11725 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11726 if (TE.get() == E)
11727 break;
11728 if (TE->isAltShuffle() &&
11729 ((TE->getOpcode() == E->getOpcode() &&
11730 TE->getAltOpcode() == E->getAltOpcode()) ||
11731 (TE->getOpcode() == E->getAltOpcode() &&
11732 TE->getAltOpcode() == E->getOpcode())) &&
11733 TE->hasEqualOperands(*E))
11734 return true;
11735 }
11736 return false;
11737 };
11738 auto GetScalarCost = [&](unsigned Idx) {
11739 if (isa<PoisonValue>(UniqueValues[Idx]))
11741
11742 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11743 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11744 (void)E;
11745 return TTI->getInstructionCost(VI, CostKind);
11746 };
11747 // Need to clear CommonCost since the final shuffle cost is included into
11748 // vector cost.
11749 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11750 // VecCost is equal to sum of the cost of creating 2 vectors
11751 // and the cost of creating shuffle.
11752 InstructionCost VecCost = 0;
11753 if (TryFindNodeWithEqualOperands()) {
11754 LLVM_DEBUG({
11755 dbgs() << "SLP: diamond match for alternate node found.\n";
11756 E->dump();
11757 });
11758 // No need to add new vector costs here since we're going to reuse
11759 // same main/alternate vector ops, just do different shuffling.
11760 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11761 VecCost =
11762 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11763 VecCost +=
11764 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11765 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11766 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11767 VecCost = TTIRef.getCmpSelInstrCost(
11768 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11769 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11770 VL0);
11771 VecCost += TTIRef.getCmpSelInstrCost(
11772 E->getOpcode(), VecTy, MaskTy,
11773 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11774 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11775 E->getAltOp());
11776 } else {
11777 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11778 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11779 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11780 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11781 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11782 unsigned SrcBWSz =
11783 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11784 if (SrcIt != MinBWs.end()) {
11785 SrcBWSz = SrcIt->second.first;
11786 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11787 SrcTy = getWidenedType(SrcSclTy, VL.size());
11788 }
11789 if (BWSz <= SrcBWSz) {
11790 if (BWSz < SrcBWSz)
11791 VecCost =
11792 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11794 LLVM_DEBUG({
11795 dbgs()
11796 << "SLP: alternate extension, which should be truncated.\n";
11797 E->dump();
11798 });
11799 return VecCost;
11800 }
11801 }
11802 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11804 VecCost +=
11805 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11807 }
11809 E->buildAltOpShuffleMask(
11810 [&](Instruction *I) {
11811 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11812 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11813 *TLI);
11814 },
11815 Mask);
11817 FinalVecTy, Mask, CostKind);
11818 // Patterns like [fadd,fsub] can be combined into a single instruction
11819 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11820 // need to take into account their order when looking for the most used
11821 // order.
11822 unsigned Opcode0 = E->getOpcode();
11823 unsigned Opcode1 = E->getAltOpcode();
11824 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11825 // If this pattern is supported by the target then we consider the
11826 // order.
11827 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11828 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11829 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11830 return AltVecCost < VecCost ? AltVecCost : VecCost;
11831 }
11832 // TODO: Check the reverse order too.
11833 return VecCost;
11834 };
11835 if (SLPReVec && !E->isAltShuffle())
11836 return GetCostDiff(
11837 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11838 // If a group uses mask in order, the shufflevector can be
11839 // eliminated by instcombine. Then the cost is 0.
11840 assert(isa<ShuffleVectorInst>(VL.front()) &&
11841 "Not supported shufflevector usage.");
11842 auto *SV = cast<ShuffleVectorInst>(VL.front());
11843 unsigned SVNumElements =
11844 cast<FixedVectorType>(SV->getOperand(0)->getType())
11845 ->getNumElements();
11846 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11847 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11848 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11849 int NextIndex = 0;
11850 if (!all_of(Group, [&](Value *V) {
11851 assert(isa<ShuffleVectorInst>(V) &&
11852 "Not supported shufflevector usage.");
11853 auto *SV = cast<ShuffleVectorInst>(V);
11854 int Index;
11855 [[maybe_unused]] bool IsExtractSubvectorMask =
11856 SV->isExtractSubvectorMask(Index);
11857 assert(IsExtractSubvectorMask &&
11858 "Not supported shufflevector usage.");
11859 if (NextIndex != Index)
11860 return false;
11861 NextIndex += SV->getShuffleMask().size();
11862 return true;
11863 }))
11864 return ::getShuffleCost(
11866 calculateShufflevectorMask(E->Scalars));
11867 }
11868 return TTI::TCC_Free;
11869 });
11870 return GetCostDiff(GetScalarCost, GetVectorCost);
11871 }
11872 case Instruction::Freeze:
11873 return CommonCost;
11874 default:
11875 llvm_unreachable("Unknown instruction");
11876 }
11877}
11878
11879bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11880 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11881 << VectorizableTree.size() << " is fully vectorizable .\n");
11882
11883 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11885 return TE->isGather() &&
11886 !any_of(TE->Scalars,
11887 [this](Value *V) { return EphValues.contains(V); }) &&
11888 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11889 TE->Scalars.size() < Limit ||
11890 ((TE->getOpcode() == Instruction::ExtractElement ||
11891 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11892 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11893 (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
11894 any_of(TE->Scalars, IsaPred<LoadInst>));
11895 };
11896
11897 // We only handle trees of heights 1 and 2.
11898 if (VectorizableTree.size() == 1 &&
11899 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11900 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11901 (ForReduction &&
11902 AreVectorizableGathers(VectorizableTree[0].get(),
11903 VectorizableTree[0]->Scalars.size()) &&
11904 VectorizableTree[0]->getVectorFactor() > 2)))
11905 return true;
11906
11907 if (VectorizableTree.size() != 2)
11908 return false;
11909
11910 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11911 // with the second gather nodes if they have less scalar operands rather than
11912 // the initial tree element (may be profitable to shuffle the second gather)
11913 // or they are extractelements, which form shuffle.
11915 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11916 AreVectorizableGathers(VectorizableTree[1].get(),
11917 VectorizableTree[0]->Scalars.size()))
11918 return true;
11919
11920 // Gathering cost would be too much for tiny trees.
11921 if (VectorizableTree[0]->isGather() ||
11922 (VectorizableTree[1]->isGather() &&
11923 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11924 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11925 return false;
11926
11927 return true;
11928}
11929
11930static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11932 bool MustMatchOrInst) {
11933 // Look past the root to find a source value. Arbitrarily follow the
11934 // path through operand 0 of any 'or'. Also, peek through optional
11935 // shift-left-by-multiple-of-8-bits.
11936 Value *ZextLoad = Root;
11937 const APInt *ShAmtC;
11938 bool FoundOr = false;
11939 while (!isa<ConstantExpr>(ZextLoad) &&
11940 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11941 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
11942 ShAmtC->urem(8) == 0))) {
11943 auto *BinOp = cast<BinaryOperator>(ZextLoad);
11944 ZextLoad = BinOp->getOperand(0);
11945 if (BinOp->getOpcode() == Instruction::Or)
11946 FoundOr = true;
11947 }
11948 // Check if the input is an extended load of the required or/shift expression.
11949 Value *Load;
11950 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11951 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
11952 return false;
11953
11954 // Require that the total load bit width is a legal integer type.
11955 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
11956 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
11957 Type *SrcTy = Load->getType();
11958 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
11959 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
11960 return false;
11961
11962 // Everything matched - assume that we can fold the whole sequence using
11963 // load combining.
11964 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
11965 << *(cast<Instruction>(Root)) << "\n");
11966
11967 return true;
11968}
11969
11971 if (RdxKind != RecurKind::Or)
11972 return false;
11973
11974 unsigned NumElts = VectorizableTree[0]->Scalars.size();
11975 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11976 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
11977 /* MatchOr */ false);
11978}
11979
11981 // Peek through a final sequence of stores and check if all operations are
11982 // likely to be load-combined.
11983 unsigned NumElts = Stores.size();
11984 for (Value *Scalar : Stores) {
11985 Value *X;
11986 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
11987 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
11988 return false;
11989 }
11990 return true;
11991}
11992
11993bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
11994 if (!DebugCounter::shouldExecute(VectorizedGraphs))
11995 return true;
11996
11997 // Graph is empty - do nothing.
11998 if (VectorizableTree.empty()) {
11999 assert(ExternalUses.empty() && "We shouldn't have any external users");
12000
12001 return true;
12002 }
12003
12004 // No need to vectorize inserts of gathered values.
12005 if (VectorizableTree.size() == 2 &&
12006 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12007 VectorizableTree[1]->isGather() &&
12008 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12009 !(isSplat(VectorizableTree[1]->Scalars) ||
12010 allConstant(VectorizableTree[1]->Scalars))))
12011 return true;
12012
12013 // If the graph includes only PHI nodes and gathers, it is defnitely not
12014 // profitable for the vectorization, we can skip it, if the cost threshold is
12015 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
12016 // gathers/buildvectors.
12017 constexpr int Limit = 4;
12018 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12019 !VectorizableTree.empty() &&
12020 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12021 return (TE->isGather() &&
12022 TE->getOpcode() != Instruction::ExtractElement &&
12023 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12024 TE->getOpcode() == Instruction::PHI;
12025 }))
12026 return true;
12027
12028 // We can vectorize the tree if its size is greater than or equal to the
12029 // minimum size specified by the MinTreeSize command line option.
12030 if (VectorizableTree.size() >= MinTreeSize)
12031 return false;
12032
12033 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12034 // can vectorize it if we can prove it fully vectorizable.
12035 if (isFullyVectorizableTinyTree(ForReduction))
12036 return false;
12037
12038 // Check if any of the gather node forms an insertelement buildvector
12039 // somewhere.
12040 bool IsAllowedSingleBVNode =
12041 VectorizableTree.size() > 1 ||
12042 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
12043 !VectorizableTree.front()->isAltShuffle() &&
12044 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12045 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12046 allSameBlock(VectorizableTree.front()->Scalars));
12047 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12048 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12049 return isa<ExtractElementInst, UndefValue>(V) ||
12050 (IsAllowedSingleBVNode &&
12051 !V->hasNUsesOrMore(UsesLimit) &&
12052 any_of(V->users(), IsaPred<InsertElementInst>));
12053 });
12054 }))
12055 return false;
12056
12057 if (VectorizableTree.back()->isGather() &&
12058 VectorizableTree.back()->isAltShuffle() &&
12059 VectorizableTree.back()->getVectorFactor() > 2 &&
12060 allSameBlock(VectorizableTree.back()->Scalars) &&
12061 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12063 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12064 VectorizableTree.back()->getVectorFactor()),
12065 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12066 /*Insert=*/true, /*Extract=*/false,
12068 return false;
12069
12070 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12071 // vectorizable.
12072 return true;
12073}
12074
12077 constexpr unsigned SmallTree = 3;
12078 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12079 getCanonicalGraphSize() <= SmallTree &&
12080 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12081 [](const std::unique_ptr<TreeEntry> &TE) {
12082 return TE->isGather() &&
12083 TE->getOpcode() == Instruction::Load &&
12084 !allSameBlock(TE->Scalars);
12085 }) == 1)
12086 return true;
12087 return false;
12088 }
12089 bool Res = false;
12090 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12091 TreeEntry &E = *VectorizableTree[Idx];
12092 if (!E.isGather())
12093 continue;
12094 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12095 return false;
12096 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12097 continue;
12098 Res = true;
12099 }
12100 return Res;
12101}
12102
12104 // Walk from the bottom of the tree to the top, tracking which values are
12105 // live. When we see a call instruction that is not part of our tree,
12106 // query TTI to see if there is a cost to keeping values live over it
12107 // (for example, if spills and fills are required).
12108 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12110
12112 Instruction *PrevInst = nullptr;
12113
12114 // The entries in VectorizableTree are not necessarily ordered by their
12115 // position in basic blocks. Collect them and order them by dominance so later
12116 // instructions are guaranteed to be visited first. For instructions in
12117 // different basic blocks, we only scan to the beginning of the block, so
12118 // their order does not matter, as long as all instructions in a basic block
12119 // are grouped together. Using dominance ensures a deterministic order.
12120 SmallVector<Instruction *, 16> OrderedScalars;
12121 for (const auto &TEPtr : VectorizableTree) {
12122 if (TEPtr->State != TreeEntry::Vectorize)
12123 continue;
12124 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12125 if (!Inst)
12126 continue;
12127 OrderedScalars.push_back(Inst);
12128 }
12129 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12130 auto *NodeA = DT->getNode(A->getParent());
12131 auto *NodeB = DT->getNode(B->getParent());
12132 assert(NodeA && "Should only process reachable instructions");
12133 assert(NodeB && "Should only process reachable instructions");
12134 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12135 "Different nodes should have different DFS numbers");
12136 if (NodeA != NodeB)
12137 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12138 return B->comesBefore(A);
12139 });
12140
12141 for (Instruction *Inst : OrderedScalars) {
12142 if (!PrevInst) {
12143 PrevInst = Inst;
12144 continue;
12145 }
12146
12147 // Update LiveValues.
12148 LiveValues.erase(PrevInst);
12149 for (auto &J : PrevInst->operands()) {
12150 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12151 LiveValues.insert(cast<Instruction>(&*J));
12152 }
12153
12154 LLVM_DEBUG({
12155 dbgs() << "SLP: #LV: " << LiveValues.size();
12156 for (auto *X : LiveValues)
12157 dbgs() << " " << X->getName();
12158 dbgs() << ", Looking at ";
12159 Inst->dump();
12160 });
12161
12162 // Now find the sequence of instructions between PrevInst and Inst.
12163 unsigned NumCalls = 0;
12164 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12165 PrevInstIt =
12166 PrevInst->getIterator().getReverse();
12167 while (InstIt != PrevInstIt) {
12168 if (PrevInstIt == PrevInst->getParent()->rend()) {
12169 PrevInstIt = Inst->getParent()->rbegin();
12170 continue;
12171 }
12172
12173 auto NoCallIntrinsic = [this](Instruction *I) {
12174 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12175 if (II->isAssumeLikeIntrinsic())
12176 return true;
12177 FastMathFlags FMF;
12179 for (auto &ArgOp : II->args())
12180 Tys.push_back(ArgOp->getType());
12181 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12182 FMF = FPMO->getFastMathFlags();
12183 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12184 FMF);
12185 InstructionCost IntrCost =
12188 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12189 if (IntrCost < CallCost)
12190 return true;
12191 }
12192 return false;
12193 };
12194
12195 // Debug information does not impact spill cost.
12196 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12197 &*PrevInstIt != PrevInst)
12198 NumCalls++;
12199
12200 ++PrevInstIt;
12201 }
12202
12203 if (NumCalls) {
12205 for (auto *II : LiveValues) {
12206 auto *ScalarTy = II->getType();
12207 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12208 ScalarTy = VectorTy->getElementType();
12209 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12210 }
12211 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12212 }
12213
12214 PrevInst = Inst;
12215 }
12216
12217 return Cost;
12218}
12219
12220/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
12221/// buildvector sequence.
12223 const InsertElementInst *IE2) {
12224 if (IE1 == IE2)
12225 return false;
12226 const auto *I1 = IE1;
12227 const auto *I2 = IE2;
12228 const InsertElementInst *PrevI1;
12229 const InsertElementInst *PrevI2;
12230 unsigned Idx1 = *getElementIndex(IE1);
12231 unsigned Idx2 = *getElementIndex(IE2);
12232 do {
12233 if (I2 == IE1)
12234 return true;
12235 if (I1 == IE2)
12236 return false;
12237 PrevI1 = I1;
12238 PrevI2 = I2;
12239 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12240 getElementIndex(I1).value_or(Idx2) != Idx2)
12241 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12242 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12243 getElementIndex(I2).value_or(Idx1) != Idx1)
12244 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12245 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12246 llvm_unreachable("Two different buildvectors not expected.");
12247}
12248
12249namespace {
12250/// Returns incoming Value *, if the requested type is Value * too, or a default
12251/// value, otherwise.
12252struct ValueSelect {
12253 template <typename U>
12254 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12255 return V;
12256 }
12257 template <typename U>
12258 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12259 return U();
12260 }
12261};
12262} // namespace
12263
12264/// Does the analysis of the provided shuffle masks and performs the requested
12265/// actions on the vectors with the given shuffle masks. It tries to do it in
12266/// several steps.
12267/// 1. If the Base vector is not undef vector, resizing the very first mask to
12268/// have common VF and perform action for 2 input vectors (including non-undef
12269/// Base). Other shuffle masks are combined with the resulting after the 1 stage
12270/// and processed as a shuffle of 2 elements.
12271/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
12272/// action only for 1 vector with the given mask, if it is not the identity
12273/// mask.
12274/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12275/// vectors, combing the masks properly between the steps.
12276template <typename T>
12278 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12279 function_ref<unsigned(T *)> GetVF,
12280 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12282 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12283 SmallVector<int> Mask(ShuffleMask.begin()->second);
12284 auto VMIt = std::next(ShuffleMask.begin());
12285 T *Prev = nullptr;
12286 SmallBitVector UseMask =
12287 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12288 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12289 if (!IsBaseUndef.all()) {
12290 // Base is not undef, need to combine it with the next subvectors.
12291 std::pair<T *, bool> Res =
12292 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12293 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12294 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12295 if (Mask[Idx] == PoisonMaskElem)
12296 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12297 else
12298 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12299 }
12300 auto *V = ValueSelect::get<T *>(Base);
12301 (void)V;
12302 assert((!V || GetVF(V) == Mask.size()) &&
12303 "Expected base vector of VF number of elements.");
12304 Prev = Action(Mask, {nullptr, Res.first});
12305 } else if (ShuffleMask.size() == 1) {
12306 // Base is undef and only 1 vector is shuffled - perform the action only for
12307 // single vector, if the mask is not the identity mask.
12308 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12309 /*ForSingleMask=*/true);
12310 if (Res.second)
12311 // Identity mask is found.
12312 Prev = Res.first;
12313 else
12314 Prev = Action(Mask, {ShuffleMask.begin()->first});
12315 } else {
12316 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
12317 // shuffles step by step, combining shuffle between the steps.
12318 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12319 unsigned Vec2VF = GetVF(VMIt->first);
12320 if (Vec1VF == Vec2VF) {
12321 // No need to resize the input vectors since they are of the same size, we
12322 // can shuffle them directly.
12323 ArrayRef<int> SecMask = VMIt->second;
12324 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12325 if (SecMask[I] != PoisonMaskElem) {
12326 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12327 Mask[I] = SecMask[I] + Vec1VF;
12328 }
12329 }
12330 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12331 } else {
12332 // Vectors of different sizes - resize and reshuffle.
12333 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12334 /*ForSingleMask=*/false);
12335 std::pair<T *, bool> Res2 =
12336 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12337 ArrayRef<int> SecMask = VMIt->second;
12338 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12339 if (Mask[I] != PoisonMaskElem) {
12340 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12341 if (Res1.second)
12342 Mask[I] = I;
12343 } else if (SecMask[I] != PoisonMaskElem) {
12344 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12345 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12346 }
12347 }
12348 Prev = Action(Mask, {Res1.first, Res2.first});
12349 }
12350 VMIt = std::next(VMIt);
12351 }
12352 bool IsBaseNotUndef = !IsBaseUndef.all();
12353 (void)IsBaseNotUndef;
12354 // Perform requested actions for the remaining masks/vectors.
12355 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12356 // Shuffle other input vectors, if any.
12357 std::pair<T *, bool> Res =
12358 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12359 ArrayRef<int> SecMask = VMIt->second;
12360 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12361 if (SecMask[I] != PoisonMaskElem) {
12362 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12363 "Multiple uses of scalars.");
12364 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12365 } else if (Mask[I] != PoisonMaskElem) {
12366 Mask[I] = I;
12367 }
12368 }
12369 Prev = Action(Mask, {Prev, Res.first});
12370 }
12371 return Prev;
12372}
12373
12374namespace {
12375/// Data type for handling buildvector sequences with the reused scalars from
12376/// other tree entries.
12377template <typename T> struct ShuffledInsertData {
12378 /// List of insertelements to be replaced by shuffles.
12379 SmallVector<InsertElementInst *> InsertElements;
12380 /// The parent vectors and shuffle mask for the given list of inserts.
12382};
12383} // namespace
12384
12387 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12388 << VectorizableTree.size() << ".\n");
12389
12390 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12391
12392 SmallPtrSet<Value *, 4> CheckedExtracts;
12393 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12394 TreeEntry &TE = *VectorizableTree[I];
12395 // No need to count the cost for combined entries, they are combined and
12396 // just skip their cost.
12397 if (TE.State == TreeEntry::CombinedVectorize) {
12398 LLVM_DEBUG(
12399 dbgs() << "SLP: Skipping cost for combined node that starts with "
12400 << *TE.Scalars[0] << ".\n";
12401 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12402 continue;
12403 }
12404 if (TE.isGather()) {
12405 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12406 E && E->getVectorFactor() == TE.getVectorFactor() &&
12407 E->isSame(TE.Scalars)) {
12408 // Some gather nodes might be absolutely the same as some vectorizable
12409 // nodes after reordering, need to handle it.
12410 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12411 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12412 << "SLP: Current total cost = " << Cost << "\n");
12413 continue;
12414 }
12415 }
12416
12417 // Exclude cost of gather loads nodes which are not used. These nodes were
12418 // built as part of the final attempt to vectorize gathered loads.
12419 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12420 "Expected gather nodes with users only.");
12421
12422 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12423 Cost += C;
12424 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12425 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12426 << "SLP: Current total cost = " << Cost << "\n");
12427 }
12428
12429 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12430 InstructionCost ExtractCost = 0;
12432 SmallVector<APInt> DemandedElts;
12433 SmallDenseSet<Value *, 4> UsedInserts;
12435 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12437 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12438 // Keep track {Scalar, Index, User} tuple.
12439 // On AArch64, this helps in fusing a mov instruction, associated with
12440 // extractelement, with fmul in the backend so that extractelement is free.
12442 for (ExternalUser &EU : ExternalUses) {
12443 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12444 }
12445 for (ExternalUser &EU : ExternalUses) {
12446 // Uses by ephemeral values are free (because the ephemeral value will be
12447 // removed prior to code generation, and so the extraction will be
12448 // removed as well).
12449 if (EphValues.count(EU.User))
12450 continue;
12451
12452 // Used in unreachable blocks or in EH pads (rarely executed) or is
12453 // terminated with unreachable instruction.
12454 if (BasicBlock *UserParent =
12455 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12456 UserParent &&
12457 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12458 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12459 continue;
12460
12461 // We only add extract cost once for the same scalar.
12462 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12463 !ExtractCostCalculated.insert(EU.Scalar).second)
12464 continue;
12465
12466 // No extract cost for vector "scalar"
12467 if (isa<FixedVectorType>(EU.Scalar->getType()))
12468 continue;
12469
12470 // If found user is an insertelement, do not calculate extract cost but try
12471 // to detect it as a final shuffled/identity match.
12472 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12473 VU && VU->getOperand(1) == EU.Scalar) {
12474 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12475 if (!UsedInserts.insert(VU).second)
12476 continue;
12477 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12478 if (InsertIdx) {
12479 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12480 auto *It = find_if(
12481 ShuffledInserts,
12482 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12483 // Checks if 2 insertelements are from the same buildvector.
12484 InsertElementInst *VecInsert = Data.InsertElements.front();
12486 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12487 Value *Op0 = II->getOperand(0);
12488 if (getTreeEntry(II) && !getTreeEntry(Op0))
12489 return nullptr;
12490 return Op0;
12491 });
12492 });
12493 int VecId = -1;
12494 if (It == ShuffledInserts.end()) {
12495 auto &Data = ShuffledInserts.emplace_back();
12496 Data.InsertElements.emplace_back(VU);
12497 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12498 VecId = ShuffledInserts.size() - 1;
12499 auto It = MinBWs.find(ScalarTE);
12500 if (It != MinBWs.end() &&
12501 VectorCasts
12502 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12503 .second) {
12504 unsigned BWSz = It->second.first;
12505 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12506 unsigned VecOpcode;
12507 if (DstBWSz < BWSz)
12508 VecOpcode = Instruction::Trunc;
12509 else
12510 VecOpcode =
12511 It->second.second ? Instruction::SExt : Instruction::ZExt;
12514 VecOpcode, FTy,
12515 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12516 FTy->getNumElements()),
12518 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12519 << " for extending externally used vector with "
12520 "non-equal minimum bitwidth.\n");
12521 Cost += C;
12522 }
12523 } else {
12524 if (isFirstInsertElement(VU, It->InsertElements.front()))
12525 It->InsertElements.front() = VU;
12526 VecId = std::distance(ShuffledInserts.begin(), It);
12527 }
12528 int InIdx = *InsertIdx;
12529 SmallVectorImpl<int> &Mask =
12530 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12531 if (Mask.empty())
12532 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12533 Mask[InIdx] = EU.Lane;
12534 DemandedElts[VecId].setBit(InIdx);
12535 continue;
12536 }
12537 }
12538 }
12539
12541 // If we plan to rewrite the tree in a smaller type, we will need to sign
12542 // extend the extracted value back to the original type. Here, we account
12543 // for the extract and the added cost of the sign extend if needed.
12544 InstructionCost ExtraCost = TTI::TCC_Free;
12545 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12546 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12547 auto It = MinBWs.find(Entry);
12548 if (It != MinBWs.end()) {
12549 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12550 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12551 ? Instruction::ZExt
12552 : Instruction::SExt;
12553 VecTy = getWidenedType(MinTy, BundleWidth);
12554 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12555 VecTy, EU.Lane);
12556 } else {
12557 ExtraCost =
12558 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12559 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12560 }
12561 // Leave the scalar instructions as is if they are cheaper than extracts.
12562 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12563 Entry->getOpcode() == Instruction::Load) {
12564 // Checks if the user of the external scalar is phi in loop body.
12565 auto IsPhiInLoop = [&](const ExternalUser &U) {
12566 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12567 auto *I = cast<Instruction>(U.Scalar);
12568 const Loop *L = LI->getLoopFor(Phi->getParent());
12569 return L && (Phi->getParent() == I->getParent() ||
12570 L == LI->getLoopFor(I->getParent()));
12571 }
12572 return false;
12573 };
12574 if (!ValueToExtUses) {
12575 ValueToExtUses.emplace();
12576 for_each(enumerate(ExternalUses), [&](const auto &P) {
12577 // Ignore phis in loops.
12578 if (IsPhiInLoop(P.value()))
12579 return;
12580
12581 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12582 });
12583 }
12584 // Can use original instruction, if no operands vectorized or they are
12585 // marked as externally used already.
12586 auto *Inst = cast<Instruction>(EU.Scalar);
12587 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12588 auto OperandIsScalar = [&](Value *V) {
12589 if (!getTreeEntry(V)) {
12590 // Some extractelements might be not vectorized, but
12591 // transformed into shuffle and removed from the function,
12592 // consider it here.
12593 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12594 return !EE->hasOneUse() || !MustGather.contains(EE);
12595 return true;
12596 }
12597 return ValueToExtUses->contains(V);
12598 };
12599 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12600 bool CanBeUsedAsScalarCast = false;
12601 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12602 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12603 Op && all_of(Op->operands(), OperandIsScalar)) {
12604 InstructionCost OpCost =
12605 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12607 : 0;
12608 if (ScalarCost + OpCost <= ExtraCost) {
12609 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12610 ScalarCost += OpCost;
12611 }
12612 }
12613 }
12614 if (CanBeUsedAsScalar) {
12615 bool KeepScalar = ScalarCost <= ExtraCost;
12616 // Try to keep original scalar if the user is the phi node from the same
12617 // block as the root phis, currently vectorized. It allows to keep
12618 // better ordering info of PHIs, being vectorized currently.
12619 bool IsProfitablePHIUser =
12620 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12621 VectorizableTree.front()->Scalars.size() > 2)) &&
12622 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12623 !Inst->hasNUsesOrMore(UsesLimit) &&
12624 none_of(Inst->users(),
12625 [&](User *U) {
12626 auto *PHIUser = dyn_cast<PHINode>(U);
12627 return (!PHIUser ||
12628 PHIUser->getParent() !=
12629 cast<Instruction>(
12630 VectorizableTree.front()->getMainOp())
12631 ->getParent()) &&
12632 !getTreeEntry(U);
12633 }) &&
12634 count_if(Entry->Scalars, [&](Value *V) {
12635 return ValueToExtUses->contains(V);
12636 }) <= 2;
12637 if (IsProfitablePHIUser) {
12638 KeepScalar = true;
12639 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12640 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12641 (!GatheredLoadsEntriesFirst.has_value() ||
12642 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12643 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12644 return ValueToExtUses->contains(V);
12645 });
12646 auto It = ExtractsCount.find(Entry);
12647 if (It != ExtractsCount.end()) {
12648 assert(ScalarUsesCount >= It->getSecond().size() &&
12649 "Expected total number of external uses not less than "
12650 "number of scalar uses.");
12651 ScalarUsesCount -= It->getSecond().size();
12652 }
12653 // Keep original scalar if number of externally used instructions in
12654 // the same entry is not power of 2. It may help to do some extra
12655 // vectorization for now.
12656 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12657 }
12658 if (KeepScalar) {
12659 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12660 for_each(Inst->operands(), [&](Value *V) {
12661 auto It = ValueToExtUses->find(V);
12662 if (It != ValueToExtUses->end()) {
12663 // Replace all uses to avoid compiler crash.
12664 ExternalUses[It->second].User = nullptr;
12665 }
12666 });
12667 ExtraCost = ScalarCost;
12668 if (!IsPhiInLoop(EU))
12669 ExtractsCount[Entry].insert(Inst);
12670 if (CanBeUsedAsScalarCast) {
12671 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12672 // Update the users of the operands of the cast operand to avoid
12673 // compiler crash.
12674 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12675 for_each(IOp->operands(), [&](Value *V) {
12676 auto It = ValueToExtUses->find(V);
12677 if (It != ValueToExtUses->end()) {
12678 // Replace all uses to avoid compiler crash.
12679 ExternalUses[It->second].User = nullptr;
12680 }
12681 });
12682 }
12683 }
12684 }
12685 }
12686 }
12687
12688 ExtractCost += ExtraCost;
12689 }
12690 // Insert externals for extract of operands of casts to be emitted as scalars
12691 // instead of extractelement.
12692 for (Value *V : ScalarOpsFromCasts) {
12693 ExternalUsesAsOriginalScalar.insert(V);
12694 if (const TreeEntry *E = getTreeEntry(V)) {
12695 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12696 }
12697 }
12698 // Add reduced value cost, if resized.
12699 if (!VectorizedVals.empty()) {
12700 const TreeEntry &Root = *VectorizableTree.front();
12701 auto BWIt = MinBWs.find(&Root);
12702 if (BWIt != MinBWs.end()) {
12703 Type *DstTy = Root.Scalars.front()->getType();
12704 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12705 unsigned SrcSz =
12706 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12707 if (OriginalSz != SrcSz) {
12708 unsigned Opcode = Instruction::Trunc;
12709 if (OriginalSz > SrcSz)
12710 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12711 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12712 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12713 assert(SLPReVec && "Only supported by REVEC.");
12714 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12715 }
12716 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12719 }
12720 }
12721 }
12722
12723 InstructionCost SpillCost = getSpillCost();
12724 Cost += SpillCost + ExtractCost;
12725 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12726 bool) {
12727 InstructionCost C = 0;
12728 unsigned VF = Mask.size();
12729 unsigned VecVF = TE->getVectorFactor();
12730 if (VF != VecVF &&
12731 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12733 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12734 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12735 OrigMask.begin());
12737 getWidenedType(TE->getMainOp()->getType(), VecVF),
12738 OrigMask);
12739 LLVM_DEBUG(
12740 dbgs() << "SLP: Adding cost " << C
12741 << " for final shuffle of insertelement external users.\n";
12742 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12743 Cost += C;
12744 return std::make_pair(TE, true);
12745 }
12746 return std::make_pair(TE, false);
12747 };
12748 // Calculate the cost of the reshuffled vectors, if any.
12749 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12750 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12751 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12752 unsigned VF = 0;
12753 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12755 assert((TEs.size() == 1 || TEs.size() == 2) &&
12756 "Expected exactly 1 or 2 tree entries.");
12757 if (TEs.size() == 1) {
12758 if (VF == 0)
12759 VF = TEs.front()->getVectorFactor();
12760 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12761 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12762 !all_of(enumerate(Mask), [=](const auto &Data) {
12763 return Data.value() == PoisonMaskElem ||
12764 (Data.index() < VF &&
12765 static_cast<int>(Data.index()) == Data.value());
12766 })) {
12769 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12770 << " for final shuffle of insertelement "
12771 "external users.\n";
12772 TEs.front()->dump();
12773 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12774 Cost += C;
12775 }
12776 } else {
12777 if (VF == 0) {
12778 if (TEs.front() &&
12779 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12780 VF = TEs.front()->getVectorFactor();
12781 else
12782 VF = Mask.size();
12783 }
12784 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12787 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12788 << " for final shuffle of vector node and external "
12789 "insertelement users.\n";
12790 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12791 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12792 Cost += C;
12793 }
12794 VF = Mask.size();
12795 return TEs.back();
12796 };
12797 (void)performExtractsShuffleAction<const TreeEntry>(
12798 MutableArrayRef(Vector.data(), Vector.size()), Base,
12799 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12800 EstimateShufflesCost);
12802 cast<FixedVectorType>(
12803 ShuffledInserts[I].InsertElements.front()->getType()),
12804 DemandedElts[I],
12805 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12806 Cost -= InsertCost;
12807 }
12808
12809 // Add the cost for reduced value resize (if required).
12810 if (ReductionBitWidth != 0) {
12811 assert(UserIgnoreList && "Expected reduction tree.");
12812 const TreeEntry &E = *VectorizableTree.front();
12813 auto It = MinBWs.find(&E);
12814 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12815 unsigned SrcSize = It->second.first;
12816 unsigned DstSize = ReductionBitWidth;
12817 unsigned Opcode = Instruction::Trunc;
12818 if (SrcSize < DstSize) {
12819 bool IsArithmeticExtendedReduction =
12820 all_of(*UserIgnoreList, [](Value *V) {
12821 auto *I = cast<Instruction>(V);
12822 return is_contained({Instruction::Add, Instruction::FAdd,
12823 Instruction::Mul, Instruction::FMul,
12824 Instruction::And, Instruction::Or,
12825 Instruction::Xor},
12826 I->getOpcode());
12827 });
12828 if (IsArithmeticExtendedReduction)
12829 Opcode =
12830 Instruction::BitCast; // Handle it by getExtendedReductionCost
12831 else
12832 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12833 }
12834 if (Opcode != Instruction::BitCast) {
12835 auto *SrcVecTy =
12836 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12837 auto *DstVecTy =
12838 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12839 TTI::CastContextHint CCH = getCastContextHint(E);
12840 InstructionCost CastCost;
12841 switch (E.getOpcode()) {
12842 case Instruction::SExt:
12843 case Instruction::ZExt:
12844 case Instruction::Trunc: {
12845 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12846 CCH = getCastContextHint(*OpTE);
12847 break;
12848 }
12849 default:
12850 break;
12851 }
12852 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12854 Cost += CastCost;
12855 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12856 << " for final resize for reduction from " << SrcVecTy
12857 << " to " << DstVecTy << "\n";
12858 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12859 }
12860 }
12861 }
12862
12863#ifndef NDEBUG
12864 SmallString<256> Str;
12865 {
12867 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12868 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12869 << "SLP: Total Cost = " << Cost << ".\n";
12870 }
12871 LLVM_DEBUG(dbgs() << Str);
12872 if (ViewSLPTree)
12873 ViewGraph(this, "SLP" + F->getName(), false, Str);
12874#endif
12875
12876 return Cost;
12877}
12878
12879/// Tries to find extractelement instructions with constant indices from fixed
12880/// vector type and gather such instructions into a bunch, which highly likely
12881/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12882/// successful, the matched scalars are replaced by poison values in \p VL for
12883/// future analysis.
12884std::optional<TTI::ShuffleKind>
12885BoUpSLP::tryToGatherSingleRegisterExtractElements(
12887 // Scan list of gathered scalars for extractelements that can be represented
12888 // as shuffles.
12890 SmallVector<int> UndefVectorExtracts;
12891 for (int I = 0, E = VL.size(); I < E; ++I) {
12892 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12893 if (!EI) {
12894 if (isa<UndefValue>(VL[I]))
12895 UndefVectorExtracts.push_back(I);
12896 continue;
12897 }
12898 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12899 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12900 continue;
12901 std::optional<unsigned> Idx = getExtractIndex(EI);
12902 // Undefined index.
12903 if (!Idx) {
12904 UndefVectorExtracts.push_back(I);
12905 continue;
12906 }
12907 if (Idx >= VecTy->getNumElements()) {
12908 UndefVectorExtracts.push_back(I);
12909 continue;
12910 }
12911 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12912 ExtractMask.reset(*Idx);
12913 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12914 UndefVectorExtracts.push_back(I);
12915 continue;
12916 }
12917 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12918 }
12919 // Sort the vector operands by the maximum number of uses in extractelements.
12921 VectorOpToIdx.takeVector();
12922 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12923 return P1.second.size() > P2.second.size();
12924 });
12925 // Find the best pair of the vectors or a single vector.
12926 const int UndefSz = UndefVectorExtracts.size();
12927 unsigned SingleMax = 0;
12928 unsigned PairMax = 0;
12929 if (!Vectors.empty()) {
12930 SingleMax = Vectors.front().second.size() + UndefSz;
12931 if (Vectors.size() > 1) {
12932 auto *ItNext = std::next(Vectors.begin());
12933 PairMax = SingleMax + ItNext->second.size();
12934 }
12935 }
12936 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12937 return std::nullopt;
12938 // Check if better to perform a shuffle of 2 vectors or just of a single
12939 // vector.
12940 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
12941 SmallVector<Value *> GatheredExtracts(
12942 VL.size(), PoisonValue::get(VL.front()->getType()));
12943 if (SingleMax >= PairMax && SingleMax) {
12944 for (int Idx : Vectors.front().second)
12945 std::swap(GatheredExtracts[Idx], VL[Idx]);
12946 } else if (!Vectors.empty()) {
12947 for (unsigned Idx : {0, 1})
12948 for (int Idx : Vectors[Idx].second)
12949 std::swap(GatheredExtracts[Idx], VL[Idx]);
12950 }
12951 // Add extracts from undefs too.
12952 for (int Idx : UndefVectorExtracts)
12953 std::swap(GatheredExtracts[Idx], VL[Idx]);
12954 // Check that gather of extractelements can be represented as just a
12955 // shuffle of a single/two vectors the scalars are extracted from.
12956 std::optional<TTI::ShuffleKind> Res =
12957 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
12958 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
12959 // TODO: try to check other subsets if possible.
12960 // Restore the original VL if attempt was not successful.
12961 copy(SavedVL, VL.begin());
12962 return std::nullopt;
12963 }
12964 // Restore unused scalars from mask, if some of the extractelements were not
12965 // selected for shuffle.
12966 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
12967 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
12968 isa<UndefValue>(GatheredExtracts[I])) {
12969 std::swap(VL[I], GatheredExtracts[I]);
12970 continue;
12971 }
12972 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12973 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
12974 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
12975 is_contained(UndefVectorExtracts, I))
12976 continue;
12977 }
12978 return Res;
12979}
12980
12981/// Tries to find extractelement instructions with constant indices from fixed
12982/// vector type and gather such instructions into a bunch, which highly likely
12983/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12984/// successful, the matched scalars are replaced by poison values in \p VL for
12985/// future analysis.
12987BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
12989 unsigned NumParts) const {
12990 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
12991 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
12992 Mask.assign(VL.size(), PoisonMaskElem);
12993 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
12994 for (unsigned Part : seq<unsigned>(NumParts)) {
12995 // Scan list of gathered scalars for extractelements that can be represented
12996 // as shuffles.
12998 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
12999 SmallVector<int> SubMask;
13000 std::optional<TTI::ShuffleKind> Res =
13001 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13002 ShufflesRes[Part] = Res;
13003 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13004 }
13005 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13006 return Res.has_value();
13007 }))
13008 ShufflesRes.clear();
13009 return ShufflesRes;
13010}
13011
13012std::optional<TargetTransformInfo::ShuffleKind>
13013BoUpSLP::isGatherShuffledSingleRegisterEntry(
13014 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13015 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13016 Entries.clear();
13017 // TODO: currently checking only for Scalars in the tree entry, need to count
13018 // reused elements too for better cost estimation.
13019 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13020 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13021 : TE->UserTreeIndices.front();
13022 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13023 const BasicBlock *TEInsertBlock = nullptr;
13024 // Main node of PHI entries keeps the correct order of operands/incoming
13025 // blocks.
13026 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13027 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13028 TEInsertPt = TEInsertBlock->getTerminator();
13029 } else {
13030 TEInsertBlock = TEInsertPt->getParent();
13031 }
13032 if (!DT->isReachableFromEntry(TEInsertBlock))
13033 return std::nullopt;
13034 auto *NodeUI = DT->getNode(TEInsertBlock);
13035 assert(NodeUI && "Should only process reachable instructions");
13036 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13037 auto CheckOrdering = [&](const Instruction *InsertPt) {
13038 // Argument InsertPt is an instruction where vector code for some other
13039 // tree entry (one that shares one or more scalars with TE) is going to be
13040 // generated. This lambda returns true if insertion point of vector code
13041 // for the TE dominates that point (otherwise dependency is the other way
13042 // around). The other node is not limited to be of a gather kind. Gather
13043 // nodes are not scheduled and their vector code is inserted before their
13044 // first user. If user is PHI, that is supposed to be at the end of a
13045 // predecessor block. Otherwise it is the last instruction among scalars of
13046 // the user node. So, instead of checking dependency between instructions
13047 // themselves, we check dependency between their insertion points for vector
13048 // code (since each scalar instruction ends up as a lane of a vector
13049 // instruction).
13050 const BasicBlock *InsertBlock = InsertPt->getParent();
13051 auto *NodeEUI = DT->getNode(InsertBlock);
13052 if (!NodeEUI)
13053 return false;
13054 assert((NodeUI == NodeEUI) ==
13055 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13056 "Different nodes should have different DFS numbers");
13057 // Check the order of the gather nodes users.
13058 if (TEInsertPt->getParent() != InsertBlock &&
13059 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13060 return false;
13061 if (TEInsertPt->getParent() == InsertBlock &&
13062 TEInsertPt->comesBefore(InsertPt))
13063 return false;
13064 return true;
13065 };
13066 // Find all tree entries used by the gathered values. If no common entries
13067 // found - not a shuffle.
13068 // Here we build a set of tree nodes for each gathered value and trying to
13069 // find the intersection between these sets. If we have at least one common
13070 // tree node for each gathered value - we have just a permutation of the
13071 // single vector. If we have 2 different sets, we're in situation where we
13072 // have a permutation of 2 input vectors.
13074 DenseMap<Value *, int> UsedValuesEntry;
13075 for (Value *V : VL) {
13076 if (isConstant(V))
13077 continue;
13078 // Build a list of tree entries where V is used.
13080 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13081 if (TEPtr == TE || TEPtr->Idx == 0)
13082 continue;
13083 assert(any_of(TEPtr->Scalars,
13084 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13085 "Must contain at least single gathered value.");
13086 assert(TEPtr->UserTreeIndices.size() == 1 &&
13087 "Expected only single user of a gather node.");
13088 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13089
13090 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13091 const Instruction *InsertPt =
13092 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13093 : &getLastInstructionInBundle(UseEI.UserTE);
13094 if (TEInsertPt == InsertPt) {
13095 // If 2 gathers are operands of the same entry (regardless of whether
13096 // user is PHI or else), compare operands indices, use the earlier one
13097 // as the base.
13098 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13099 continue;
13100 // If the user instruction is used for some reason in different
13101 // vectorized nodes - make it depend on index.
13102 if (TEUseEI.UserTE != UseEI.UserTE &&
13103 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13104 continue;
13105 }
13106
13107 // Check if the user node of the TE comes after user node of TEPtr,
13108 // otherwise TEPtr depends on TE.
13109 if ((TEInsertBlock != InsertPt->getParent() ||
13110 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13111 !CheckOrdering(InsertPt))
13112 continue;
13113 VToTEs.insert(TEPtr);
13114 }
13115 if (const TreeEntry *VTE = getTreeEntry(V)) {
13116 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13117 if (VTE->State != TreeEntry::Vectorize) {
13118 auto It = MultiNodeScalars.find(V);
13119 if (It == MultiNodeScalars.end())
13120 continue;
13121 VTE = *It->getSecond().begin();
13122 // Iterate through all vectorized nodes.
13123 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13124 return MTE->State == TreeEntry::Vectorize;
13125 });
13126 if (MIt == It->getSecond().end())
13127 continue;
13128 VTE = *MIt;
13129 }
13130 }
13131 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13132 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13133 continue;
13134 VToTEs.insert(VTE);
13135 }
13136 if (VToTEs.empty())
13137 continue;
13138 if (UsedTEs.empty()) {
13139 // The first iteration, just insert the list of nodes to vector.
13140 UsedTEs.push_back(VToTEs);
13141 UsedValuesEntry.try_emplace(V, 0);
13142 } else {
13143 // Need to check if there are any previously used tree nodes which use V.
13144 // If there are no such nodes, consider that we have another one input
13145 // vector.
13146 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13147 unsigned Idx = 0;
13148 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13149 // Do we have a non-empty intersection of previously listed tree entries
13150 // and tree entries using current V?
13151 set_intersect(VToTEs, Set);
13152 if (!VToTEs.empty()) {
13153 // Yes, write the new subset and continue analysis for the next
13154 // scalar.
13155 Set.swap(VToTEs);
13156 break;
13157 }
13158 VToTEs = SavedVToTEs;
13159 ++Idx;
13160 }
13161 // No non-empty intersection found - need to add a second set of possible
13162 // source vectors.
13163 if (Idx == UsedTEs.size()) {
13164 // If the number of input vectors is greater than 2 - not a permutation,
13165 // fallback to the regular gather.
13166 // TODO: support multiple reshuffled nodes.
13167 if (UsedTEs.size() == 2)
13168 continue;
13169 UsedTEs.push_back(SavedVToTEs);
13170 Idx = UsedTEs.size() - 1;
13171 }
13172 UsedValuesEntry.try_emplace(V, Idx);
13173 }
13174 }
13175
13176 if (UsedTEs.empty()) {
13177 Entries.clear();
13178 return std::nullopt;
13179 }
13180
13181 unsigned VF = 0;
13182 if (UsedTEs.size() == 1) {
13183 // Keep the order to avoid non-determinism.
13184 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13185 UsedTEs.front().end());
13186 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13187 return TE1->Idx < TE2->Idx;
13188 });
13189 // Try to find the perfect match in another gather node at first.
13190 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13191 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13192 });
13193 if (It != FirstEntries.end() &&
13194 ((*It)->getVectorFactor() == VL.size() ||
13195 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13196 TE->ReuseShuffleIndices.size() == VL.size() &&
13197 (*It)->isSame(TE->Scalars)))) {
13198 Entries.push_back(*It);
13199 if ((*It)->getVectorFactor() == VL.size()) {
13200 std::iota(std::next(Mask.begin(), Part * VL.size()),
13201 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13202 } else {
13203 SmallVector<int> CommonMask = TE->getCommonMask();
13204 copy(CommonMask, Mask.begin());
13205 }
13206 // Clear undef scalars.
13207 for (unsigned I : seq<unsigned>(VL.size()))
13208 if (isa<PoisonValue>(VL[I]))
13209 Mask[Part * VL.size() + I] = PoisonMaskElem;
13211 }
13212 // No perfect match, just shuffle, so choose the first tree node from the
13213 // tree.
13214 Entries.push_back(FirstEntries.front());
13215 VF = FirstEntries.front()->getVectorFactor();
13216 } else {
13217 // Try to find nodes with the same vector factor.
13218 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13219 // Keep the order of tree nodes to avoid non-determinism.
13221 for (const TreeEntry *TE : UsedTEs.front()) {
13222 unsigned VF = TE->getVectorFactor();
13223 auto It = VFToTE.find(VF);
13224 if (It != VFToTE.end()) {
13225 if (It->second->Idx > TE->Idx)
13226 It->getSecond() = TE;
13227 continue;
13228 }
13229 VFToTE.try_emplace(VF, TE);
13230 }
13231 // Same, keep the order to avoid non-determinism.
13232 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13233 UsedTEs.back().end());
13234 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13235 return TE1->Idx < TE2->Idx;
13236 });
13237 for (const TreeEntry *TE : SecondEntries) {
13238 auto It = VFToTE.find(TE->getVectorFactor());
13239 if (It != VFToTE.end()) {
13240 VF = It->first;
13241 Entries.push_back(It->second);
13242 Entries.push_back(TE);
13243 break;
13244 }
13245 }
13246 // No 2 source vectors with the same vector factor - just choose 2 with max
13247 // index.
13248 if (Entries.empty()) {
13249 Entries.push_back(*llvm::max_element(
13250 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13251 return TE1->Idx < TE2->Idx;
13252 }));
13253 Entries.push_back(SecondEntries.front());
13254 VF = std::max(Entries.front()->getVectorFactor(),
13255 Entries.back()->getVectorFactor());
13256 } else {
13257 VF = Entries.front()->getVectorFactor();
13258 }
13259 }
13260
13261 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13262 // Checks if the 2 PHIs are compatible in terms of high possibility to be
13263 // vectorized.
13264 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13265 auto *PHI = cast<PHINode>(V);
13266 auto *PHI1 = cast<PHINode>(V1);
13267 // Check that all incoming values are compatible/from same parent (if they
13268 // are instructions).
13269 // The incoming values are compatible if they all are constants, or
13270 // instruction with the same/alternate opcodes from the same basic block.
13271 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13272 Value *In = PHI->getIncomingValue(I);
13273 Value *In1 = PHI1->getIncomingValue(I);
13274 if (isConstant(In) && isConstant(In1))
13275 continue;
13276 if (!getSameOpcode({In, In1}, *TLI))
13277 return false;
13278 if (cast<Instruction>(In)->getParent() !=
13279 cast<Instruction>(In1)->getParent())
13280 return false;
13281 }
13282 return true;
13283 };
13284 // Check if the value can be ignored during analysis for shuffled gathers.
13285 // We suppose it is better to ignore instruction, which do not form splats,
13286 // are not vectorized/not extractelements (these instructions will be handled
13287 // by extractelements processing) or may form vector node in future.
13288 auto MightBeIgnored = [=](Value *V) {
13289 auto *I = dyn_cast<Instruction>(V);
13290 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13292 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13293 };
13294 // Check that the neighbor instruction may form a full vector node with the
13295 // current instruction V. It is possible, if they have same/alternate opcode
13296 // and same parent basic block.
13297 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13298 Value *V1 = VL[Idx];
13299 bool UsedInSameVTE = false;
13300 auto It = UsedValuesEntry.find(V1);
13301 if (It != UsedValuesEntry.end())
13302 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13303 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13304 getSameOpcode({V, V1}, *TLI) &&
13305 cast<Instruction>(V)->getParent() ==
13306 cast<Instruction>(V1)->getParent() &&
13307 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13308 };
13309 // Build a shuffle mask for better cost estimation and vector emission.
13310 SmallBitVector UsedIdxs(Entries.size());
13312 for (int I = 0, E = VL.size(); I < E; ++I) {
13313 Value *V = VL[I];
13314 auto It = UsedValuesEntry.find(V);
13315 if (It == UsedValuesEntry.end())
13316 continue;
13317 // Do not try to shuffle scalars, if they are constants, or instructions
13318 // that can be vectorized as a result of the following vector build
13319 // vectorization.
13320 if (isConstant(V) || (MightBeIgnored(V) &&
13321 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13322 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13323 continue;
13324 unsigned Idx = It->second;
13325 EntryLanes.emplace_back(Idx, I);
13326 UsedIdxs.set(Idx);
13327 }
13328 // Iterate through all shuffled scalars and select entries, which can be used
13329 // for final shuffle.
13331 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13332 if (!UsedIdxs.test(I))
13333 continue;
13334 // Fix the entry number for the given scalar. If it is the first entry, set
13335 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
13336 // These indices are used when calculating final shuffle mask as the vector
13337 // offset.
13338 for (std::pair<unsigned, int> &Pair : EntryLanes)
13339 if (Pair.first == I)
13340 Pair.first = TempEntries.size();
13341 TempEntries.push_back(Entries[I]);
13342 }
13343 Entries.swap(TempEntries);
13344 if (EntryLanes.size() == Entries.size() &&
13345 !VL.equals(ArrayRef(TE->Scalars)
13346 .slice(Part * VL.size(),
13347 std::min<int>(VL.size(), TE->Scalars.size())))) {
13348 // We may have here 1 or 2 entries only. If the number of scalars is equal
13349 // to the number of entries, no need to do the analysis, it is not very
13350 // profitable. Since VL is not the same as TE->Scalars, it means we already
13351 // have some shuffles before. Cut off not profitable case.
13352 Entries.clear();
13353 return std::nullopt;
13354 }
13355 // Build the final mask, check for the identity shuffle, if possible.
13356 bool IsIdentity = Entries.size() == 1;
13357 // Pair.first is the offset to the vector, while Pair.second is the index of
13358 // scalar in the list.
13359 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13360 unsigned Idx = Part * VL.size() + Pair.second;
13361 Mask[Idx] =
13362 Pair.first * VF +
13363 (ForOrder ? std::distance(
13364 Entries[Pair.first]->Scalars.begin(),
13365 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13366 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13367 IsIdentity &= Mask[Idx] == Pair.second;
13368 }
13369 if (ForOrder || IsIdentity || Entries.empty()) {
13370 switch (Entries.size()) {
13371 case 1:
13372 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13374 break;
13375 case 2:
13376 if (EntryLanes.size() > 2 || VL.size() <= 2)
13378 break;
13379 default:
13380 break;
13381 }
13382 } else if (!isa<VectorType>(VL.front()->getType()) &&
13383 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13384 // Do the cost estimation if shuffle beneficial than buildvector.
13385 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13386 std::next(Mask.begin(), (Part + 1) * VL.size()));
13387 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13388 for (int Idx : SubMask) {
13389 if (Idx == PoisonMaskElem)
13390 continue;
13391 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13392 MinElement = Idx;
13393 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13394 MaxElement = Idx;
13395 }
13396 assert(MaxElement >= 0 && MinElement >= 0 &&
13397 MaxElement % VF >= MinElement % VF &&
13398 "Expected at least single element.");
13399 unsigned NewVF = std::max<unsigned>(
13400 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13401 (MaxElement % VF) -
13402 (MinElement % VF) + 1));
13403 if (NewVF < VF) {
13404 for_each(SubMask, [&](int &Idx) {
13405 if (Idx == PoisonMaskElem)
13406 return;
13407 Idx = (Idx % VF) - (MinElement % VF) +
13408 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13409 });
13410 VF = NewVF;
13411 }
13412
13414 auto *VecTy = getWidenedType(VL.front()->getType(), VF);
13415 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13416 auto GetShuffleCost = [&,
13419 VectorType *VecTy) -> InstructionCost {
13420 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13422 Mask, Entries.front()->getInterleaveFactor()))
13423 return TTI::TCC_Free;
13424 return ::getShuffleCost(TTI,
13425 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13427 VecTy, Mask, CostKind);
13428 };
13429 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13430 InstructionCost FirstShuffleCost = 0;
13431 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13432 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13433 FirstShuffleCost = ShuffleCost;
13434 } else {
13435 // Transform mask to include only first entry.
13436 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13437 bool IsIdentity = true;
13438 for (auto [I, Idx] : enumerate(FirstMask)) {
13439 if (Idx >= static_cast<int>(VF)) {
13441 } else {
13442 DemandedElts.clearBit(I);
13443 if (Idx != PoisonMaskElem)
13444 IsIdentity &= static_cast<int>(I) == Idx;
13445 }
13446 }
13447 if (!IsIdentity)
13448 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13449 FirstShuffleCost += TTI->getScalarizationOverhead(
13450 MaskVecTy, DemandedElts, /*Insert=*/true,
13451 /*Extract=*/false, CostKind);
13452 }
13453 InstructionCost SecondShuffleCost = 0;
13454 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13455 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13456 SecondShuffleCost = ShuffleCost;
13457 } else {
13458 // Transform mask to include only first entry.
13459 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13460 bool IsIdentity = true;
13461 for (auto [I, Idx] : enumerate(SecondMask)) {
13462 if (Idx < static_cast<int>(VF) && Idx >= 0) {
13464 } else {
13465 DemandedElts.clearBit(I);
13466 if (Idx != PoisonMaskElem) {
13467 Idx -= VF;
13468 IsIdentity &= static_cast<int>(I) == Idx;
13469 }
13470 }
13471 }
13472 if (!IsIdentity)
13473 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13474 SecondShuffleCost += TTI->getScalarizationOverhead(
13475 MaskVecTy, DemandedElts, /*Insert=*/true,
13476 /*Extract=*/false, CostKind);
13477 }
13478 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13479 for (auto [I, Idx] : enumerate(SubMask))
13480 if (Idx == PoisonMaskElem)
13481 DemandedElts.clearBit(I);
13482 InstructionCost BuildVectorCost =
13483 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13484 /*Extract=*/false, CostKind);
13485 const TreeEntry *BestEntry = nullptr;
13486 if (FirstShuffleCost < ShuffleCost) {
13487 copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
13488 BestEntry = Entries.front();
13489 ShuffleCost = FirstShuffleCost;
13490 }
13491 if (SecondShuffleCost < ShuffleCost) {
13492 copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
13493 BestEntry = Entries[1];
13494 ShuffleCost = SecondShuffleCost;
13495 }
13496 if (BuildVectorCost >= ShuffleCost) {
13497 if (BestEntry) {
13498 Entries.clear();
13499 Entries.push_back(BestEntry);
13500 }
13501 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13503 }
13504 }
13505 Entries.clear();
13506 // Clear the corresponding mask elements.
13507 std::fill(std::next(Mask.begin(), Part * VL.size()),
13508 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13509 return std::nullopt;
13510}
13511
13513BoUpSLP::isGatherShuffledEntry(
13514 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13515 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13516 bool ForOrder) {
13517 assert(NumParts > 0 && NumParts < VL.size() &&
13518 "Expected positive number of registers.");
13519 Entries.clear();
13520 // No need to check for the topmost gather node.
13521 if (TE == VectorizableTree.front().get() &&
13522 (!GatheredLoadsEntriesFirst.has_value() ||
13523 none_of(ArrayRef(VectorizableTree).drop_front(),
13524 [](const std::unique_ptr<TreeEntry> &TE) {
13525 return !TE->isGather();
13526 })))
13527 return {};
13528 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13529 if (TE->isNonPowOf2Vec())
13530 return {};
13531 Mask.assign(VL.size(), PoisonMaskElem);
13532 assert((TE->UserTreeIndices.size() == 1 ||
13533 TE == VectorizableTree.front().get()) &&
13534 "Expected only single user of the gather node.");
13535 assert(VL.size() % NumParts == 0 &&
13536 "Number of scalars must be divisible by NumParts.");
13537 if (!TE->UserTreeIndices.empty() &&
13538 TE->UserTreeIndices.front().UserTE->isGather() &&
13539 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13540 assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
13541 isSplat(TE->Scalars)) &&
13542 "Expected splat or extractelements only node.");
13543 return {};
13544 }
13545 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13547 for (unsigned Part : seq<unsigned>(NumParts)) {
13548 ArrayRef<Value *> SubVL =
13549 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13550 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13551 std::optional<TTI::ShuffleKind> SubRes =
13552 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13553 ForOrder);
13554 if (!SubRes)
13555 SubEntries.clear();
13556 Res.push_back(SubRes);
13557 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13558 SubEntries.front()->getVectorFactor() == VL.size() &&
13559 (SubEntries.front()->isSame(TE->Scalars) ||
13560 SubEntries.front()->isSame(VL))) {
13561 SmallVector<const TreeEntry *> LocalSubEntries;
13562 LocalSubEntries.swap(SubEntries);
13563 Entries.clear();
13564 Res.clear();
13565 std::iota(Mask.begin(), Mask.end(), 0);
13566 // Clear undef scalars.
13567 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13568 if (isa<PoisonValue>(VL[I]))
13570 Entries.emplace_back(1, LocalSubEntries.front());
13572 return Res;
13573 }
13574 }
13575 if (all_of(Res,
13576 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13577 Entries.clear();
13578 return {};
13579 }
13580 return Res;
13581}
13582
13583InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13584 Type *ScalarTy) const {
13585 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13586 bool DuplicateNonConst = false;
13587 // Find the cost of inserting/extracting values from the vector.
13588 // Check if the same elements are inserted several times and count them as
13589 // shuffle candidates.
13590 APInt ShuffledElements = APInt::getZero(VL.size());
13591 DenseMap<Value *, unsigned> UniqueElements;
13594 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13595 if (V->getType() != ScalarTy) {
13596 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13598 V = nullptr;
13599 }
13600 if (!ForPoisonSrc)
13601 Cost +=
13602 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13603 I, Constant::getNullValue(VecTy), V);
13604 };
13605 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13606 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13607 Value *V = VL[I];
13608 // No need to shuffle duplicates for constants.
13609 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13610 ShuffledElements.setBit(I);
13611 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13612 continue;
13613 }
13614
13615 auto Res = UniqueElements.try_emplace(V, I);
13616 if (Res.second) {
13617 EstimateInsertCost(I, V);
13618 ShuffleMask[I] = I;
13619 continue;
13620 }
13621
13622 DuplicateNonConst = true;
13623 ShuffledElements.setBit(I);
13624 ShuffleMask[I] = Res.first->second;
13625 }
13626 if (ForPoisonSrc) {
13627 if (isa<FixedVectorType>(ScalarTy)) {
13628 assert(SLPReVec && "Only supported by REVEC.");
13629 // We don't need to insert elements one by one. Instead, we can insert the
13630 // entire vector into the destination.
13631 Cost = 0;
13632 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13633 for (unsigned I : seq<unsigned>(VL.size()))
13634 if (!ShuffledElements[I])
13636 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13637 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13638 } else {
13640 /*DemandedElts*/ ~ShuffledElements,
13641 /*Insert*/ true,
13642 /*Extract*/ false, CostKind, VL);
13643 }
13644 }
13645 if (DuplicateNonConst)
13647 VecTy, ShuffleMask);
13648 return Cost;
13649}
13650
13651Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13652 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13653 if (Res)
13654 return *Res;
13655 // Get the basic block this bundle is in. All instructions in the bundle
13656 // should be in this block (except for extractelement-like instructions with
13657 // constant indices or gathered loads).
13658 auto *Front = E->getMainOp();
13659 auto *BB = Front->getParent();
13660 assert(((GatheredLoadsEntriesFirst.has_value() &&
13661 E->getOpcode() == Instruction::Load && E->isGather() &&
13662 E->Idx < *GatheredLoadsEntriesFirst) ||
13663 all_of(E->Scalars,
13664 [=](Value *V) -> bool {
13665 if (E->getOpcode() == Instruction::GetElementPtr &&
13666 !isa<GetElementPtrInst>(V))
13667 return true;
13668 auto *I = dyn_cast<Instruction>(V);
13669 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13670 isVectorLikeInstWithConstOps(I);
13671 })) &&
13672 "Expected gathered loads or GEPs or instructions from same basic "
13673 "block.");
13674
13675 auto FindLastInst = [&]() {
13676 Instruction *LastInst = Front;
13677 for (Value *V : E->Scalars) {
13678 auto *I = dyn_cast<Instruction>(V);
13679 if (!I)
13680 continue;
13681 if (LastInst->getParent() == I->getParent()) {
13682 if (LastInst->comesBefore(I))
13683 LastInst = I;
13684 continue;
13685 }
13686 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13687 !isa<GetElementPtrInst>(I)) ||
13688 (isVectorLikeInstWithConstOps(LastInst) &&
13690 (GatheredLoadsEntriesFirst.has_value() &&
13691 E->getOpcode() == Instruction::Load && E->isGather() &&
13692 E->Idx < *GatheredLoadsEntriesFirst)) &&
13693 "Expected vector-like or non-GEP in GEP node insts only.");
13694 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13695 LastInst = I;
13696 continue;
13697 }
13698 if (!DT->isReachableFromEntry(I->getParent()))
13699 continue;
13700 auto *NodeA = DT->getNode(LastInst->getParent());
13701 auto *NodeB = DT->getNode(I->getParent());
13702 assert(NodeA && "Should only process reachable instructions");
13703 assert(NodeB && "Should only process reachable instructions");
13704 assert((NodeA == NodeB) ==
13705 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13706 "Different nodes should have different DFS numbers");
13707 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13708 LastInst = I;
13709 }
13710 BB = LastInst->getParent();
13711 return LastInst;
13712 };
13713
13714 auto FindFirstInst = [&]() {
13715 Instruction *FirstInst = Front;
13716 for (Value *V : E->Scalars) {
13717 auto *I = dyn_cast<Instruction>(V);
13718 if (!I)
13719 continue;
13720 if (FirstInst->getParent() == I->getParent()) {
13721 if (I->comesBefore(FirstInst))
13722 FirstInst = I;
13723 continue;
13724 }
13725 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13726 !isa<GetElementPtrInst>(I)) ||
13727 (isVectorLikeInstWithConstOps(FirstInst) &&
13729 "Expected vector-like or non-GEP in GEP node insts only.");
13730 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13731 FirstInst = I;
13732 continue;
13733 }
13734 if (!DT->isReachableFromEntry(I->getParent()))
13735 continue;
13736 auto *NodeA = DT->getNode(FirstInst->getParent());
13737 auto *NodeB = DT->getNode(I->getParent());
13738 assert(NodeA && "Should only process reachable instructions");
13739 assert(NodeB && "Should only process reachable instructions");
13740 assert((NodeA == NodeB) ==
13741 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13742 "Different nodes should have different DFS numbers");
13743 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13744 FirstInst = I;
13745 }
13746 return FirstInst;
13747 };
13748
13749 // Set insertpoint for gathered loads to the very first load.
13750 if (GatheredLoadsEntriesFirst.has_value() &&
13751 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13752 E->getOpcode() == Instruction::Load) {
13753 Res = FindFirstInst();
13754 return *Res;
13755 }
13756
13757 // Set the insert point to the beginning of the basic block if the entry
13758 // should not be scheduled.
13759 if (doesNotNeedToSchedule(E->Scalars) ||
13760 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13761 if ((E->getOpcode() == Instruction::GetElementPtr &&
13762 any_of(E->Scalars,
13763 [](Value *V) {
13764 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13765 })) ||
13766 all_of(E->Scalars,
13767 [](Value *V) {
13768 return isa<PoisonValue>(V) ||
13769 (!isVectorLikeInstWithConstOps(V) &&
13770 isUsedOutsideBlock(V));
13771 }) ||
13772 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13773 return isa<ExtractElementInst, UndefValue>(V) ||
13774 areAllOperandsNonInsts(V);
13775 })))
13776 Res = FindLastInst();
13777 else
13778 Res = FindFirstInst();
13779 return *Res;
13780 }
13781
13782 // Find the last instruction. The common case should be that BB has been
13783 // scheduled, and the last instruction is VL.back(). So we start with
13784 // VL.back() and iterate over schedule data until we reach the end of the
13785 // bundle. The end of the bundle is marked by null ScheduleData.
13786 if (BlocksSchedules.count(BB) && !E->isGather()) {
13787 Value *V = E->isOneOf(E->Scalars.back());
13789 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13790 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13791 if (Bundle && Bundle->isPartOfBundle())
13792 for (; Bundle; Bundle = Bundle->NextInBundle)
13793 Res = Bundle->Inst;
13794 }
13795
13796 // LastInst can still be null at this point if there's either not an entry
13797 // for BB in BlocksSchedules or there's no ScheduleData available for
13798 // VL.back(). This can be the case if buildTree_rec aborts for various
13799 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13800 // size is reached, etc.). ScheduleData is initialized in the scheduling
13801 // "dry-run".
13802 //
13803 // If this happens, we can still find the last instruction by brute force. We
13804 // iterate forwards from Front (inclusive) until we either see all
13805 // instructions in the bundle or reach the end of the block. If Front is the
13806 // last instruction in program order, LastInst will be set to Front, and we
13807 // will visit all the remaining instructions in the block.
13808 //
13809 // One of the reasons we exit early from buildTree_rec is to place an upper
13810 // bound on compile-time. Thus, taking an additional compile-time hit here is
13811 // not ideal. However, this should be exceedingly rare since it requires that
13812 // we both exit early from buildTree_rec and that the bundle be out-of-order
13813 // (causing us to iterate all the way to the end of the block).
13814 if (!Res)
13815 Res = FindLastInst();
13816 assert(Res && "Failed to find last instruction in bundle");
13817 return *Res;
13818}
13819
13820void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13821 auto *Front = E->getMainOp();
13822 Instruction *LastInst = &getLastInstructionInBundle(E);
13823 assert(LastInst && "Failed to find last instruction in bundle");
13824 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13825 // If the instruction is PHI, set the insert point after all the PHIs.
13826 bool IsPHI = isa<PHINode>(LastInst);
13827 if (IsPHI)
13828 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13829 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13830 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13831 } else {
13832 // Set the insertion point after the last instruction in the bundle. Set the
13833 // debug location to Front.
13834 Builder.SetInsertPoint(
13835 LastInst->getParent(),
13837 }
13838 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13839}
13840
13841Value *BoUpSLP::gather(
13842 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13843 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13844 // List of instructions/lanes from current block and/or the blocks which are
13845 // part of the current loop. These instructions will be inserted at the end to
13846 // make it possible to optimize loops and hoist invariant instructions out of
13847 // the loops body with better chances for success.
13849 SmallSet<int, 4> PostponedIndices;
13850 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13851 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13853 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13854 InsertBB = InsertBB->getSinglePredecessor();
13855 return InsertBB && InsertBB == InstBB;
13856 };
13857 for (int I = 0, E = VL.size(); I < E; ++I) {
13858 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13859 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13860 getTreeEntry(Inst) ||
13861 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13862 PostponedIndices.insert(I).second)
13863 PostponedInsts.emplace_back(Inst, I);
13864 }
13865
13866 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13867 Type *Ty) {
13868 Value *Scalar = V;
13869 if (Scalar->getType() != Ty) {
13870 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13871 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13872 Value *V = Scalar;
13873 if (auto *CI = dyn_cast<CastInst>(Scalar);
13874 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13875 Value *Op = CI->getOperand(0);
13876 if (auto *IOp = dyn_cast<Instruction>(Op);
13877 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13878 V = Op;
13879 }
13880 Scalar = Builder.CreateIntCast(
13881 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13882 }
13883
13884 Instruction *InsElt;
13885 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13886 assert(SLPReVec && "FixedVectorType is not expected.");
13887 Vec = InsElt = Builder.CreateInsertVector(
13888 Vec->getType(), Vec, Scalar,
13889 Builder.getInt64(Pos * VecTy->getNumElements()));
13890 auto *II = dyn_cast<IntrinsicInst>(InsElt);
13891 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13892 return Vec;
13893 } else {
13894 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13895 InsElt = dyn_cast<InsertElementInst>(Vec);
13896 if (!InsElt)
13897 return Vec;
13898 }
13899 GatherShuffleExtractSeq.insert(InsElt);
13900 CSEBlocks.insert(InsElt->getParent());
13901 // Add to our 'need-to-extract' list.
13902 if (isa<Instruction>(V)) {
13903 if (TreeEntry *Entry = getTreeEntry(V)) {
13904 // Find which lane we need to extract.
13905 User *UserOp = nullptr;
13906 if (Scalar != V) {
13907 if (auto *SI = dyn_cast<Instruction>(Scalar))
13908 UserOp = SI;
13909 } else {
13910 UserOp = InsElt;
13911 }
13912 if (UserOp) {
13913 unsigned FoundLane = Entry->findLaneForValue(V);
13914 ExternalUses.emplace_back(V, UserOp, FoundLane);
13915 }
13916 }
13917 }
13918 return Vec;
13919 };
13920 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13921 Value *Vec = PoisonValue::get(VecTy);
13922 SmallVector<int> NonConsts;
13924 std::iota(Mask.begin(), Mask.end(), 0);
13925 Value *OriginalRoot = Root;
13926 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13927 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13928 SV->getOperand(0)->getType() == VecTy) {
13929 Root = SV->getOperand(0);
13930 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
13931 }
13932 // Insert constant values at first.
13933 for (int I = 0, E = VL.size(); I < E; ++I) {
13934 if (PostponedIndices.contains(I))
13935 continue;
13936 if (!isConstant(VL[I])) {
13937 NonConsts.push_back(I);
13938 continue;
13939 }
13940 if (isa<PoisonValue>(VL[I]))
13941 continue;
13942 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13943 Mask[I] = I + E;
13944 }
13945 if (Root) {
13946 if (isa<PoisonValue>(Vec)) {
13947 Vec = OriginalRoot;
13948 } else {
13949 Vec = CreateShuffle(Root, Vec, Mask);
13950 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
13951 OI && OI->hasNUses(0) &&
13952 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13953 return TE->VectorizedValue == OI;
13954 }))
13955 eraseInstruction(OI);
13956 }
13957 }
13958 // Insert non-constant values.
13959 for (int I : NonConsts)
13960 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13961 // Append instructions, which are/may be part of the loop, in the end to make
13962 // it possible to hoist non-loop-based instructions.
13963 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
13964 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
13965
13966 return Vec;
13967}
13968
13969/// Merges shuffle masks and emits final shuffle instruction, if required. It
13970/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
13971/// when the actual shuffle instruction is generated only if this is actually
13972/// required. Otherwise, the shuffle instruction emission is delayed till the
13973/// end of the process, to reduce the number of emitted instructions and further
13974/// analysis/transformations.
13975/// The class also will look through the previously emitted shuffle instructions
13976/// and properly mark indices in mask as undef.
13977/// For example, given the code
13978/// \code
13979/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
13980/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
13981/// \endcode
13982/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
13983/// look through %s1 and %s2 and emit
13984/// \code
13985/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
13986/// \endcode
13987/// instead.
13988/// If 2 operands are of different size, the smallest one will be resized and
13989/// the mask recalculated properly.
13990/// For example, given the code
13991/// \code
13992/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
13993/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
13994/// \endcode
13995/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
13996/// look through %s1 and %s2 and emit
13997/// \code
13998/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
13999/// \endcode
14000/// instead.
14001class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14002 bool IsFinalized = false;
14003 /// Combined mask for all applied operands and masks. It is built during
14004 /// analysis and actual emission of shuffle vector instructions.
14005 SmallVector<int> CommonMask;
14006 /// List of operands for the shuffle vector instruction. It hold at max 2
14007 /// operands, if the 3rd is going to be added, the first 2 are combined into
14008 /// shuffle with \p CommonMask mask, the first operand sets to be the
14009 /// resulting shuffle and the second operand sets to be the newly added
14010 /// operand. The \p CommonMask is transformed in the proper way after that.
14011 SmallVector<Value *, 2> InVectors;
14012 IRBuilderBase &Builder;
14013 BoUpSLP &R;
14014
14015 class ShuffleIRBuilder {
14016 IRBuilderBase &Builder;
14017 /// Holds all of the instructions that we gathered.
14018 SetVector<Instruction *> &GatherShuffleExtractSeq;
14019 /// A list of blocks that we are going to CSE.
14020 DenseSet<BasicBlock *> &CSEBlocks;
14021 /// Data layout.
14022 const DataLayout &DL;
14023
14024 public:
14025 ShuffleIRBuilder(IRBuilderBase &Builder,
14026 SetVector<Instruction *> &GatherShuffleExtractSeq,
14027 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14028 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14029 CSEBlocks(CSEBlocks), DL(DL) {}
14030 ~ShuffleIRBuilder() = default;
14031 /// Creates shufflevector for the 2 operands with the given mask.
14032 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14033 if (V1->getType() != V2->getType()) {
14035 V1->getType()->isIntOrIntVectorTy() &&
14036 "Expected integer vector types only.");
14037 if (V1->getType() != V2->getType()) {
14038 if (cast<VectorType>(V2->getType())
14039 ->getElementType()
14040 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14041 ->getElementType()
14042 ->getIntegerBitWidth())
14043 V2 = Builder.CreateIntCast(
14044 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14045 else
14046 V1 = Builder.CreateIntCast(
14047 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14048 }
14049 }
14050 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14051 if (auto *I = dyn_cast<Instruction>(Vec)) {
14052 GatherShuffleExtractSeq.insert(I);
14053 CSEBlocks.insert(I->getParent());
14054 }
14055 return Vec;
14056 }
14057 /// Creates permutation of the single vector operand with the given mask, if
14058 /// it is not identity mask.
14059 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14060 if (Mask.empty())
14061 return V1;
14062 unsigned VF = Mask.size();
14063 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14064 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14065 return V1;
14066 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14067 if (auto *I = dyn_cast<Instruction>(Vec)) {
14068 GatherShuffleExtractSeq.insert(I);
14069 CSEBlocks.insert(I->getParent());
14070 }
14071 return Vec;
14072 }
14073 Value *createIdentity(Value *V) { return V; }
14074 Value *createPoison(Type *Ty, unsigned VF) {
14075 return PoisonValue::get(getWidenedType(Ty, VF));
14076 }
14077 /// Resizes 2 input vector to match the sizes, if the they are not equal
14078 /// yet. The smallest vector is resized to the size of the larger vector.
14079 void resizeToMatch(Value *&V1, Value *&V2) {
14080 if (V1->getType() == V2->getType())
14081 return;
14082 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14083 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14084 int VF = std::max(V1VF, V2VF);
14085 int MinVF = std::min(V1VF, V2VF);
14086 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14087 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14088 0);
14089 Value *&Op = MinVF == V1VF ? V1 : V2;
14090 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14091 if (auto *I = dyn_cast<Instruction>(Op)) {
14092 GatherShuffleExtractSeq.insert(I);
14093 CSEBlocks.insert(I->getParent());
14094 }
14095 if (MinVF == V1VF)
14096 V1 = Op;
14097 else
14098 V2 = Op;
14099 }
14100 };
14101
14102 /// Smart shuffle instruction emission, walks through shuffles trees and
14103 /// tries to find the best matching vector for the actual shuffle
14104 /// instruction.
14105 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14106 assert(V1 && "Expected at least one vector value.");
14107 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14108 R.CSEBlocks, *R.DL);
14109 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
14110 ShuffleBuilder);
14111 }
14112
14113 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
14114 /// shuffle emission.
14115 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
14116 ArrayRef<int> Mask) {
14117 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14118 if (Mask[Idx] != PoisonMaskElem)
14119 CommonMask[Idx] = Idx;
14120 }
14121
14122 /// Cast value \p V to the vector type with the same number of elements, but
14123 /// the base type \p ScalarTy.
14124 Value *castToScalarTyElem(Value *V,
14125 std::optional<bool> IsSigned = std::nullopt) {
14126 auto *VecTy = cast<VectorType>(V->getType());
14127 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14128 if (VecTy->getElementType() == ScalarTy->getScalarType())
14129 return V;
14130 return Builder.CreateIntCast(
14131 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14132 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14133 }
14134
14135public:
14137 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14138
14139 /// Adjusts extractelements after reusing them.
14140 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14141 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14142 unsigned NumParts, bool &UseVecBaseAsInput) {
14143 UseVecBaseAsInput = false;
14144 SmallPtrSet<Value *, 4> UniqueBases;
14145 Value *VecBase = nullptr;
14146 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14147 if (!E->ReorderIndices.empty()) {
14148 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14149 E->ReorderIndices.end());
14150 reorderScalars(VL, ReorderMask);
14151 }
14152 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14153 int Idx = Mask[I];
14154 if (Idx == PoisonMaskElem)
14155 continue;
14156 auto *EI = cast<ExtractElementInst>(VL[I]);
14157 VecBase = EI->getVectorOperand();
14158 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14159 VecBase = TE->VectorizedValue;
14160 assert(VecBase && "Expected vectorized value.");
14161 UniqueBases.insert(VecBase);
14162 // If the only one use is vectorized - can delete the extractelement
14163 // itself.
14164 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14165 (NumParts != 1 && count(VL, EI) > 1) ||
14166 any_of(EI->users(), [&](User *U) {
14167 const TreeEntry *UTE = R.getTreeEntry(U);
14168 return !UTE || R.MultiNodeScalars.contains(U) ||
14169 (isa<GetElementPtrInst>(U) &&
14170 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14171 count_if(R.VectorizableTree,
14172 [&](const std::unique_ptr<TreeEntry> &TE) {
14173 return any_of(TE->UserTreeIndices,
14174 [&](const EdgeInfo &Edge) {
14175 return Edge.UserTE == UTE;
14176 }) &&
14177 is_contained(VL, EI);
14178 }) != 1;
14179 }))
14180 continue;
14181 R.eraseInstruction(EI);
14182 }
14183 if (NumParts == 1 || UniqueBases.size() == 1) {
14184 assert(VecBase && "Expected vectorized value.");
14185 return castToScalarTyElem(VecBase);
14186 }
14187 UseVecBaseAsInput = true;
14188 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14189 for (auto [I, Idx] : enumerate(Mask))
14190 if (Idx != PoisonMaskElem)
14191 Idx = I;
14192 };
14193 // Perform multi-register vector shuffle, joining them into a single virtual
14194 // long vector.
14195 // Need to shuffle each part independently and then insert all this parts
14196 // into a long virtual vector register, forming the original vector.
14197 Value *Vec = nullptr;
14198 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14199 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14200 for (unsigned Part : seq<unsigned>(NumParts)) {
14201 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14202 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14203 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14204 constexpr int MaxBases = 2;
14205 SmallVector<Value *, MaxBases> Bases(MaxBases);
14206 auto VLMask = zip(SubVL, SubMask);
14207 const unsigned VF = std::accumulate(
14208 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14209 if (std::get<1>(D) == PoisonMaskElem)
14210 return S;
14211 Value *VecOp =
14212 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14213 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14214 VecOp = TE->VectorizedValue;
14215 assert(VecOp && "Expected vectorized value.");
14216 const unsigned Size =
14217 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14218 return std::max(S, Size);
14219 });
14220 for (const auto [V, I] : VLMask) {
14221 if (I == PoisonMaskElem)
14222 continue;
14223 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14224 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14225 VecOp = TE->VectorizedValue;
14226 assert(VecOp && "Expected vectorized value.");
14227 VecOp = castToScalarTyElem(VecOp);
14228 Bases[I / VF] = VecOp;
14229 }
14230 if (!Bases.front())
14231 continue;
14232 Value *SubVec;
14233 if (Bases.back()) {
14234 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14235 TransformToIdentity(SubMask);
14236 } else {
14237 SubVec = Bases.front();
14238 }
14239 if (!Vec) {
14240 Vec = SubVec;
14241 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14242 [&](unsigned P) {
14243 ArrayRef<int> SubMask =
14244 Mask.slice(P * SliceSize,
14245 getNumElems(Mask.size(),
14246 SliceSize, P));
14247 return all_of(SubMask, [](int Idx) {
14248 return Idx == PoisonMaskElem;
14249 });
14250 })) &&
14251 "Expected first part or all previous parts masked.");
14252 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14253 } else {
14254 unsigned NewVF =
14255 cast<FixedVectorType>(Vec->getType())->getNumElements();
14256 if (Vec->getType() != SubVec->getType()) {
14257 unsigned SubVecVF =
14258 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14259 NewVF = std::max(NewVF, SubVecVF);
14260 }
14261 // Adjust SubMask.
14262 for (int &Idx : SubMask)
14263 if (Idx != PoisonMaskElem)
14264 Idx += NewVF;
14265 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14266 Vec = createShuffle(Vec, SubVec, VecMask);
14267 TransformToIdentity(VecMask);
14268 }
14269 }
14270 copy(VecMask, Mask.begin());
14271 return Vec;
14272 }
14273 /// Checks if the specified entry \p E needs to be delayed because of its
14274 /// dependency nodes.
14275 std::optional<Value *>
14276 needToDelay(const TreeEntry *E,
14278 // No need to delay emission if all deps are ready.
14279 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14280 return all_of(
14281 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14282 }))
14283 return std::nullopt;
14284 // Postpone gather emission, will be emitted after the end of the
14285 // process to keep correct order.
14286 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14287 return Builder.CreateAlignedLoad(
14288 ResVecTy,
14290 MaybeAlign());
14291 }
14292 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14293 /// shuffling.
14294 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14295 Value *V1 = E1.VectorizedValue;
14296 if (V1->getType()->isIntOrIntVectorTy())
14297 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14298 if (isa<PoisonValue>(V))
14299 return false;
14300 return !isKnownNonNegative(
14301 V, SimplifyQuery(*R.DL));
14302 }));
14303 Value *V2 = E2.VectorizedValue;
14304 if (V2->getType()->isIntOrIntVectorTy())
14305 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14306 if (isa<PoisonValue>(V))
14307 return false;
14308 return !isKnownNonNegative(
14309 V, SimplifyQuery(*R.DL));
14310 }));
14311 add(V1, V2, Mask);
14312 }
14313 /// Adds single input vector (in form of tree entry) and the mask for its
14314 /// shuffling.
14315 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14316 Value *V1 = E1.VectorizedValue;
14317 if (V1->getType()->isIntOrIntVectorTy())
14318 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14319 if (isa<PoisonValue>(V))
14320 return false;
14321 return !isKnownNonNegative(
14322 V, SimplifyQuery(*R.DL));
14323 }));
14324 add(V1, Mask);
14325 }
14326 /// Adds 2 input vectors and the mask for their shuffling.
14327 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14328 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14329 assert(isa<FixedVectorType>(V1->getType()) &&
14330 isa<FixedVectorType>(V2->getType()) &&
14331 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14332 V1 = castToScalarTyElem(V1);
14333 V2 = castToScalarTyElem(V2);
14334 if (InVectors.empty()) {
14335 InVectors.push_back(V1);
14336 InVectors.push_back(V2);
14337 CommonMask.assign(Mask.begin(), Mask.end());
14338 return;
14339 }
14340 Value *Vec = InVectors.front();
14341 if (InVectors.size() == 2) {
14342 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14343 transformMaskAfterShuffle(CommonMask, CommonMask);
14344 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14345 Mask.size()) {
14346 Vec = createShuffle(Vec, nullptr, CommonMask);
14347 transformMaskAfterShuffle(CommonMask, CommonMask);
14348 }
14349 V1 = createShuffle(V1, V2, Mask);
14350 unsigned VF = std::max(getVF(V1), getVF(Vec));
14351 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14352 if (Mask[Idx] != PoisonMaskElem)
14353 CommonMask[Idx] = Idx + VF;
14354 InVectors.front() = Vec;
14355 if (InVectors.size() == 2)
14356 InVectors.back() = V1;
14357 else
14358 InVectors.push_back(V1);
14359 }
14360 /// Adds another one input vector and the mask for the shuffling.
14361 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14362 assert(isa<FixedVectorType>(V1->getType()) &&
14363 "castToScalarTyElem expects V1 to be FixedVectorType");
14364 V1 = castToScalarTyElem(V1);
14365 if (InVectors.empty()) {
14366 InVectors.push_back(V1);
14367 CommonMask.assign(Mask.begin(), Mask.end());
14368 return;
14369 }
14370 const auto *It = find(InVectors, V1);
14371 if (It == InVectors.end()) {
14372 if (InVectors.size() == 2 ||
14373 InVectors.front()->getType() != V1->getType()) {
14374 Value *V = InVectors.front();
14375 if (InVectors.size() == 2) {
14376 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14377 transformMaskAfterShuffle(CommonMask, CommonMask);
14378 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14379 CommonMask.size()) {
14380 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14381 transformMaskAfterShuffle(CommonMask, CommonMask);
14382 }
14383 unsigned VF = std::max(CommonMask.size(), Mask.size());
14384 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14385 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14386 CommonMask[Idx] =
14387 V->getType() != V1->getType()
14388 ? Idx + VF
14389 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14390 ->getNumElements();
14391 if (V->getType() != V1->getType())
14392 V1 = createShuffle(V1, nullptr, Mask);
14393 InVectors.front() = V;
14394 if (InVectors.size() == 2)
14395 InVectors.back() = V1;
14396 else
14397 InVectors.push_back(V1);
14398 return;
14399 }
14400 // Check if second vector is required if the used elements are already
14401 // used from the first one.
14402 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14403 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14404 InVectors.push_back(V1);
14405 break;
14406 }
14407 }
14408 int VF = getVF(V1);
14409 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14410 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14411 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14412 }
14413 /// Adds another one input vector and the mask for the shuffling.
14415 SmallVector<int> NewMask;
14416 inversePermutation(Order, NewMask);
14417 add(V1, NewMask);
14418 }
14419 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14420 Value *Root = nullptr) {
14421 return R.gather(VL, Root, ScalarTy,
14422 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14423 return createShuffle(V1, V2, Mask);
14424 });
14425 }
14426 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14427 /// Finalize emission of the shuffles.
14428 /// \param Action the action (if any) to be performed before final applying of
14429 /// the \p ExtMask mask.
14430 Value *
14432 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14433 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14434 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14435 IsFinalized = true;
14436 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14437 SmallVector<int> NewExtMask(ExtMask);
14438 if (ScalarTyNumElements != 1) {
14439 assert(SLPReVec && "FixedVectorType is not expected.");
14440 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14441 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14442 ExtMask = NewExtMask;
14443 }
14444 if (Action) {
14445 Value *Vec = InVectors.front();
14446 if (InVectors.size() == 2) {
14447 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14448 InVectors.pop_back();
14449 } else {
14450 Vec = createShuffle(Vec, nullptr, CommonMask);
14451 }
14452 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14453 if (CommonMask[Idx] != PoisonMaskElem)
14454 CommonMask[Idx] = Idx;
14455 assert(VF > 0 &&
14456 "Expected vector length for the final value before action.");
14457 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14458 if (VecVF < VF) {
14459 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14460 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14461 Vec = createShuffle(Vec, nullptr, ResizeMask);
14462 }
14463 Action(Vec, CommonMask);
14464 InVectors.front() = Vec;
14465 }
14466 if (!SubVectors.empty()) {
14467 Value *Vec = InVectors.front();
14468 if (InVectors.size() == 2) {
14469 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14470 InVectors.pop_back();
14471 } else {
14472 Vec = createShuffle(Vec, nullptr, CommonMask);
14473 }
14474 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14475 if (CommonMask[Idx] != PoisonMaskElem)
14476 CommonMask[Idx] = Idx;
14477 auto CreateSubVectors = [&](Value *Vec,
14478 SmallVectorImpl<int> &CommonMask) {
14479 for (auto [E, Idx] : SubVectors) {
14480 Value *V = E->VectorizedValue;
14481 if (V->getType()->isIntOrIntVectorTy())
14482 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14483 if (isa<PoisonValue>(V))
14484 return false;
14485 return !isKnownNonNegative(
14486 V, SimplifyQuery(*R.DL));
14487 }));
14488 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14489 const unsigned SubVecVF =
14490 cast<FixedVectorType>(V->getType())->getNumElements();
14491 if (InsertionIndex % SubVecVF == 0) {
14492 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
14493 Builder.getInt64(InsertionIndex));
14494 } else {
14495 // Create shuffle, insertvector requires that index is multiple of
14496 // the subvectors length.
14497 const unsigned VecVF =
14498 cast<FixedVectorType>(Vec->getType())->getNumElements();
14500 std::iota(Mask.begin(), Mask.end(), 0);
14501 for (unsigned I : seq<unsigned>(
14502 InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
14503 Mask[I] = I - Idx + VecVF;
14504 Vec = createShuffle(Vec, V, Mask);
14505 }
14506 if (!CommonMask.empty()) {
14507 std::iota(
14508 std::next(CommonMask.begin(), InsertionIndex),
14509 std::next(CommonMask.begin(),
14510 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14511 InsertionIndex);
14512 }
14513 }
14514 return Vec;
14515 };
14516 if (SubVectorsMask.empty()) {
14517 Vec = CreateSubVectors(Vec, CommonMask);
14518 } else {
14519 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14520 copy(SubVectorsMask, SVMask.begin());
14521 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14522 if (I2 != PoisonMaskElem) {
14523 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14524 I1 = I2 + CommonMask.size();
14525 }
14526 }
14527 Value *InsertVec =
14528 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14529 Vec = createShuffle(InsertVec, Vec, SVMask);
14530 for (unsigned I : seq<unsigned>(CommonMask.size())) {
14531 if (SVMask[I] != PoisonMaskElem)
14532 CommonMask[I] = I;
14533 }
14534 }
14535 InVectors.front() = Vec;
14536 }
14537
14538 if (!ExtMask.empty()) {
14539 if (CommonMask.empty()) {
14540 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14541 } else {
14542 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14543 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14544 if (ExtMask[I] == PoisonMaskElem)
14545 continue;
14546 NewMask[I] = CommonMask[ExtMask[I]];
14547 }
14548 CommonMask.swap(NewMask);
14549 }
14550 }
14551 if (CommonMask.empty()) {
14552 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14553 return InVectors.front();
14554 }
14555 if (InVectors.size() == 2)
14556 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14557 return createShuffle(InVectors.front(), nullptr, CommonMask);
14558 }
14559
14561 assert((IsFinalized || CommonMask.empty()) &&
14562 "Shuffle construction must be finalized.");
14563 }
14564};
14565
14566BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14567 unsigned NodeIdx) {
14568 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14569 InstructionsState S = getSameOpcode(VL, *TLI);
14570 // Special processing for GEPs bundle, which may include non-gep values.
14571 if (!S && VL.front()->getType()->isPointerTy()) {
14572 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14573 if (It != VL.end())
14574 S = getSameOpcode(*It, *TLI);
14575 }
14576 if (!S)
14577 return nullptr;
14578 auto CheckSameVE = [&](const TreeEntry *VE) {
14579 return VE->isSame(VL) &&
14580 (any_of(VE->UserTreeIndices,
14581 [E, NodeIdx](const EdgeInfo &EI) {
14582 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14583 }) ||
14584 any_of(VectorizableTree,
14585 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14586 return TE->isOperandGatherNode(
14587 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14588 VE->isSame(TE->Scalars);
14589 }));
14590 };
14591 TreeEntry *VE = getTreeEntry(S.getMainOp());
14592 if (VE && CheckSameVE(VE))
14593 return VE;
14594 auto It = MultiNodeScalars.find(S.getMainOp());
14595 if (It != MultiNodeScalars.end()) {
14596 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14597 return TE != VE && CheckSameVE(TE);
14598 });
14599 if (I != It->getSecond().end())
14600 return *I;
14601 }
14602 return nullptr;
14603}
14604
14605Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14606 bool PostponedPHIs) {
14607 ValueList &VL = E->getOperand(NodeIdx);
14608 const unsigned VF = VL.size();
14609 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14610 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14611 // V may be affected by MinBWs.
14612 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14613 // factor is the number of elements, not their type.
14614 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14615 unsigned NumElements = getNumElements(VL.front()->getType());
14616 ShuffleInstructionBuilder ShuffleBuilder(
14617 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14618 : ScalarTy,
14619 Builder, *this);
14620 ShuffleBuilder.add(V, Mask);
14622 E->CombinedEntriesWithIndices.size());
14623 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14624 [&](const auto &P) {
14625 return std::make_pair(VectorizableTree[P.first].get(),
14626 P.second);
14627 });
14628 assert((E->CombinedEntriesWithIndices.empty() ||
14629 E->ReorderIndices.empty()) &&
14630 "Expected either combined subnodes or reordering");
14631 return ShuffleBuilder.finalize({}, SubVectors, {});
14632 };
14633 Value *V = vectorizeTree(VE, PostponedPHIs);
14634 if (VF * getNumElements(VL[0]->getType()) !=
14635 cast<FixedVectorType>(V->getType())->getNumElements()) {
14636 if (!VE->ReuseShuffleIndices.empty()) {
14637 // Reshuffle to get only unique values.
14638 // If some of the scalars are duplicated in the vectorization
14639 // tree entry, we do not vectorize them but instead generate a
14640 // mask for the reuses. But if there are several users of the
14641 // same entry, they may have different vectorization factors.
14642 // This is especially important for PHI nodes. In this case, we
14643 // need to adapt the resulting instruction for the user
14644 // vectorization factor and have to reshuffle it again to take
14645 // only unique elements of the vector. Without this code the
14646 // function incorrectly returns reduced vector instruction with
14647 // the same elements, not with the unique ones.
14648
14649 // block:
14650 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14651 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14652 // ... (use %2)
14653 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14654 // br %block
14656 for (auto [I, V] : enumerate(VL)) {
14657 if (isa<PoisonValue>(V))
14658 continue;
14659 Mask[I] = VE->findLaneForValue(V);
14660 }
14661 V = FinalShuffle(V, Mask);
14662 } else {
14663 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14664 "Expected vectorization factor less "
14665 "than original vector size.");
14666 SmallVector<int> UniformMask(VF, 0);
14667 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14668 V = FinalShuffle(V, UniformMask);
14669 }
14670 }
14671 // Need to update the operand gather node, if actually the operand is not a
14672 // vectorized node, but the buildvector/gather node, which matches one of
14673 // the vectorized nodes.
14674 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14675 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14676 }) == VE->UserTreeIndices.end()) {
14677 auto *It =
14678 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14679 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14680 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14681 });
14682 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14683 (*It)->VectorizedValue = V;
14684 }
14685 return V;
14686 }
14687
14688 // Find the corresponding gather entry and vectorize it.
14689 // Allows to be more accurate with tree/graph transformations, checks for the
14690 // correctness of the transformations in many cases.
14691 auto *I = find_if(VectorizableTree,
14692 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14693 return TE->isOperandGatherNode({E, NodeIdx});
14694 });
14695 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14696 assert(I->get()->UserTreeIndices.size() == 1 &&
14697 "Expected only single user for the gather node.");
14698 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14699 return vectorizeTree(I->get(), PostponedPHIs);
14700}
14701
14702template <typename BVTy, typename ResTy, typename... Args>
14703ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14704 Args &...Params) {
14705 assert(E->isGather() && "Expected gather node.");
14706 unsigned VF = E->getVectorFactor();
14707
14708 bool NeedFreeze = false;
14709 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14710 E->ReuseShuffleIndices.end());
14711 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14712 // Clear values, to be replaced by insertvector instructions.
14713 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14714 for_each(MutableArrayRef(GatheredScalars)
14715 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14716 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14718 E->CombinedEntriesWithIndices.size());
14719 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14720 [&](const auto &P) {
14721 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14722 });
14723 // Build a mask out of the reorder indices and reorder scalars per this
14724 // mask.
14725 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14726 E->ReorderIndices.end());
14727 if (!ReorderMask.empty())
14728 reorderScalars(GatheredScalars, ReorderMask);
14729 SmallVector<int> SubVectorsMask;
14730 inversePermutation(E->ReorderIndices, SubVectorsMask);
14731 // Transform non-clustered elements in the mask to poison (-1).
14732 // "Clustered" operations will be reordered using this mask later.
14733 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14734 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14735 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14736 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14737 } else {
14738 SubVectorsMask.clear();
14739 }
14740 SmallVector<Value *> StoredGS(GatheredScalars);
14741 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14742 unsigned I, unsigned SliceSize,
14743 bool IsNotPoisonous) {
14744 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14745 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14746 }))
14747 return false;
14748 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14749 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14750 if (UserTE->getNumOperands() != 2)
14751 return false;
14752 if (!IsNotPoisonous) {
14753 auto *It =
14754 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14755 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14756 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14757 }) != TE->UserTreeIndices.end();
14758 });
14759 if (It == VectorizableTree.end())
14760 return false;
14761 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14762 if (!(*It)->ReorderIndices.empty()) {
14763 inversePermutation((*It)->ReorderIndices, ReorderMask);
14764 reorderScalars(GS, ReorderMask);
14765 }
14766 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14767 Value *V0 = std::get<0>(P);
14768 Value *V1 = std::get<1>(P);
14769 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14770 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14771 is_contained(E->Scalars, V1));
14772 }))
14773 return false;
14774 }
14775 int Idx;
14776 if ((Mask.size() < InputVF &&
14778 Idx == 0) ||
14779 (Mask.size() == InputVF &&
14780 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14781 std::iota(
14782 std::next(Mask.begin(), I * SliceSize),
14783 std::next(Mask.begin(),
14784 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14785 0);
14786 } else {
14787 unsigned IVal =
14788 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14789 std::fill(
14790 std::next(Mask.begin(), I * SliceSize),
14791 std::next(Mask.begin(),
14792 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14793 IVal);
14794 }
14795 return true;
14796 };
14797 BVTy ShuffleBuilder(ScalarTy, Params...);
14798 ResTy Res = ResTy();
14800 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14802 Value *ExtractVecBase = nullptr;
14803 bool UseVecBaseAsInput = false;
14806 Type *OrigScalarTy = GatheredScalars.front()->getType();
14807 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14808 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14809 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14810 VecTy->getNumElements() % NumParts != 0 ||
14812 VecTy->getNumElements() / NumParts))
14813 NumParts = 1;
14814 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14815 // Check for gathered extracts.
14816 bool Resized = false;
14817 ExtractShuffles =
14818 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14819 if (!ExtractShuffles.empty()) {
14820 SmallVector<const TreeEntry *> ExtractEntries;
14821 for (auto [Idx, I] : enumerate(ExtractMask)) {
14822 if (I == PoisonMaskElem)
14823 continue;
14824 if (const auto *TE = getTreeEntry(
14825 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14826 ExtractEntries.push_back(TE);
14827 }
14828 if (std::optional<ResTy> Delayed =
14829 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14830 // Delay emission of gathers which are not ready yet.
14831 PostponedGathers.insert(E);
14832 // Postpone gather emission, will be emitted after the end of the
14833 // process to keep correct order.
14834 return *Delayed;
14835 }
14836 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14837 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14838 ExtractVecBase = VecBase;
14839 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14840 if (VF == VecBaseTy->getNumElements() &&
14841 GatheredScalars.size() != VF) {
14842 Resized = true;
14843 GatheredScalars.append(VF - GatheredScalars.size(),
14844 PoisonValue::get(OrigScalarTy));
14845 }
14846 }
14847 }
14848 // Gather extracts after we check for full matched gathers only.
14849 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
14850 ((E->getOpcode() == Instruction::Load ||
14851 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14852 any_of(E->Scalars,
14853 [this](Value *V) {
14854 return isa<LoadInst>(V) && getTreeEntry(V);
14855 })) ||
14856 E->isAltShuffle() ||
14857 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14858 isSplat(E->Scalars) ||
14859 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14860 GatherShuffles =
14861 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14862 }
14863 if (!GatherShuffles.empty()) {
14864 if (std::optional<ResTy> Delayed =
14865 ShuffleBuilder.needToDelay(E, Entries)) {
14866 // Delay emission of gathers which are not ready yet.
14867 PostponedGathers.insert(E);
14868 // Postpone gather emission, will be emitted after the end of the
14869 // process to keep correct order.
14870 return *Delayed;
14871 }
14872 if (GatherShuffles.size() == 1 &&
14873 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14874 Entries.front().front()->isSame(E->Scalars)) {
14875 // Perfect match in the graph, will reuse the previously vectorized
14876 // node. Cost is 0.
14877 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14878 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14879 // Restore the mask for previous partially matched values.
14880 Mask.resize(E->Scalars.size());
14881 const TreeEntry *FrontTE = Entries.front().front();
14882 if (FrontTE->ReorderIndices.empty() &&
14883 ((FrontTE->ReuseShuffleIndices.empty() &&
14884 E->Scalars.size() == FrontTE->Scalars.size()) ||
14885 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14886 std::iota(Mask.begin(), Mask.end(), 0);
14887 } else {
14888 for (auto [I, V] : enumerate(E->Scalars)) {
14889 if (isa<PoisonValue>(V)) {
14891 continue;
14892 }
14893 Mask[I] = FrontTE->findLaneForValue(V);
14894 }
14895 }
14896 ShuffleBuilder.add(*FrontTE, Mask);
14897 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
14898 SubVectorsMask);
14899 return Res;
14900 }
14901 if (!Resized) {
14902 if (GatheredScalars.size() != VF &&
14903 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14904 return any_of(TEs, [&](const TreeEntry *TE) {
14905 return TE->getVectorFactor() == VF;
14906 });
14907 }))
14908 GatheredScalars.append(VF - GatheredScalars.size(),
14909 PoisonValue::get(OrigScalarTy));
14910 }
14911 // Remove shuffled elements from list of gathers.
14912 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14913 if (Mask[I] != PoisonMaskElem)
14914 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14915 }
14916 }
14917 }
14918 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14919 SmallVectorImpl<int> &ReuseMask,
14920 bool IsRootPoison) {
14921 // For splats with can emit broadcasts instead of gathers, so try to find
14922 // such sequences.
14923 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14924 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14925 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14926 SmallVector<int> UndefPos;
14927 DenseMap<Value *, unsigned> UniquePositions;
14928 // Gather unique non-const values and all constant values.
14929 // For repeated values, just shuffle them.
14930 int NumNonConsts = 0;
14931 int SinglePos = 0;
14932 for (auto [I, V] : enumerate(Scalars)) {
14933 if (isa<UndefValue>(V)) {
14934 if (!isa<PoisonValue>(V)) {
14935 ReuseMask[I] = I;
14936 UndefPos.push_back(I);
14937 }
14938 continue;
14939 }
14940 if (isConstant(V)) {
14941 ReuseMask[I] = I;
14942 continue;
14943 }
14944 ++NumNonConsts;
14945 SinglePos = I;
14946 Value *OrigV = V;
14947 Scalars[I] = PoisonValue::get(OrigScalarTy);
14948 if (IsSplat) {
14949 Scalars.front() = OrigV;
14950 ReuseMask[I] = 0;
14951 } else {
14952 const auto Res = UniquePositions.try_emplace(OrigV, I);
14953 Scalars[Res.first->second] = OrigV;
14954 ReuseMask[I] = Res.first->second;
14955 }
14956 }
14957 if (NumNonConsts == 1) {
14958 // Restore single insert element.
14959 if (IsSplat) {
14960 ReuseMask.assign(VF, PoisonMaskElem);
14961 std::swap(Scalars.front(), Scalars[SinglePos]);
14962 if (!UndefPos.empty() && UndefPos.front() == 0)
14963 Scalars.front() = UndefValue::get(OrigScalarTy);
14964 }
14965 ReuseMask[SinglePos] = SinglePos;
14966 } else if (!UndefPos.empty() && IsSplat) {
14967 // For undef values, try to replace them with the simple broadcast.
14968 // We can do it if the broadcasted value is guaranteed to be
14969 // non-poisonous, or by freezing the incoming scalar value first.
14970 auto *It = find_if(Scalars, [this, E](Value *V) {
14971 return !isa<UndefValue>(V) &&
14972 (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
14973 (E->UserTreeIndices.size() == 1 &&
14974 any_of(V->uses(), [E](const Use &U) {
14975 // Check if the value already used in the same operation in
14976 // one of the nodes already.
14977 return E->UserTreeIndices.front().EdgeIdx !=
14978 U.getOperandNo() &&
14979 is_contained(
14980 E->UserTreeIndices.front().UserTE->Scalars,
14981 U.getUser());
14982 })));
14983 });
14984 if (It != Scalars.end()) {
14985 // Replace undefs by the non-poisoned scalars and emit broadcast.
14986 int Pos = std::distance(Scalars.begin(), It);
14987 for (int I : UndefPos) {
14988 // Set the undef position to the non-poisoned scalar.
14989 ReuseMask[I] = Pos;
14990 // Replace the undef by the poison, in the mask it is replaced by
14991 // non-poisoned scalar already.
14992 if (I != Pos)
14993 Scalars[I] = PoisonValue::get(OrigScalarTy);
14994 }
14995 } else {
14996 // Replace undefs by the poisons, emit broadcast and then emit
14997 // freeze.
14998 for (int I : UndefPos) {
14999 ReuseMask[I] = PoisonMaskElem;
15000 if (isa<UndefValue>(Scalars[I]))
15001 Scalars[I] = PoisonValue::get(OrigScalarTy);
15002 }
15003 NeedFreeze = true;
15004 }
15005 }
15006 };
15007 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15008 bool IsNonPoisoned = true;
15009 bool IsUsedInExpr = true;
15010 Value *Vec1 = nullptr;
15011 if (!ExtractShuffles.empty()) {
15012 // Gather of extractelements can be represented as just a shuffle of
15013 // a single/two vectors the scalars are extracted from.
15014 // Find input vectors.
15015 Value *Vec2 = nullptr;
15016 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15017 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15018 ExtractMask[I] = PoisonMaskElem;
15019 }
15020 if (UseVecBaseAsInput) {
15021 Vec1 = ExtractVecBase;
15022 } else {
15023 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15024 if (ExtractMask[I] == PoisonMaskElem)
15025 continue;
15026 if (isa<UndefValue>(E->Scalars[I]))
15027 continue;
15028 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15029 Value *VecOp = EI->getVectorOperand();
15030 if (const auto *TE = getTreeEntry(VecOp))
15031 if (TE->VectorizedValue)
15032 VecOp = TE->VectorizedValue;
15033 if (!Vec1) {
15034 Vec1 = VecOp;
15035 } else if (Vec1 != VecOp) {
15036 assert((!Vec2 || Vec2 == VecOp) &&
15037 "Expected only 1 or 2 vectors shuffle.");
15038 Vec2 = VecOp;
15039 }
15040 }
15041 }
15042 if (Vec2) {
15043 IsUsedInExpr = false;
15044 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15045 isGuaranteedNotToBePoison(Vec2, AC);
15046 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15047 } else if (Vec1) {
15048 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15049 IsUsedInExpr &= FindReusedSplat(
15050 ExtractMask,
15051 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15052 ExtractMask.size(), IsNotPoisonedVec);
15053 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15054 IsNonPoisoned &= IsNotPoisonedVec;
15055 } else {
15056 IsUsedInExpr = false;
15057 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15058 /*ForExtracts=*/true);
15059 }
15060 }
15061 if (!GatherShuffles.empty()) {
15062 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15063 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15064 for (const auto [I, TEs] : enumerate(Entries)) {
15065 if (TEs.empty()) {
15066 assert(!GatherShuffles[I] &&
15067 "No shuffles with empty entries list expected.");
15068 continue;
15069 }
15070 assert((TEs.size() == 1 || TEs.size() == 2) &&
15071 "Expected shuffle of 1 or 2 entries.");
15072 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15073 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15074 VecMask.assign(VecMask.size(), PoisonMaskElem);
15075 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15076 if (TEs.size() == 1) {
15077 bool IsNotPoisonedVec =
15078 TEs.front()->VectorizedValue
15079 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15080 : true;
15081 IsUsedInExpr &=
15082 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15083 SliceSize, IsNotPoisonedVec);
15084 ShuffleBuilder.add(*TEs.front(), VecMask);
15085 IsNonPoisoned &= IsNotPoisonedVec;
15086 } else {
15087 IsUsedInExpr = false;
15088 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15089 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15090 IsNonPoisoned &=
15091 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15092 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15093 }
15094 }
15095 }
15096 // Try to figure out best way to combine values: build a shuffle and insert
15097 // elements or just build several shuffles.
15098 // Insert non-constant scalars.
15099 SmallVector<Value *> NonConstants(GatheredScalars);
15100 int EMSz = ExtractMask.size();
15101 int MSz = Mask.size();
15102 // Try to build constant vector and shuffle with it only if currently we
15103 // have a single permutation and more than 1 scalar constants.
15104 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15105 bool IsIdentityShuffle =
15106 ((UseVecBaseAsInput ||
15107 all_of(ExtractShuffles,
15108 [](const std::optional<TTI::ShuffleKind> &SK) {
15109 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15111 })) &&
15112 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15113 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15114 (!GatherShuffles.empty() &&
15115 all_of(GatherShuffles,
15116 [](const std::optional<TTI::ShuffleKind> &SK) {
15117 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15119 }) &&
15120 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15122 bool EnoughConstsForShuffle =
15123 IsSingleShuffle &&
15124 (none_of(GatheredScalars,
15125 [](Value *V) {
15126 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15127 }) ||
15128 any_of(GatheredScalars,
15129 [](Value *V) {
15130 return isa<Constant>(V) && !isa<UndefValue>(V);
15131 })) &&
15132 (!IsIdentityShuffle ||
15133 (GatheredScalars.size() == 2 &&
15134 any_of(GatheredScalars,
15135 [](Value *V) { return !isa<UndefValue>(V); })) ||
15136 count_if(GatheredScalars, [](Value *V) {
15137 return isa<Constant>(V) && !isa<PoisonValue>(V);
15138 }) > 1);
15139 // NonConstants array contains just non-constant values, GatheredScalars
15140 // contains only constant to build final vector and then shuffle.
15141 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15142 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15143 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15144 else
15145 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15146 }
15147 // Generate constants for final shuffle and build a mask for them.
15148 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15149 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15150 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15151 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15152 ShuffleBuilder.add(BV, BVMask);
15153 }
15154 if (all_of(NonConstants, [=](Value *V) {
15155 return isa<PoisonValue>(V) ||
15156 (IsSingleShuffle && ((IsIdentityShuffle &&
15157 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15158 }))
15159 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15160 SubVectorsMask);
15161 else
15162 Res = ShuffleBuilder.finalize(
15163 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15164 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15165 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15166 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15167 });
15168 } else if (!allConstant(GatheredScalars)) {
15169 // Gather unique scalars and all constants.
15170 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15171 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15172 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15173 ShuffleBuilder.add(BV, ReuseMask);
15174 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15175 SubVectorsMask);
15176 } else {
15177 // Gather all constants.
15178 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15179 for (auto [I, V] : enumerate(GatheredScalars)) {
15180 if (!isa<PoisonValue>(V))
15181 Mask[I] = I;
15182 }
15183 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15184 ShuffleBuilder.add(BV, Mask);
15185 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15186 SubVectorsMask);
15187 }
15188
15189 if (NeedFreeze)
15190 Res = ShuffleBuilder.createFreeze(Res);
15191 return Res;
15192}
15193
15194Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15195 bool PostponedPHIs) {
15196 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15197 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15198 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15199 Builder, *this);
15200}
15201
15202/// \returns \p I after propagating metadata from \p VL only for instructions in
15203/// \p VL.
15206 for (Value *V : VL)
15207 if (isa<Instruction>(V))
15208 Insts.push_back(V);
15209 return llvm::propagateMetadata(Inst, Insts);
15210}
15211
15212Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15213 IRBuilderBase::InsertPointGuard Guard(Builder);
15214
15215 if (E->VectorizedValue &&
15216 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15217 E->isAltShuffle())) {
15218 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15219 return E->VectorizedValue;
15220 }
15221
15222 Value *V = E->Scalars.front();
15223 Type *ScalarTy = V->getType();
15224 if (!isa<CmpInst>(V))
15225 ScalarTy = getValueType(V);
15226 auto It = MinBWs.find(E);
15227 if (It != MinBWs.end()) {
15228 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15229 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15230 if (VecTy)
15231 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15232 }
15233 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15234 if (E->isGather()) {
15235 // Set insert point for non-reduction initial nodes.
15236 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
15237 setInsertPointAfterBundle(E);
15238 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15239 E->VectorizedValue = Vec;
15240 return Vec;
15241 }
15242
15243 bool IsReverseOrder =
15244 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
15245 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15246 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15247 if (E->getOpcode() == Instruction::Store &&
15248 E->State == TreeEntry::Vectorize) {
15250 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15251 E->ReorderIndices.size());
15252 ShuffleBuilder.add(V, Mask);
15253 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15254 ShuffleBuilder.addOrdered(V, {});
15255 } else {
15256 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15257 }
15259 E->CombinedEntriesWithIndices.size());
15260 transform(
15261 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15262 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15263 });
15264 assert(
15265 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15266 "Expected either combined subnodes or reordering");
15267 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15268 };
15269
15270 assert(!E->isGather() && "Unhandled state");
15271 unsigned ShuffleOrOp =
15272 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15273 Instruction *VL0 = E->getMainOp();
15274 auto GetOperandSignedness = [&](unsigned Idx) {
15275 const TreeEntry *OpE = getOperandEntry(E, Idx);
15276 bool IsSigned = false;
15277 auto It = MinBWs.find(OpE);
15278 if (It != MinBWs.end())
15279 IsSigned = It->second.second;
15280 else
15281 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15282 if (isa<PoisonValue>(V))
15283 return false;
15284 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15285 });
15286 return IsSigned;
15287 };
15288 switch (ShuffleOrOp) {
15289 case Instruction::PHI: {
15290 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15291 E != VectorizableTree.front().get() ||
15292 !E->UserTreeIndices.empty()) &&
15293 "PHI reordering is free.");
15294 if (PostponedPHIs && E->VectorizedValue)
15295 return E->VectorizedValue;
15296 auto *PH = cast<PHINode>(VL0);
15297 Builder.SetInsertPoint(PH->getParent(),
15298 PH->getParent()->getFirstNonPHIIt());
15299 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15300 if (PostponedPHIs || !E->VectorizedValue) {
15301 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15302 E->PHI = NewPhi;
15303 Value *V = NewPhi;
15304
15305 // Adjust insertion point once all PHI's have been generated.
15306 Builder.SetInsertPoint(PH->getParent(),
15307 PH->getParent()->getFirstInsertionPt());
15308 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15309
15310 V = FinalShuffle(V, E);
15311
15312 E->VectorizedValue = V;
15313 if (PostponedPHIs)
15314 return V;
15315 }
15316 PHINode *NewPhi = cast<PHINode>(E->PHI);
15317 // If phi node is fully emitted - exit.
15318 if (NewPhi->getNumIncomingValues() != 0)
15319 return NewPhi;
15320
15321 // PHINodes may have multiple entries from the same block. We want to
15322 // visit every block once.
15324
15325 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15327 BasicBlock *IBB = PH->getIncomingBlock(I);
15328
15329 // Stop emission if all incoming values are generated.
15330 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15331 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15332 return NewPhi;
15333 }
15334
15335 if (!VisitedBBs.insert(IBB).second) {
15336 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15337 continue;
15338 }
15339
15340 Builder.SetInsertPoint(IBB->getTerminator());
15341 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15342 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15343 if (VecTy != Vec->getType()) {
15344 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15345 MinBWs.contains(getOperandEntry(E, I))) &&
15346 "Expected item in MinBWs.");
15347 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15348 }
15349 NewPhi->addIncoming(Vec, IBB);
15350 }
15351
15352 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15353 "Invalid number of incoming values");
15354 assert(E->VectorizedValue && "Expected vectorized value.");
15355 return E->VectorizedValue;
15356 }
15357
15358 case Instruction::ExtractElement: {
15359 Value *V = E->getSingleOperand(0);
15360 if (const TreeEntry *TE = getTreeEntry(V))
15361 V = TE->VectorizedValue;
15362 setInsertPointAfterBundle(E);
15363 V = FinalShuffle(V, E);
15364 E->VectorizedValue = V;
15365 return V;
15366 }
15367 case Instruction::ExtractValue: {
15368 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15369 Builder.SetInsertPoint(LI);
15370 Value *Ptr = LI->getPointerOperand();
15371 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15372 Value *NewV = ::propagateMetadata(V, E->Scalars);
15373 NewV = FinalShuffle(NewV, E);
15374 E->VectorizedValue = NewV;
15375 return NewV;
15376 }
15377 case Instruction::InsertElement: {
15378 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15379 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15380 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15381 ArrayRef<Value *> Op = E->getOperand(1);
15382 Type *ScalarTy = Op.front()->getType();
15383 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15384 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15385 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15386 assert(Res.first > 0 && "Expected item in MinBWs.");
15387 V = Builder.CreateIntCast(
15388 V,
15390 ScalarTy,
15391 cast<FixedVectorType>(V->getType())->getNumElements()),
15392 Res.second);
15393 }
15394
15395 // Create InsertVector shuffle if necessary
15396 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15397 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15398 }));
15399 const unsigned NumElts =
15400 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15401 const unsigned NumScalars = E->Scalars.size();
15402
15403 unsigned Offset = *getElementIndex(VL0);
15404 assert(Offset < NumElts && "Failed to find vector index offset");
15405
15406 // Create shuffle to resize vector
15408 if (!E->ReorderIndices.empty()) {
15409 inversePermutation(E->ReorderIndices, Mask);
15410 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15411 } else {
15412 Mask.assign(NumElts, PoisonMaskElem);
15413 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15414 }
15415 // Create InsertVector shuffle if necessary
15416 bool IsIdentity = true;
15417 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15418 Mask.swap(PrevMask);
15419 for (unsigned I = 0; I < NumScalars; ++I) {
15420 Value *Scalar = E->Scalars[PrevMask[I]];
15421 unsigned InsertIdx = *getElementIndex(Scalar);
15422 IsIdentity &= InsertIdx - Offset == I;
15423 Mask[InsertIdx - Offset] = I;
15424 }
15425 if (!IsIdentity || NumElts != NumScalars) {
15426 Value *V2 = nullptr;
15427 bool IsVNonPoisonous =
15429 SmallVector<int> InsertMask(Mask);
15430 if (NumElts != NumScalars && Offset == 0) {
15431 // Follow all insert element instructions from the current buildvector
15432 // sequence.
15433 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15434 do {
15435 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15436 if (!InsertIdx)
15437 break;
15438 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15439 InsertMask[*InsertIdx] = *InsertIdx;
15440 if (!Ins->hasOneUse())
15441 break;
15442 Ins = dyn_cast_or_null<InsertElementInst>(
15443 Ins->getUniqueUndroppableUser());
15444 } while (Ins);
15445 SmallBitVector UseMask =
15446 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15447 SmallBitVector IsFirstPoison =
15448 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15449 SmallBitVector IsFirstUndef =
15450 isUndefVector(FirstInsert->getOperand(0), UseMask);
15451 if (!IsFirstPoison.all()) {
15452 unsigned Idx = 0;
15453 for (unsigned I = 0; I < NumElts; I++) {
15454 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15455 IsFirstUndef.test(I)) {
15456 if (IsVNonPoisonous) {
15457 InsertMask[I] = I < NumScalars ? I : 0;
15458 continue;
15459 }
15460 if (!V2)
15461 V2 = UndefValue::get(V->getType());
15462 if (Idx >= NumScalars)
15463 Idx = NumScalars - 1;
15464 InsertMask[I] = NumScalars + Idx;
15465 ++Idx;
15466 } else if (InsertMask[I] != PoisonMaskElem &&
15467 Mask[I] == PoisonMaskElem) {
15468 InsertMask[I] = PoisonMaskElem;
15469 }
15470 }
15471 } else {
15472 InsertMask = Mask;
15473 }
15474 }
15475 if (!V2)
15476 V2 = PoisonValue::get(V->getType());
15477 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15478 if (auto *I = dyn_cast<Instruction>(V)) {
15479 GatherShuffleExtractSeq.insert(I);
15480 CSEBlocks.insert(I->getParent());
15481 }
15482 }
15483
15484 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15485 for (unsigned I = 0; I < NumElts; I++) {
15486 if (Mask[I] != PoisonMaskElem)
15487 InsertMask[Offset + I] = I;
15488 }
15489 SmallBitVector UseMask =
15490 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15491 SmallBitVector IsFirstUndef =
15492 isUndefVector(FirstInsert->getOperand(0), UseMask);
15493 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15494 NumElts != NumScalars) {
15495 if (IsFirstUndef.all()) {
15496 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15497 SmallBitVector IsFirstPoison =
15498 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15499 if (!IsFirstPoison.all()) {
15500 for (unsigned I = 0; I < NumElts; I++) {
15501 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15502 InsertMask[I] = I + NumElts;
15503 }
15504 }
15505 V = Builder.CreateShuffleVector(
15506 V,
15507 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15508 : FirstInsert->getOperand(0),
15509 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15510 if (auto *I = dyn_cast<Instruction>(V)) {
15511 GatherShuffleExtractSeq.insert(I);
15512 CSEBlocks.insert(I->getParent());
15513 }
15514 }
15515 } else {
15516 SmallBitVector IsFirstPoison =
15517 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15518 for (unsigned I = 0; I < NumElts; I++) {
15519 if (InsertMask[I] == PoisonMaskElem)
15520 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15521 else
15522 InsertMask[I] += NumElts;
15523 }
15524 V = Builder.CreateShuffleVector(
15525 FirstInsert->getOperand(0), V, InsertMask,
15526 cast<Instruction>(E->Scalars.back())->getName());
15527 if (auto *I = dyn_cast<Instruction>(V)) {
15528 GatherShuffleExtractSeq.insert(I);
15529 CSEBlocks.insert(I->getParent());
15530 }
15531 }
15532 }
15533
15534 ++NumVectorInstructions;
15535 E->VectorizedValue = V;
15536 return V;
15537 }
15538 case Instruction::ZExt:
15539 case Instruction::SExt:
15540 case Instruction::FPToUI:
15541 case Instruction::FPToSI:
15542 case Instruction::FPExt:
15543 case Instruction::PtrToInt:
15544 case Instruction::IntToPtr:
15545 case Instruction::SIToFP:
15546 case Instruction::UIToFP:
15547 case Instruction::Trunc:
15548 case Instruction::FPTrunc:
15549 case Instruction::BitCast: {
15550 setInsertPointAfterBundle(E);
15551
15552 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15553 if (E->VectorizedValue) {
15554 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15555 return E->VectorizedValue;
15556 }
15557
15558 auto *CI = cast<CastInst>(VL0);
15559 Instruction::CastOps VecOpcode = CI->getOpcode();
15560 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15561 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15562 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15563 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15564 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15565 // Check if the values are candidates to demote.
15566 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15567 if (SrcIt != MinBWs.end())
15568 SrcBWSz = SrcIt->second.first;
15569 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15570 if (BWSz == SrcBWSz) {
15571 VecOpcode = Instruction::BitCast;
15572 } else if (BWSz < SrcBWSz) {
15573 VecOpcode = Instruction::Trunc;
15574 } else if (It != MinBWs.end()) {
15575 assert(BWSz > SrcBWSz && "Invalid cast!");
15576 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15577 } else if (SrcIt != MinBWs.end()) {
15578 assert(BWSz > SrcBWSz && "Invalid cast!");
15579 VecOpcode =
15580 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15581 }
15582 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15583 !SrcIt->second.second) {
15584 VecOpcode = Instruction::UIToFP;
15585 }
15586 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15587 ? InVec
15588 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15589 V = FinalShuffle(V, E);
15590
15591 E->VectorizedValue = V;
15592 ++NumVectorInstructions;
15593 return V;
15594 }
15595 case Instruction::FCmp:
15596 case Instruction::ICmp: {
15597 setInsertPointAfterBundle(E);
15598
15599 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15600 if (E->VectorizedValue) {
15601 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15602 return E->VectorizedValue;
15603 }
15604 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15605 if (E->VectorizedValue) {
15606 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15607 return E->VectorizedValue;
15608 }
15609 if (L->getType() != R->getType()) {
15610 assert((getOperandEntry(E, 0)->isGather() ||
15611 getOperandEntry(E, 1)->isGather() ||
15612 MinBWs.contains(getOperandEntry(E, 0)) ||
15613 MinBWs.contains(getOperandEntry(E, 1))) &&
15614 "Expected item in MinBWs.");
15615 if (cast<VectorType>(L->getType())
15616 ->getElementType()
15617 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15618 ->getElementType()
15619 ->getIntegerBitWidth()) {
15620 Type *CastTy = R->getType();
15621 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15622 } else {
15623 Type *CastTy = L->getType();
15624 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15625 }
15626 }
15627
15628 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15629 Value *V = Builder.CreateCmp(P0, L, R);
15630 propagateIRFlags(V, E->Scalars, VL0);
15631 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15632 ICmp->setSameSign(/*B=*/false);
15633 // Do not cast for cmps.
15634 VecTy = cast<FixedVectorType>(V->getType());
15635 V = FinalShuffle(V, E);
15636
15637 E->VectorizedValue = V;
15638 ++NumVectorInstructions;
15639 return V;
15640 }
15641 case Instruction::Select: {
15642 setInsertPointAfterBundle(E);
15643
15644 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15645 if (E->VectorizedValue) {
15646 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15647 return E->VectorizedValue;
15648 }
15649 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15650 if (E->VectorizedValue) {
15651 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15652 return E->VectorizedValue;
15653 }
15654 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15655 if (E->VectorizedValue) {
15656 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15657 return E->VectorizedValue;
15658 }
15659 if (True->getType() != VecTy || False->getType() != VecTy) {
15660 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15661 getOperandEntry(E, 2)->isGather() ||
15662 MinBWs.contains(getOperandEntry(E, 1)) ||
15663 MinBWs.contains(getOperandEntry(E, 2))) &&
15664 "Expected item in MinBWs.");
15665 if (True->getType() != VecTy)
15666 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15667 if (False->getType() != VecTy)
15668 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15669 }
15670
15671 unsigned CondNumElements = getNumElements(Cond->getType());
15672 unsigned TrueNumElements = getNumElements(True->getType());
15673 assert(TrueNumElements >= CondNumElements &&
15674 TrueNumElements % CondNumElements == 0 &&
15675 "Cannot vectorize Instruction::Select");
15676 assert(TrueNumElements == getNumElements(False->getType()) &&
15677 "Cannot vectorize Instruction::Select");
15678 if (CondNumElements != TrueNumElements) {
15679 // When the return type is i1 but the source is fixed vector type, we
15680 // need to duplicate the condition value.
15681 Cond = Builder.CreateShuffleVector(
15682 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15683 CondNumElements));
15684 }
15685 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15686 "Cannot vectorize Instruction::Select");
15687 Value *V = Builder.CreateSelect(Cond, True, False);
15688 V = FinalShuffle(V, E);
15689
15690 E->VectorizedValue = V;
15691 ++NumVectorInstructions;
15692 return V;
15693 }
15694 case Instruction::FNeg: {
15695 setInsertPointAfterBundle(E);
15696
15697 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15698
15699 if (E->VectorizedValue) {
15700 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15701 return E->VectorizedValue;
15702 }
15703
15704 Value *V = Builder.CreateUnOp(
15705 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15706 propagateIRFlags(V, E->Scalars, VL0);
15707 if (auto *I = dyn_cast<Instruction>(V))
15708 V = ::propagateMetadata(I, E->Scalars);
15709
15710 V = FinalShuffle(V, E);
15711
15712 E->VectorizedValue = V;
15713 ++NumVectorInstructions;
15714
15715 return V;
15716 }
15717 case Instruction::Freeze: {
15718 setInsertPointAfterBundle(E);
15719
15720 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15721
15722 if (E->VectorizedValue) {
15723 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15724 return E->VectorizedValue;
15725 }
15726
15727 if (Op->getType() != VecTy) {
15728 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15729 MinBWs.contains(getOperandEntry(E, 0))) &&
15730 "Expected item in MinBWs.");
15731 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15732 }
15733 Value *V = Builder.CreateFreeze(Op);
15734 V = FinalShuffle(V, E);
15735
15736 E->VectorizedValue = V;
15737 ++NumVectorInstructions;
15738
15739 return V;
15740 }
15741 case Instruction::Add:
15742 case Instruction::FAdd:
15743 case Instruction::Sub:
15744 case Instruction::FSub:
15745 case Instruction::Mul:
15746 case Instruction::FMul:
15747 case Instruction::UDiv:
15748 case Instruction::SDiv:
15749 case Instruction::FDiv:
15750 case Instruction::URem:
15751 case Instruction::SRem:
15752 case Instruction::FRem:
15753 case Instruction::Shl:
15754 case Instruction::LShr:
15755 case Instruction::AShr:
15756 case Instruction::And:
15757 case Instruction::Or:
15758 case Instruction::Xor: {
15759 setInsertPointAfterBundle(E);
15760
15761 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15762 if (E->VectorizedValue) {
15763 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15764 return E->VectorizedValue;
15765 }
15766 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15767 if (E->VectorizedValue) {
15768 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15769 return E->VectorizedValue;
15770 }
15771 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15772 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15773 ArrayRef<Value *> Ops = E->getOperand(I);
15774 if (all_of(Ops, [&](Value *Op) {
15775 auto *CI = dyn_cast<ConstantInt>(Op);
15776 return CI && CI->getValue().countr_one() >= It->second.first;
15777 })) {
15778 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15779 E->VectorizedValue = V;
15780 ++NumVectorInstructions;
15781 return V;
15782 }
15783 }
15784 }
15785 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15786 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15787 getOperandEntry(E, 1)->isGather() ||
15788 MinBWs.contains(getOperandEntry(E, 0)) ||
15789 MinBWs.contains(getOperandEntry(E, 1))) &&
15790 "Expected item in MinBWs.");
15791 if (LHS->getType() != VecTy)
15792 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15793 if (RHS->getType() != VecTy)
15794 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15795 }
15796
15797 Value *V = Builder.CreateBinOp(
15798 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15799 RHS);
15800 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15801 if (auto *I = dyn_cast<Instruction>(V)) {
15802 V = ::propagateMetadata(I, E->Scalars);
15803 // Drop nuw flags for abs(sub(commutative), true).
15804 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15805 any_of(E->Scalars, [](Value *V) {
15806 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15807 }))
15808 I->setHasNoUnsignedWrap(/*b=*/false);
15809 }
15810
15811 V = FinalShuffle(V, E);
15812
15813 E->VectorizedValue = V;
15814 ++NumVectorInstructions;
15815
15816 return V;
15817 }
15818 case Instruction::Load: {
15819 // Loads are inserted at the head of the tree because we don't want to
15820 // sink them all the way down past store instructions.
15821 setInsertPointAfterBundle(E);
15822
15823 LoadInst *LI = cast<LoadInst>(VL0);
15824 Instruction *NewLI;
15825 Value *PO = LI->getPointerOperand();
15826 if (E->State == TreeEntry::Vectorize) {
15827 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15828 } else if (E->State == TreeEntry::StridedVectorize) {
15829 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15830 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15831 PO = IsReverseOrder ? PtrN : Ptr0;
15832 std::optional<int> Diff = getPointersDiff(
15833 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15834 Type *StrideTy = DL->getIndexType(PO->getType());
15835 Value *StrideVal;
15836 if (Diff) {
15837 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15838 StrideVal =
15839 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15840 DL->getTypeAllocSize(ScalarTy));
15841 } else {
15842 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15843 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15844 return cast<LoadInst>(V)->getPointerOperand();
15845 });
15846 OrdersType Order;
15847 std::optional<Value *> Stride =
15848 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15849 &*Builder.GetInsertPoint());
15850 Value *NewStride =
15851 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15852 StrideVal = Builder.CreateMul(
15853 NewStride,
15854 ConstantInt::get(
15855 StrideTy,
15856 (IsReverseOrder ? -1 : 1) *
15857 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15858 }
15859 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15860 auto *Inst = Builder.CreateIntrinsic(
15861 Intrinsic::experimental_vp_strided_load,
15862 {VecTy, PO->getType(), StrideTy},
15863 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15864 Builder.getInt32(E->Scalars.size())});
15865 Inst->addParamAttr(
15866 /*ArgNo=*/0,
15867 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15868 NewLI = Inst;
15869 } else {
15870 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15871 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15872 if (E->VectorizedValue) {
15873 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15874 return E->VectorizedValue;
15875 }
15876 if (isa<FixedVectorType>(ScalarTy)) {
15877 assert(SLPReVec && "FixedVectorType is not expected.");
15878 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
15879 // to expand VecPtr if ScalarTy is a vector type.
15880 unsigned ScalarTyNumElements =
15881 cast<FixedVectorType>(ScalarTy)->getNumElements();
15882 unsigned VecTyNumElements =
15883 cast<FixedVectorType>(VecTy)->getNumElements();
15884 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15885 "Cannot expand getelementptr.");
15886 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15887 SmallVector<Constant *> Indices(VecTyNumElements);
15888 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15889 return Builder.getInt64(I % ScalarTyNumElements);
15890 });
15891 VecPtr = Builder.CreateGEP(
15892 VecTy->getElementType(),
15893 Builder.CreateShuffleVector(
15894 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15895 ConstantVector::get(Indices));
15896 }
15897 // Use the minimum alignment of the gathered loads.
15898 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15899 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15900 }
15901 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15902
15903 V = FinalShuffle(V, E);
15904 E->VectorizedValue = V;
15905 ++NumVectorInstructions;
15906 return V;
15907 }
15908 case Instruction::Store: {
15909 auto *SI = cast<StoreInst>(VL0);
15910
15911 setInsertPointAfterBundle(E);
15912
15913 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15914 if (VecValue->getType() != VecTy)
15915 VecValue =
15916 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15917 VecValue = FinalShuffle(VecValue, E);
15918
15919 Value *Ptr = SI->getPointerOperand();
15920 Instruction *ST;
15921 if (E->State == TreeEntry::Vectorize) {
15922 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15923 } else {
15924 assert(E->State == TreeEntry::StridedVectorize &&
15925 "Expected either strided or consecutive stores.");
15926 if (!E->ReorderIndices.empty()) {
15927 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15928 Ptr = SI->getPointerOperand();
15929 }
15930 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15931 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15932 auto *Inst = Builder.CreateIntrinsic(
15933 Intrinsic::experimental_vp_strided_store,
15934 {VecTy, Ptr->getType(), StrideTy},
15935 {VecValue, Ptr,
15936 ConstantInt::get(
15937 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15938 Builder.getAllOnesMask(VecTy->getElementCount()),
15939 Builder.getInt32(E->Scalars.size())});
15940 Inst->addParamAttr(
15941 /*ArgNo=*/1,
15942 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15943 ST = Inst;
15944 }
15945
15946 Value *V = ::propagateMetadata(ST, E->Scalars);
15947
15948 E->VectorizedValue = V;
15949 ++NumVectorInstructions;
15950 return V;
15951 }
15952 case Instruction::GetElementPtr: {
15953 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15954 setInsertPointAfterBundle(E);
15955
15956 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15957 if (E->VectorizedValue) {
15958 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15959 return E->VectorizedValue;
15960 }
15961
15962 SmallVector<Value *> OpVecs;
15963 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
15964 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
15965 if (E->VectorizedValue) {
15966 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15967 return E->VectorizedValue;
15968 }
15969 OpVecs.push_back(OpVec);
15970 }
15971
15972 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
15973 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
15975 for (Value *V : E->Scalars) {
15976 if (isa<GetElementPtrInst>(V))
15977 GEPs.push_back(V);
15978 }
15979 V = ::propagateMetadata(I, GEPs);
15980 }
15981
15982 V = FinalShuffle(V, E);
15983
15984 E->VectorizedValue = V;
15985 ++NumVectorInstructions;
15986
15987 return V;
15988 }
15989 case Instruction::Call: {
15990 CallInst *CI = cast<CallInst>(VL0);
15991 setInsertPointAfterBundle(E);
15992
15994
15996 CI, ID, VecTy->getNumElements(),
15997 It != MinBWs.end() ? It->second.first : 0, TTI);
15998 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15999 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16000 VecCallCosts.first <= VecCallCosts.second;
16001
16002 Value *ScalarArg = nullptr;
16003 SmallVector<Value *> OpVecs;
16004 SmallVector<Type *, 2> TysForDecl;
16005 // Add return type if intrinsic is overloaded on it.
16006 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16007 TysForDecl.push_back(VecTy);
16008 auto *CEI = cast<CallInst>(VL0);
16009 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16010 ValueList OpVL;
16011 // Some intrinsics have scalar arguments. This argument should not be
16012 // vectorized.
16013 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16014 ScalarArg = CEI->getArgOperand(I);
16015 // if decided to reduce bitwidth of abs intrinsic, it second argument
16016 // must be set false (do not return poison, if value issigned min).
16017 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16018 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16019 ScalarArg = Builder.getFalse();
16020 OpVecs.push_back(ScalarArg);
16022 TysForDecl.push_back(ScalarArg->getType());
16023 continue;
16024 }
16025
16026 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16027 if (E->VectorizedValue) {
16028 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16029 return E->VectorizedValue;
16030 }
16031 ScalarArg = CEI->getArgOperand(I);
16032 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16033 ScalarArg->getType()->getScalarType() &&
16034 It == MinBWs.end()) {
16035 auto *CastTy =
16036 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16037 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16038 } else if (It != MinBWs.end()) {
16039 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16040 }
16041 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16042 OpVecs.push_back(OpVec);
16043 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16044 TysForDecl.push_back(OpVec->getType());
16045 }
16046
16047 Function *CF;
16048 if (!UseIntrinsic) {
16049 VFShape Shape =
16052 static_cast<unsigned>(VecTy->getNumElements())),
16053 false /*HasGlobalPred*/);
16054 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16055 } else {
16056 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16057 }
16058
16060 CI->getOperandBundlesAsDefs(OpBundles);
16061 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16062
16063 propagateIRFlags(V, E->Scalars, VL0);
16064 V = FinalShuffle(V, E);
16065
16066 E->VectorizedValue = V;
16067 ++NumVectorInstructions;
16068 return V;
16069 }
16070 case Instruction::ShuffleVector: {
16071 Value *V;
16072 if (SLPReVec && !E->isAltShuffle()) {
16073 setInsertPointAfterBundle(E);
16074 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16075 if (E->VectorizedValue) {
16076 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16077 return E->VectorizedValue;
16078 }
16079 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16080 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16081 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16082 "Not supported shufflevector usage.");
16083 SmallVector<int> NewMask(ThisMask.size());
16084 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16085 return SVSrc->getShuffleMask()[Mask];
16086 });
16087 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16088 } else {
16089 V = Builder.CreateShuffleVector(Src, ThisMask);
16090 }
16091 propagateIRFlags(V, E->Scalars, VL0);
16092 if (auto *I = dyn_cast<Instruction>(V))
16093 V = ::propagateMetadata(I, E->Scalars);
16094 V = FinalShuffle(V, E);
16095 } else {
16096 assert(E->isAltShuffle() &&
16097 ((Instruction::isBinaryOp(E->getOpcode()) &&
16098 Instruction::isBinaryOp(E->getAltOpcode())) ||
16099 (Instruction::isCast(E->getOpcode()) &&
16100 Instruction::isCast(E->getAltOpcode())) ||
16101 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16102 "Invalid Shuffle Vector Operand");
16103
16104 Value *LHS = nullptr, *RHS = nullptr;
16105 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16106 setInsertPointAfterBundle(E);
16107 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16108 if (E->VectorizedValue) {
16109 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16110 return E->VectorizedValue;
16111 }
16112 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16113 } else {
16114 setInsertPointAfterBundle(E);
16115 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16116 }
16117 if (E->VectorizedValue) {
16118 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16119 return E->VectorizedValue;
16120 }
16121 if (LHS && RHS &&
16122 ((Instruction::isBinaryOp(E->getOpcode()) &&
16123 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16124 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16125 assert((It != MinBWs.end() ||
16126 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16127 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16128 MinBWs.contains(getOperandEntry(E, 0)) ||
16129 MinBWs.contains(getOperandEntry(E, 1))) &&
16130 "Expected item in MinBWs.");
16131 Type *CastTy = VecTy;
16132 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16133 if (cast<VectorType>(LHS->getType())
16134 ->getElementType()
16135 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16136 ->getElementType()
16137 ->getIntegerBitWidth())
16138 CastTy = RHS->getType();
16139 else
16140 CastTy = LHS->getType();
16141 }
16142 if (LHS->getType() != CastTy)
16143 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16144 if (RHS->getType() != CastTy)
16145 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16146 }
16147
16148 Value *V0, *V1;
16149 if (Instruction::isBinaryOp(E->getOpcode())) {
16150 V0 = Builder.CreateBinOp(
16151 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16152 V1 = Builder.CreateBinOp(
16153 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16154 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16155 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16156 auto *AltCI = cast<CmpInst>(E->getAltOp());
16157 CmpInst::Predicate AltPred = AltCI->getPredicate();
16158 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16159 } else {
16160 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16161 unsigned SrcBWSz = DL->getTypeSizeInBits(
16162 cast<VectorType>(LHS->getType())->getElementType());
16163 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16164 if (BWSz <= SrcBWSz) {
16165 if (BWSz < SrcBWSz)
16166 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16167 assert(LHS->getType() == VecTy &&
16168 "Expected same type as operand.");
16169 if (auto *I = dyn_cast<Instruction>(LHS))
16170 LHS = ::propagateMetadata(I, E->Scalars);
16171 LHS = FinalShuffle(LHS, E);
16172 E->VectorizedValue = LHS;
16173 ++NumVectorInstructions;
16174 return LHS;
16175 }
16176 }
16177 V0 = Builder.CreateCast(
16178 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16179 V1 = Builder.CreateCast(
16180 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16181 }
16182 // Add V0 and V1 to later analysis to try to find and remove matching
16183 // instruction, if any.
16184 for (Value *V : {V0, V1}) {
16185 if (auto *I = dyn_cast<Instruction>(V)) {
16186 GatherShuffleExtractSeq.insert(I);
16187 CSEBlocks.insert(I->getParent());
16188 }
16189 }
16190
16191 // Create shuffle to take alternate operations from the vector.
16192 // Also, gather up main and alt scalar ops to propagate IR flags to
16193 // each vector operation.
16194 ValueList OpScalars, AltScalars;
16196 E->buildAltOpShuffleMask(
16197 [E, this](Instruction *I) {
16198 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16199 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16200 *TLI);
16201 },
16202 Mask, &OpScalars, &AltScalars);
16203
16204 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16205 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16206 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16207 // Drop nuw flags for abs(sub(commutative), true).
16208 if (auto *I = dyn_cast<Instruction>(Vec);
16209 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16210 any_of(E->Scalars, [](Value *V) {
16211 if (isa<PoisonValue>(V))
16212 return false;
16213 auto *IV = cast<Instruction>(V);
16214 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16215 }))
16216 I->setHasNoUnsignedWrap(/*b=*/false);
16217 };
16218 DropNuwFlag(V0, E->getOpcode());
16219 DropNuwFlag(V1, E->getAltOpcode());
16220
16221 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16222 assert(SLPReVec && "FixedVectorType is not expected.");
16224 }
16225 V = Builder.CreateShuffleVector(V0, V1, Mask);
16226 if (auto *I = dyn_cast<Instruction>(V)) {
16227 V = ::propagateMetadata(I, E->Scalars);
16228 GatherShuffleExtractSeq.insert(I);
16229 CSEBlocks.insert(I->getParent());
16230 }
16231 }
16232
16233 E->VectorizedValue = V;
16234 ++NumVectorInstructions;
16235
16236 return V;
16237 }
16238 default:
16239 llvm_unreachable("unknown inst");
16240 }
16241 return nullptr;
16242}
16243
16245 ExtraValueToDebugLocsMap ExternallyUsedValues;
16246 return vectorizeTree(ExternallyUsedValues);
16247}
16248
16249Value *
16251 Instruction *ReductionRoot) {
16252 // All blocks must be scheduled before any instructions are inserted.
16253 for (auto &BSIter : BlocksSchedules) {
16254 scheduleBlock(BSIter.second.get());
16255 }
16256 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
16257 // need to rebuild it.
16258 EntryToLastInstruction.clear();
16259
16260 if (ReductionRoot)
16261 Builder.SetInsertPoint(ReductionRoot->getParent(),
16262 ReductionRoot->getIterator());
16263 else
16264 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16265
16266 // Emit gathered loads first to emit better code for the users of those
16267 // gathered loads.
16268 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16269 if (GatheredLoadsEntriesFirst.has_value() &&
16270 TE->Idx >= *GatheredLoadsEntriesFirst &&
16271 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16272 assert((!TE->UserTreeIndices.empty() ||
16273 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16274 "Expected gathered load node.");
16275 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16276 }
16277 }
16278 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
16279 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16280 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16281 if (TE->State == TreeEntry::Vectorize &&
16282 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16283 TE->VectorizedValue)
16284 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16285 // Run through the list of postponed gathers and emit them, replacing the temp
16286 // emitted allocas with actual vector instructions.
16287 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16289 for (const TreeEntry *E : PostponedNodes) {
16290 auto *TE = const_cast<TreeEntry *>(E);
16291 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16292 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16293 TE->UserTreeIndices.front().EdgeIdx)) &&
16294 VecTE->isSame(TE->Scalars))
16295 // Found gather node which is absolutely the same as one of the
16296 // vectorized nodes. It may happen after reordering.
16297 continue;
16298 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16299 TE->VectorizedValue = nullptr;
16300 auto *UserI =
16301 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16302 // If user is a PHI node, its vector code have to be inserted right before
16303 // block terminator. Since the node was delayed, there were some unresolved
16304 // dependencies at the moment when stab instruction was emitted. In a case
16305 // when any of these dependencies turn out an operand of another PHI, coming
16306 // from this same block, position of a stab instruction will become invalid.
16307 // The is because source vector that supposed to feed this gather node was
16308 // inserted at the end of the block [after stab instruction]. So we need
16309 // to adjust insertion point again to the end of block.
16310 if (isa<PHINode>(UserI)) {
16311 // Insert before all users.
16312 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16313 for (User *U : PrevVec->users()) {
16314 if (U == UserI)
16315 continue;
16316 auto *UI = dyn_cast<Instruction>(U);
16317 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16318 continue;
16319 if (UI->comesBefore(InsertPt))
16320 InsertPt = UI;
16321 }
16322 Builder.SetInsertPoint(InsertPt);
16323 } else {
16324 Builder.SetInsertPoint(PrevVec);
16325 }
16326 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16327 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16328 if (auto *VecI = dyn_cast<Instruction>(Vec);
16329 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16330 Builder.GetInsertPoint()->comesBefore(VecI))
16331 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16332 Builder.GetInsertPoint());
16333 if (Vec->getType() != PrevVec->getType()) {
16334 assert(Vec->getType()->isIntOrIntVectorTy() &&
16335 PrevVec->getType()->isIntOrIntVectorTy() &&
16336 "Expected integer vector types only.");
16337 std::optional<bool> IsSigned;
16338 for (Value *V : TE->Scalars) {
16339 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16340 auto It = MinBWs.find(BaseTE);
16341 if (It != MinBWs.end()) {
16342 IsSigned = IsSigned.value_or(false) || It->second.second;
16343 if (*IsSigned)
16344 break;
16345 }
16346 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16347 auto It = MinBWs.find(MNTE);
16348 if (It != MinBWs.end()) {
16349 IsSigned = IsSigned.value_or(false) || It->second.second;
16350 if (*IsSigned)
16351 break;
16352 }
16353 }
16354 if (IsSigned.value_or(false))
16355 break;
16356 // Scan through gather nodes.
16357 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16358 auto It = MinBWs.find(BVE);
16359 if (It != MinBWs.end()) {
16360 IsSigned = IsSigned.value_or(false) || It->second.second;
16361 if (*IsSigned)
16362 break;
16363 }
16364 }
16365 if (IsSigned.value_or(false))
16366 break;
16367 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16368 IsSigned =
16369 IsSigned.value_or(false) ||
16370 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16371 continue;
16372 }
16373 if (IsSigned.value_or(false))
16374 break;
16375 }
16376 }
16377 if (IsSigned.value_or(false)) {
16378 // Final attempt - check user node.
16379 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16380 if (It != MinBWs.end())
16381 IsSigned = It->second.second;
16382 }
16383 assert(IsSigned &&
16384 "Expected user node or perfect diamond match in MinBWs.");
16385 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16386 }
16387 PrevVec->replaceAllUsesWith(Vec);
16388 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16389 // Replace the stub vector node, if it was used before for one of the
16390 // buildvector nodes already.
16391 auto It = PostponedValues.find(PrevVec);
16392 if (It != PostponedValues.end()) {
16393 for (TreeEntry *VTE : It->getSecond())
16394 VTE->VectorizedValue = Vec;
16395 }
16396 eraseInstruction(PrevVec);
16397 }
16398
16399 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16400 << " values .\n");
16401
16403 // Maps vector instruction to original insertelement instruction
16404 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16405 // Maps extract Scalar to the corresponding extractelement instruction in the
16406 // basic block. Only one extractelement per block should be emitted.
16408 ScalarToEEs;
16409 SmallDenseSet<Value *, 4> UsedInserts;
16411 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16413 // Extract all of the elements with the external uses.
16414 for (const auto &ExternalUse : ExternalUses) {
16415 Value *Scalar = ExternalUse.Scalar;
16416 llvm::User *User = ExternalUse.User;
16417
16418 // Skip users that we already RAUW. This happens when one instruction
16419 // has multiple uses of the same value.
16420 if (User && !is_contained(Scalar->users(), User))
16421 continue;
16422 TreeEntry *E = getTreeEntry(Scalar);
16423 assert(E && "Invalid scalar");
16424 assert(!E->isGather() && "Extracting from a gather list");
16425 // Non-instruction pointers are not deleted, just skip them.
16426 if (E->getOpcode() == Instruction::GetElementPtr &&
16427 !isa<GetElementPtrInst>(Scalar))
16428 continue;
16429
16430 Value *Vec = E->VectorizedValue;
16431 assert(Vec && "Can't find vectorizable value");
16432
16433 Value *Lane = Builder.getInt32(ExternalUse.Lane);
16434 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16435 if (Scalar->getType() != Vec->getType()) {
16436 Value *Ex = nullptr;
16437 Value *ExV = nullptr;
16438 auto *Inst = dyn_cast<Instruction>(Scalar);
16439 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16440 auto It = ScalarToEEs.find(Scalar);
16441 if (It != ScalarToEEs.end()) {
16442 // No need to emit many extracts, just move the only one in the
16443 // current block.
16444 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16445 : Builder.GetInsertBlock());
16446 if (EEIt != It->second.end()) {
16447 Value *PrevV = EEIt->second.first;
16448 if (auto *I = dyn_cast<Instruction>(PrevV);
16449 I && !ReplaceInst &&
16450 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16451 Builder.GetInsertPoint()->comesBefore(I)) {
16452 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16453 Builder.GetInsertPoint());
16454 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16455 CI->moveAfter(I);
16456 }
16457 Ex = PrevV;
16458 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16459 }
16460 }
16461 if (!Ex) {
16462 // "Reuse" the existing extract to improve final codegen.
16463 if (ReplaceInst) {
16464 // Leave the instruction as is, if it cheaper extracts and all
16465 // operands are scalar.
16466 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16467 IgnoredExtracts.insert(EE);
16468 Ex = EE;
16469 } else {
16470 auto *CloneInst = Inst->clone();
16471 CloneInst->insertBefore(Inst);
16472 if (Inst->hasName())
16473 CloneInst->takeName(Inst);
16474 Ex = CloneInst;
16475 }
16476 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16477 ES && isa<Instruction>(Vec)) {
16478 Value *V = ES->getVectorOperand();
16479 auto *IVec = cast<Instruction>(Vec);
16480 if (const TreeEntry *ETE = getTreeEntry(V))
16481 V = ETE->VectorizedValue;
16482 if (auto *IV = dyn_cast<Instruction>(V);
16483 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16484 IV->comesBefore(IVec))
16485 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16486 else
16487 Ex = Builder.CreateExtractElement(Vec, Lane);
16488 } else if (auto *VecTy =
16489 dyn_cast<FixedVectorType>(Scalar->getType())) {
16490 assert(SLPReVec && "FixedVectorType is not expected.");
16491 unsigned VecTyNumElements = VecTy->getNumElements();
16492 // When REVEC is enabled, we need to extract a vector.
16493 // Note: The element size of Scalar may be different from the
16494 // element size of Vec.
16495 Ex = Builder.CreateExtractVector(
16497 VecTyNumElements),
16498 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
16499 } else {
16500 Ex = Builder.CreateExtractElement(Vec, Lane);
16501 }
16502 // If necessary, sign-extend or zero-extend ScalarRoot
16503 // to the larger type.
16504 ExV = Ex;
16505 if (Scalar->getType() != Ex->getType())
16506 ExV = Builder.CreateIntCast(
16507 Ex, Scalar->getType(),
16508 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16509 auto *I = dyn_cast<Instruction>(Ex);
16510 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16511 : &F->getEntryBlock(),
16512 std::make_pair(Ex, ExV));
16513 }
16514 // The then branch of the previous if may produce constants, since 0
16515 // operand might be a constant.
16516 if (auto *ExI = dyn_cast<Instruction>(Ex);
16517 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16518 GatherShuffleExtractSeq.insert(ExI);
16519 CSEBlocks.insert(ExI->getParent());
16520 }
16521 return ExV;
16522 }
16523 assert(isa<FixedVectorType>(Scalar->getType()) &&
16524 isa<InsertElementInst>(Scalar) &&
16525 "In-tree scalar of vector type is not insertelement?");
16526 auto *IE = cast<InsertElementInst>(Scalar);
16527 VectorToInsertElement.try_emplace(Vec, IE);
16528 return Vec;
16529 };
16530 // If User == nullptr, the Scalar remains as scalar in vectorized
16531 // instructions or is used as extra arg. Generate ExtractElement instruction
16532 // and update the record for this scalar in ExternallyUsedValues.
16533 if (!User) {
16534 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16535 continue;
16536 assert((ExternallyUsedValues.count(Scalar) ||
16537 Scalar->hasNUsesOrMore(UsesLimit) ||
16538 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16539 any_of(Scalar->users(),
16540 [&](llvm::User *U) {
16541 if (ExternalUsesAsOriginalScalar.contains(U))
16542 return true;
16543 TreeEntry *UseEntry = getTreeEntry(U);
16544 return UseEntry &&
16545 (UseEntry->State == TreeEntry::Vectorize ||
16546 UseEntry->State ==
16547 TreeEntry::StridedVectorize) &&
16548 (E->State == TreeEntry::Vectorize ||
16549 E->State == TreeEntry::StridedVectorize) &&
16550 doesInTreeUserNeedToExtract(
16551 Scalar, getRootEntryInstruction(*UseEntry),
16552 TLI, TTI);
16553 })) &&
16554 "Scalar with nullptr User must be registered in "
16555 "ExternallyUsedValues map or remain as scalar in vectorized "
16556 "instructions");
16557 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16558 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16559 if (PHI->getParent()->isLandingPad())
16560 Builder.SetInsertPoint(
16561 PHI->getParent(),
16562 std::next(
16563 PHI->getParent()->getLandingPadInst()->getIterator()));
16564 else
16565 Builder.SetInsertPoint(PHI->getParent(),
16566 PHI->getParent()->getFirstNonPHIIt());
16567 } else {
16568 Builder.SetInsertPoint(VecI->getParent(),
16569 std::next(VecI->getIterator()));
16570 }
16571 } else {
16572 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16573 }
16574 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16575 // Required to update internally referenced instructions.
16576 if (Scalar != NewInst) {
16577 assert((!isa<ExtractElementInst>(Scalar) ||
16578 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16579 "Extractelements should not be replaced.");
16580 Scalar->replaceAllUsesWith(NewInst);
16581 }
16582 continue;
16583 }
16584
16585 if (auto *VU = dyn_cast<InsertElementInst>(User);
16586 VU && VU->getOperand(1) == Scalar) {
16587 // Skip if the scalar is another vector op or Vec is not an instruction.
16588 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16589 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16590 if (!UsedInserts.insert(VU).second)
16591 continue;
16592 // Need to use original vector, if the root is truncated.
16593 auto BWIt = MinBWs.find(E);
16594 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16595 auto *ScalarTy = FTy->getElementType();
16596 auto Key = std::make_pair(Vec, ScalarTy);
16597 auto VecIt = VectorCasts.find(Key);
16598 if (VecIt == VectorCasts.end()) {
16599 IRBuilderBase::InsertPointGuard Guard(Builder);
16600 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16601 if (IVec->getParent()->isLandingPad())
16602 Builder.SetInsertPoint(IVec->getParent(),
16603 std::next(IVec->getParent()
16604 ->getLandingPadInst()
16605 ->getIterator()));
16606 else
16607 Builder.SetInsertPoint(
16608 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16609 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16610 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16611 }
16612 Vec = Builder.CreateIntCast(
16613 Vec,
16615 ScalarTy,
16616 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16617 BWIt->second.second);
16618 VectorCasts.try_emplace(Key, Vec);
16619 } else {
16620 Vec = VecIt->second;
16621 }
16622 }
16623
16624 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16625 if (InsertIdx) {
16626 auto *It = find_if(
16627 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16628 // Checks if 2 insertelements are from the same buildvector.
16629 InsertElementInst *VecInsert = Data.InsertElements.front();
16631 VU, VecInsert,
16632 [](InsertElementInst *II) { return II->getOperand(0); });
16633 });
16634 unsigned Idx = *InsertIdx;
16635 if (It == ShuffledInserts.end()) {
16636 (void)ShuffledInserts.emplace_back();
16637 It = std::next(ShuffledInserts.begin(),
16638 ShuffledInserts.size() - 1);
16639 }
16640 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16641 if (Mask.empty())
16642 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16643 Mask[Idx] = ExternalUse.Lane;
16644 It->InsertElements.push_back(cast<InsertElementInst>(User));
16645 continue;
16646 }
16647 }
16648 }
16649 }
16650
16651 // Generate extracts for out-of-tree users.
16652 // Find the insertion point for the extractelement lane.
16653 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16654 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16655 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16656 if (PH->getIncomingValue(I) == Scalar) {
16657 Instruction *IncomingTerminator =
16658 PH->getIncomingBlock(I)->getTerminator();
16659 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16660 Builder.SetInsertPoint(VecI->getParent(),
16661 std::next(VecI->getIterator()));
16662 } else {
16663 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16664 }
16665 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16666 PH->setOperand(I, NewInst);
16667 }
16668 }
16669 } else {
16670 Builder.SetInsertPoint(cast<Instruction>(User));
16671 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16672 User->replaceUsesOfWith(Scalar, NewInst);
16673 }
16674 } else {
16675 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16676 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16677 User->replaceUsesOfWith(Scalar, NewInst);
16678 }
16679
16680 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16681 }
16682
16683 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16684 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16685 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16686 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16687 for (int I = 0, E = Mask.size(); I < E; ++I) {
16688 if (Mask[I] < VF)
16689 CombinedMask1[I] = Mask[I];
16690 else
16691 CombinedMask2[I] = Mask[I] - VF;
16692 }
16693 ShuffleInstructionBuilder ShuffleBuilder(
16694 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16695 ShuffleBuilder.add(V1, CombinedMask1);
16696 if (V2)
16697 ShuffleBuilder.add(V2, CombinedMask2);
16698 return ShuffleBuilder.finalize({}, {}, {});
16699 };
16700
16701 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16702 bool ForSingleMask) {
16703 unsigned VF = Mask.size();
16704 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16705 if (VF != VecVF) {
16706 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16707 Vec = CreateShuffle(Vec, nullptr, Mask);
16708 return std::make_pair(Vec, true);
16709 }
16710 if (!ForSingleMask) {
16711 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16712 for (unsigned I = 0; I < VF; ++I) {
16713 if (Mask[I] != PoisonMaskElem)
16714 ResizeMask[Mask[I]] = Mask[I];
16715 }
16716 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16717 }
16718 }
16719
16720 return std::make_pair(Vec, false);
16721 };
16722 // Perform shuffling of the vectorize tree entries for better handling of
16723 // external extracts.
16724 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16725 // Find the first and the last instruction in the list of insertelements.
16726 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16727 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16728 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16729 Builder.SetInsertPoint(LastInsert);
16730 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16731 Value *NewInst = performExtractsShuffleAction<Value>(
16732 MutableArrayRef(Vector.data(), Vector.size()),
16733 FirstInsert->getOperand(0),
16734 [](Value *Vec) {
16735 return cast<VectorType>(Vec->getType())
16736 ->getElementCount()
16737 .getKnownMinValue();
16738 },
16739 ResizeToVF,
16740 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16741 ArrayRef<Value *> Vals) {
16742 assert((Vals.size() == 1 || Vals.size() == 2) &&
16743 "Expected exactly 1 or 2 input values.");
16744 if (Vals.size() == 1) {
16745 // Do not create shuffle if the mask is a simple identity
16746 // non-resizing mask.
16747 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16748 ->getNumElements() ||
16749 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16750 return CreateShuffle(Vals.front(), nullptr, Mask);
16751 return Vals.front();
16752 }
16753 return CreateShuffle(Vals.front() ? Vals.front()
16754 : FirstInsert->getOperand(0),
16755 Vals.back(), Mask);
16756 });
16757 auto It = ShuffledInserts[I].InsertElements.rbegin();
16758 // Rebuild buildvector chain.
16759 InsertElementInst *II = nullptr;
16760 if (It != ShuffledInserts[I].InsertElements.rend())
16761 II = *It;
16763 while (It != ShuffledInserts[I].InsertElements.rend()) {
16764 assert(II && "Must be an insertelement instruction.");
16765 if (*It == II)
16766 ++It;
16767 else
16768 Inserts.push_back(cast<Instruction>(II));
16769 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16770 }
16771 for (Instruction *II : reverse(Inserts)) {
16772 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16773 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16774 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16775 II->moveAfter(NewI);
16776 NewInst = II;
16777 }
16778 LastInsert->replaceAllUsesWith(NewInst);
16779 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16780 IE->replaceUsesOfWith(IE->getOperand(0),
16781 PoisonValue::get(IE->getOperand(0)->getType()));
16782 IE->replaceUsesOfWith(IE->getOperand(1),
16783 PoisonValue::get(IE->getOperand(1)->getType()));
16784 eraseInstruction(IE);
16785 }
16786 CSEBlocks.insert(LastInsert->getParent());
16787 }
16788
16789 SmallVector<Instruction *> RemovedInsts;
16790 // For each vectorized value:
16791 for (auto &TEPtr : VectorizableTree) {
16792 TreeEntry *Entry = TEPtr.get();
16793
16794 // No need to handle users of gathered values.
16795 if (Entry->isGather())
16796 continue;
16797
16798 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16799
16800 // For each lane:
16801 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16802 Value *Scalar = Entry->Scalars[Lane];
16803
16804 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16805 !isa<GetElementPtrInst>(Scalar))
16806 continue;
16807 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16808 EE && IgnoredExtracts.contains(EE))
16809 continue;
16810 if (isa<PoisonValue>(Scalar))
16811 continue;
16812#ifndef NDEBUG
16813 Type *Ty = Scalar->getType();
16814 if (!Ty->isVoidTy()) {
16815 for (User *U : Scalar->users()) {
16816 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16817
16818 // It is legal to delete users in the ignorelist.
16819 assert((getTreeEntry(U) ||
16820 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16821 (isa_and_nonnull<Instruction>(U) &&
16822 isDeleted(cast<Instruction>(U)))) &&
16823 "Deleting out-of-tree value");
16824 }
16825 }
16826#endif
16827 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16828 auto *I = cast<Instruction>(Scalar);
16829 RemovedInsts.push_back(I);
16830 }
16831 }
16832
16833 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16834 // new vector instruction.
16835 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16836 V->mergeDIAssignID(RemovedInsts);
16837
16838 // Clear up reduction references, if any.
16839 if (UserIgnoreList) {
16840 for (Instruction *I : RemovedInsts) {
16841 const TreeEntry *IE = getTreeEntry(I);
16842 if (IE->Idx != 0 &&
16843 !(VectorizableTree.front()->isGather() &&
16844 !IE->UserTreeIndices.empty() &&
16845 (ValueToGatherNodes.lookup(I).contains(
16846 VectorizableTree.front().get()) ||
16847 any_of(IE->UserTreeIndices,
16848 [&](const EdgeInfo &EI) {
16849 return EI.UserTE == VectorizableTree.front().get() &&
16850 EI.EdgeIdx == UINT_MAX;
16851 }))) &&
16852 !(GatheredLoadsEntriesFirst.has_value() &&
16853 IE->Idx >= *GatheredLoadsEntriesFirst &&
16854 VectorizableTree.front()->isGather() &&
16855 is_contained(VectorizableTree.front()->Scalars, I)))
16856 continue;
16857 SmallVector<SelectInst *> LogicalOpSelects;
16858 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16859 // Do not replace condition of the logical op in form select <cond>.
16860 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16861 (match(U.getUser(), m_LogicalAnd()) ||
16862 match(U.getUser(), m_LogicalOr())) &&
16863 U.getOperandNo() == 0;
16864 if (IsPoisoningLogicalOp) {
16865 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16866 return false;
16867 }
16868 return UserIgnoreList->contains(U.getUser());
16869 });
16870 // Replace conditions of the poisoning logical ops with the non-poison
16871 // constant value.
16872 for (SelectInst *SI : LogicalOpSelects)
16873 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16874 }
16875 }
16876 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16877 // cache correctness.
16878 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
16879 // - instructions are not deleted until later.
16880 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16881
16882 Builder.ClearInsertionPoint();
16883 InstrElementSize.clear();
16884
16885 const TreeEntry &RootTE = *VectorizableTree.front();
16886 Value *Vec = RootTE.VectorizedValue;
16887 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16888 It != MinBWs.end() &&
16889 ReductionBitWidth != It->second.first) {
16890 IRBuilder<>::InsertPointGuard Guard(Builder);
16891 Builder.SetInsertPoint(ReductionRoot->getParent(),
16892 ReductionRoot->getIterator());
16893 Vec = Builder.CreateIntCast(
16894 Vec,
16895 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16896 cast<VectorType>(Vec->getType())->getElementCount()),
16897 It->second.second);
16898 }
16899 return Vec;
16900}
16901
16903 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16904 << " gather sequences instructions.\n");
16905 // LICM InsertElementInst sequences.
16906 for (Instruction *I : GatherShuffleExtractSeq) {
16907 if (isDeleted(I))
16908 continue;
16909
16910 // Check if this block is inside a loop.
16911 Loop *L = LI->getLoopFor(I->getParent());
16912 if (!L)
16913 continue;
16914
16915 // Check if it has a preheader.
16916 BasicBlock *PreHeader = L->getLoopPreheader();
16917 if (!PreHeader)
16918 continue;
16919
16920 // If the vector or the element that we insert into it are
16921 // instructions that are defined in this basic block then we can't
16922 // hoist this instruction.
16923 if (any_of(I->operands(), [L](Value *V) {
16924 auto *OpI = dyn_cast<Instruction>(V);
16925 return OpI && L->contains(OpI);
16926 }))
16927 continue;
16928
16929 // We can hoist this instruction. Move it to the pre-header.
16930 I->moveBefore(PreHeader->getTerminator());
16931 CSEBlocks.insert(PreHeader);
16932 }
16933
16934 // Make a list of all reachable blocks in our CSE queue.
16936 CSEWorkList.reserve(CSEBlocks.size());
16937 for (BasicBlock *BB : CSEBlocks)
16938 if (DomTreeNode *N = DT->getNode(BB)) {
16940 CSEWorkList.push_back(N);
16941 }
16942
16943 // Sort blocks by domination. This ensures we visit a block after all blocks
16944 // dominating it are visited.
16945 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16946 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16947 "Different nodes should have different DFS numbers");
16948 return A->getDFSNumIn() < B->getDFSNumIn();
16949 });
16950
16951 // Less defined shuffles can be replaced by the more defined copies.
16952 // Between two shuffles one is less defined if it has the same vector operands
16953 // and its mask indeces are the same as in the first one or undefs. E.g.
16954 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
16955 // poison, <0, 0, 0, 0>.
16956 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
16957 SmallVectorImpl<int> &NewMask) {
16958 if (I1->getType() != I2->getType())
16959 return false;
16960 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
16961 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
16962 if (!SI1 || !SI2)
16963 return I1->isIdenticalTo(I2);
16964 if (SI1->isIdenticalTo(SI2))
16965 return true;
16966 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
16967 if (SI1->getOperand(I) != SI2->getOperand(I))
16968 return false;
16969 // Check if the second instruction is more defined than the first one.
16970 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
16971 ArrayRef<int> SM1 = SI1->getShuffleMask();
16972 // Count trailing undefs in the mask to check the final number of used
16973 // registers.
16974 unsigned LastUndefsCnt = 0;
16975 for (int I = 0, E = NewMask.size(); I < E; ++I) {
16976 if (SM1[I] == PoisonMaskElem)
16977 ++LastUndefsCnt;
16978 else
16979 LastUndefsCnt = 0;
16980 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
16981 NewMask[I] != SM1[I])
16982 return false;
16983 if (NewMask[I] == PoisonMaskElem)
16984 NewMask[I] = SM1[I];
16985 }
16986 // Check if the last undefs actually change the final number of used vector
16987 // registers.
16988 return SM1.size() - LastUndefsCnt > 1 &&
16989 TTI->getNumberOfParts(SI1->getType()) ==
16991 getWidenedType(SI1->getType()->getElementType(),
16992 SM1.size() - LastUndefsCnt));
16993 };
16994 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
16995 // instructions. TODO: We can further optimize this scan if we split the
16996 // instructions into different buckets based on the insert lane.
16998 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
16999 assert(*I &&
17000 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17001 "Worklist not sorted properly!");
17002 BasicBlock *BB = (*I)->getBlock();
17003 // For all instructions in blocks containing gather sequences:
17004 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17005 if (isDeleted(&In))
17006 continue;
17007 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17008 !GatherShuffleExtractSeq.contains(&In))
17009 continue;
17010
17011 // Check if we can replace this instruction with any of the
17012 // visited instructions.
17013 bool Replaced = false;
17014 for (Instruction *&V : Visited) {
17015 SmallVector<int> NewMask;
17016 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17017 DT->dominates(V->getParent(), In.getParent())) {
17018 In.replaceAllUsesWith(V);
17019 eraseInstruction(&In);
17020 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17021 if (!NewMask.empty())
17022 SI->setShuffleMask(NewMask);
17023 Replaced = true;
17024 break;
17025 }
17026 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17027 GatherShuffleExtractSeq.contains(V) &&
17028 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17029 DT->dominates(In.getParent(), V->getParent())) {
17030 In.moveAfter(V);
17031 V->replaceAllUsesWith(&In);
17033 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17034 if (!NewMask.empty())
17035 SI->setShuffleMask(NewMask);
17036 V = &In;
17037 Replaced = true;
17038 break;
17039 }
17040 }
17041 if (!Replaced) {
17042 assert(!is_contained(Visited, &In));
17043 Visited.push_back(&In);
17044 }
17045 }
17046 }
17047 CSEBlocks.clear();
17048 GatherShuffleExtractSeq.clear();
17049}
17050
17051BoUpSLP::ScheduleData *
17052BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17053 ScheduleData *Bundle = nullptr;
17054 ScheduleData *PrevInBundle = nullptr;
17055 for (Value *V : VL) {
17057 continue;
17058 ScheduleData *BundleMember = getScheduleData(V);
17059 assert(BundleMember &&
17060 "no ScheduleData for bundle member "
17061 "(maybe not in same basic block)");
17062 assert(BundleMember->isSchedulingEntity() &&
17063 "bundle member already part of other bundle");
17064 if (PrevInBundle) {
17065 PrevInBundle->NextInBundle = BundleMember;
17066 } else {
17067 Bundle = BundleMember;
17068 }
17069
17070 // Group the instructions to a bundle.
17071 BundleMember->FirstInBundle = Bundle;
17072 PrevInBundle = BundleMember;
17073 }
17074 assert(Bundle && "Failed to find schedule bundle");
17075 return Bundle;
17076}
17077
17078// Groups the instructions to a bundle (which is then a single scheduling entity)
17079// and schedules instructions until the bundle gets ready.
17080std::optional<BoUpSLP::ScheduleData *>
17081BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17082 const InstructionsState &S) {
17083 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17084 // instructions.
17085 if (isa<PHINode>(S.getMainOp()) ||
17087 return nullptr;
17088
17089 // Initialize the instruction bundle.
17090 Instruction *OldScheduleEnd = ScheduleEnd;
17091 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17092
17093 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17094 ScheduleData *Bundle) {
17095 // The scheduling region got new instructions at the lower end (or it is a
17096 // new region for the first bundle). This makes it necessary to
17097 // recalculate all dependencies.
17098 // It is seldom that this needs to be done a second time after adding the
17099 // initial bundle to the region.
17100 if (ScheduleEnd != OldScheduleEnd) {
17101 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17102 if (ScheduleData *SD = getScheduleData(I))
17103 SD->clearDependencies();
17104 ReSchedule = true;
17105 }
17106 if (Bundle) {
17107 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17108 << " in block " << BB->getName() << "\n");
17109 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17110 }
17111
17112 if (ReSchedule) {
17113 resetSchedule();
17114 initialFillReadyList(ReadyInsts);
17115 }
17116
17117 // Now try to schedule the new bundle or (if no bundle) just calculate
17118 // dependencies. As soon as the bundle is "ready" it means that there are no
17119 // cyclic dependencies and we can schedule it. Note that's important that we
17120 // don't "schedule" the bundle yet (see cancelScheduling).
17121 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17122 !ReadyInsts.empty()) {
17123 ScheduleData *Picked = ReadyInsts.pop_back_val();
17124 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17125 "must be ready to schedule");
17126 schedule(Picked, ReadyInsts);
17127 }
17128 };
17129
17130 // Make sure that the scheduling region contains all
17131 // instructions of the bundle.
17132 for (Value *V : VL) {
17134 continue;
17135 if (!extendSchedulingRegion(V, S)) {
17136 // If the scheduling region got new instructions at the lower end (or it
17137 // is a new region for the first bundle). This makes it necessary to
17138 // recalculate all dependencies.
17139 // Otherwise the compiler may crash trying to incorrectly calculate
17140 // dependencies and emit instruction in the wrong order at the actual
17141 // scheduling.
17142 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17143 return std::nullopt;
17144 }
17145 }
17146
17147 bool ReSchedule = false;
17148 for (Value *V : VL) {
17150 continue;
17151 ScheduleData *BundleMember = getScheduleData(V);
17152 assert(BundleMember &&
17153 "no ScheduleData for bundle member (maybe not in same basic block)");
17154
17155 // Make sure we don't leave the pieces of the bundle in the ready list when
17156 // whole bundle might not be ready.
17157 ReadyInsts.remove(BundleMember);
17158
17159 if (!BundleMember->IsScheduled)
17160 continue;
17161 // A bundle member was scheduled as single instruction before and now
17162 // needs to be scheduled as part of the bundle. We just get rid of the
17163 // existing schedule.
17164 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17165 << " was already scheduled\n");
17166 ReSchedule = true;
17167 }
17168
17169 auto *Bundle = buildBundle(VL);
17170 TryScheduleBundleImpl(ReSchedule, Bundle);
17171 if (!Bundle->isReady()) {
17172 cancelScheduling(VL, S.getMainOp());
17173 return std::nullopt;
17174 }
17175 return Bundle;
17176}
17177
17178void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17179 Value *OpValue) {
17180 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17182 return;
17183
17184 if (doesNotNeedToBeScheduled(OpValue))
17185 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17186 ScheduleData *Bundle = getScheduleData(OpValue);
17187 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17188 assert(!Bundle->IsScheduled &&
17189 "Can't cancel bundle which is already scheduled");
17190 assert(Bundle->isSchedulingEntity() &&
17191 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17192 "tried to unbundle something which is not a bundle");
17193
17194 // Remove the bundle from the ready list.
17195 if (Bundle->isReady())
17196 ReadyInsts.remove(Bundle);
17197
17198 // Un-bundle: make single instructions out of the bundle.
17199 ScheduleData *BundleMember = Bundle;
17200 while (BundleMember) {
17201 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17202 BundleMember->FirstInBundle = BundleMember;
17203 ScheduleData *Next = BundleMember->NextInBundle;
17204 BundleMember->NextInBundle = nullptr;
17205 BundleMember->TE = nullptr;
17206 if (BundleMember->unscheduledDepsInBundle() == 0) {
17207 ReadyInsts.insert(BundleMember);
17208 }
17209 BundleMember = Next;
17210 }
17211}
17212
17213BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17214 // Allocate a new ScheduleData for the instruction.
17215 if (ChunkPos >= ChunkSize) {
17216 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17217 ChunkPos = 0;
17218 }
17219 return &(ScheduleDataChunks.back()[ChunkPos++]);
17220}
17221
17222bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17223 Value *V, const InstructionsState &S) {
17224 Instruction *I = dyn_cast<Instruction>(V);
17225 assert(I && "bundle member must be an instruction");
17226 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17228 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17229 "be scheduled");
17230 if (getScheduleData(I))
17231 return true;
17232 if (!ScheduleStart) {
17233 // It's the first instruction in the new region.
17234 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17235 ScheduleStart = I;
17236 ScheduleEnd = I->getNextNode();
17237 assert(ScheduleEnd && "tried to vectorize a terminator?");
17238 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17239 return true;
17240 }
17241 // Search up and down at the same time, because we don't know if the new
17242 // instruction is above or below the existing scheduling region.
17243 // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
17244 // against the budget. Otherwise debug info could affect codegen.
17246 ++ScheduleStart->getIterator().getReverse();
17247 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17248 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17249 BasicBlock::iterator LowerEnd = BB->end();
17250 auto IsAssumeLikeIntr = [](const Instruction &I) {
17251 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17252 return II->isAssumeLikeIntrinsic();
17253 return false;
17254 };
17255 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17256 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17257 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17258 &*DownIter != I) {
17259 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17260 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17261 return false;
17262 }
17263
17264 ++UpIter;
17265 ++DownIter;
17266
17267 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17268 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17269 }
17270 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17271 assert(I->getParent() == ScheduleStart->getParent() &&
17272 "Instruction is in wrong basic block.");
17273 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17274 ScheduleStart = I;
17275 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17276 << "\n");
17277 return true;
17278 }
17279 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17280 "Expected to reach top of the basic block or instruction down the "
17281 "lower end.");
17282 assert(I->getParent() == ScheduleEnd->getParent() &&
17283 "Instruction is in wrong basic block.");
17284 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17285 nullptr);
17286 ScheduleEnd = I->getNextNode();
17287 assert(ScheduleEnd && "tried to vectorize a terminator?");
17288 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17289 return true;
17290}
17291
17292void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17293 Instruction *ToI,
17294 ScheduleData *PrevLoadStore,
17295 ScheduleData *NextLoadStore) {
17296 ScheduleData *CurrentLoadStore = PrevLoadStore;
17297 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17298 // No need to allocate data for non-schedulable instructions.
17300 continue;
17301 ScheduleData *SD = ScheduleDataMap.lookup(I);
17302 if (!SD) {
17303 SD = allocateScheduleDataChunks();
17304 ScheduleDataMap[I] = SD;
17305 }
17306 assert(!isInSchedulingRegion(SD) &&
17307 "new ScheduleData already in scheduling region");
17308 SD->init(SchedulingRegionID, I);
17309
17310 if (I->mayReadOrWriteMemory() &&
17311 (!isa<IntrinsicInst>(I) ||
17312 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17313 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17314 Intrinsic::pseudoprobe))) {
17315 // Update the linked list of memory accessing instructions.
17316 if (CurrentLoadStore) {
17317 CurrentLoadStore->NextLoadStore = SD;
17318 } else {
17319 FirstLoadStoreInRegion = SD;
17320 }
17321 CurrentLoadStore = SD;
17322 }
17323
17324 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17325 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17326 RegionHasStackSave = true;
17327 }
17328 if (NextLoadStore) {
17329 if (CurrentLoadStore)
17330 CurrentLoadStore->NextLoadStore = NextLoadStore;
17331 } else {
17332 LastLoadStoreInRegion = CurrentLoadStore;
17333 }
17334}
17335
17336void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17337 bool InsertInReadyList,
17338 BoUpSLP *SLP) {
17339 assert(SD->isSchedulingEntity());
17340
17342 WorkList.push_back(SD);
17343
17344 while (!WorkList.empty()) {
17345 ScheduleData *SD = WorkList.pop_back_val();
17346 for (ScheduleData *BundleMember = SD; BundleMember;
17347 BundleMember = BundleMember->NextInBundle) {
17348 assert(isInSchedulingRegion(BundleMember));
17349 if (BundleMember->hasValidDependencies())
17350 continue;
17351
17352 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17353 << "\n");
17354 BundleMember->Dependencies = 0;
17355 BundleMember->resetUnscheduledDeps();
17356
17357 // Handle def-use chain dependencies.
17358 for (User *U : BundleMember->Inst->users()) {
17359 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17360 BundleMember->Dependencies++;
17361 ScheduleData *DestBundle = UseSD->FirstInBundle;
17362 if (!DestBundle->IsScheduled)
17363 BundleMember->incrementUnscheduledDeps(1);
17364 if (!DestBundle->hasValidDependencies())
17365 WorkList.push_back(DestBundle);
17366 }
17367 }
17368
17369 auto MakeControlDependent = [&](Instruction *I) {
17370 auto *DepDest = getScheduleData(I);
17371 assert(DepDest && "must be in schedule window");
17372 DepDest->ControlDependencies.push_back(BundleMember);
17373 BundleMember->Dependencies++;
17374 ScheduleData *DestBundle = DepDest->FirstInBundle;
17375 if (!DestBundle->IsScheduled)
17376 BundleMember->incrementUnscheduledDeps(1);
17377 if (!DestBundle->hasValidDependencies())
17378 WorkList.push_back(DestBundle);
17379 };
17380
17381 // Any instruction which isn't safe to speculate at the beginning of the
17382 // block is control dependend on any early exit or non-willreturn call
17383 // which proceeds it.
17384 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17385 for (Instruction *I = BundleMember->Inst->getNextNode();
17386 I != ScheduleEnd; I = I->getNextNode()) {
17387 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17388 continue;
17389
17390 // Add the dependency
17391 MakeControlDependent(I);
17392
17394 // Everything past here must be control dependent on I.
17395 break;
17396 }
17397 }
17398
17399 if (RegionHasStackSave) {
17400 // If we have an inalloc alloca instruction, it needs to be scheduled
17401 // after any preceeding stacksave. We also need to prevent any alloca
17402 // from reordering above a preceeding stackrestore.
17403 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17404 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17405 for (Instruction *I = BundleMember->Inst->getNextNode();
17406 I != ScheduleEnd; I = I->getNextNode()) {
17407 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17408 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17409 // Any allocas past here must be control dependent on I, and I
17410 // must be memory dependend on BundleMember->Inst.
17411 break;
17412
17413 if (!isa<AllocaInst>(I))
17414 continue;
17415
17416 // Add the dependency
17417 MakeControlDependent(I);
17418 }
17419 }
17420
17421 // In addition to the cases handle just above, we need to prevent
17422 // allocas and loads/stores from moving below a stacksave or a
17423 // stackrestore. Avoiding moving allocas below stackrestore is currently
17424 // thought to be conservatism. Moving loads/stores below a stackrestore
17425 // can lead to incorrect code.
17426 if (isa<AllocaInst>(BundleMember->Inst) ||
17427 BundleMember->Inst->mayReadOrWriteMemory()) {
17428 for (Instruction *I = BundleMember->Inst->getNextNode();
17429 I != ScheduleEnd; I = I->getNextNode()) {
17430 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17431 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17432 continue;
17433
17434 // Add the dependency
17435 MakeControlDependent(I);
17436 break;
17437 }
17438 }
17439 }
17440
17441 // Handle the memory dependencies (if any).
17442 ScheduleData *DepDest = BundleMember->NextLoadStore;
17443 if (!DepDest)
17444 continue;
17445 Instruction *SrcInst = BundleMember->Inst;
17446 assert(SrcInst->mayReadOrWriteMemory() &&
17447 "NextLoadStore list for non memory effecting bundle?");
17448 MemoryLocation SrcLoc = getLocation(SrcInst);
17449 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17450 unsigned NumAliased = 0;
17451 unsigned DistToSrc = 1;
17452
17453 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17454 assert(isInSchedulingRegion(DepDest));
17455
17456 // We have two limits to reduce the complexity:
17457 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17458 // SLP->isAliased (which is the expensive part in this loop).
17459 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17460 // the whole loop (even if the loop is fast, it's quadratic).
17461 // It's important for the loop break condition (see below) to
17462 // check this limit even between two read-only instructions.
17463 if (DistToSrc >= MaxMemDepDistance ||
17464 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17465 (NumAliased >= AliasedCheckLimit ||
17466 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17467
17468 // We increment the counter only if the locations are aliased
17469 // (instead of counting all alias checks). This gives a better
17470 // balance between reduced runtime and accurate dependencies.
17471 NumAliased++;
17472
17473 DepDest->MemoryDependencies.push_back(BundleMember);
17474 BundleMember->Dependencies++;
17475 ScheduleData *DestBundle = DepDest->FirstInBundle;
17476 if (!DestBundle->IsScheduled) {
17477 BundleMember->incrementUnscheduledDeps(1);
17478 }
17479 if (!DestBundle->hasValidDependencies()) {
17480 WorkList.push_back(DestBundle);
17481 }
17482 }
17483
17484 // Example, explaining the loop break condition: Let's assume our
17485 // starting instruction is i0 and MaxMemDepDistance = 3.
17486 //
17487 // +--------v--v--v
17488 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17489 // +--------^--^--^
17490 //
17491 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17492 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17493 // Previously we already added dependencies from i3 to i6,i7,i8
17494 // (because of MaxMemDepDistance). As we added a dependency from
17495 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17496 // and we can abort this loop at i6.
17497 if (DistToSrc >= 2 * MaxMemDepDistance)
17498 break;
17499 DistToSrc++;
17500 }
17501 }
17502 if (InsertInReadyList && SD->isReady()) {
17503 ReadyInsts.insert(SD);
17504 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17505 << "\n");
17506 }
17507 }
17508}
17509
17510void BoUpSLP::BlockScheduling::resetSchedule() {
17511 assert(ScheduleStart &&
17512 "tried to reset schedule on block which has not been scheduled");
17513 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17514 if (ScheduleData *SD = getScheduleData(I)) {
17515 assert(isInSchedulingRegion(SD) &&
17516 "ScheduleData not in scheduling region");
17517 SD->IsScheduled = false;
17518 SD->resetUnscheduledDeps();
17519 }
17520 }
17521 ReadyInsts.clear();
17522}
17523
17524void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17525 if (!BS->ScheduleStart)
17526 return;
17527
17528 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17529
17530 // A key point - if we got here, pre-scheduling was able to find a valid
17531 // scheduling of the sub-graph of the scheduling window which consists
17532 // of all vector bundles and their transitive users. As such, we do not
17533 // need to reschedule anything *outside of* that subgraph.
17534
17535 BS->resetSchedule();
17536
17537 // For the real scheduling we use a more sophisticated ready-list: it is
17538 // sorted by the original instruction location. This lets the final schedule
17539 // be as close as possible to the original instruction order.
17540 // WARNING: If changing this order causes a correctness issue, that means
17541 // there is some missing dependence edge in the schedule data graph.
17542 struct ScheduleDataCompare {
17543 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17544 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17545 }
17546 };
17547 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17548
17549 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17550 // and fill the ready-list with initial instructions.
17551 int Idx = 0;
17552 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17553 I = I->getNextNode()) {
17554 if (ScheduleData *SD = BS->getScheduleData(I)) {
17555 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17556 (void)SDTE;
17558 SD->isPartOfBundle() ==
17559 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17560 "scheduler and vectorizer bundle mismatch");
17561 SD->FirstInBundle->SchedulingPriority = Idx++;
17562
17563 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17564 BS->calculateDependencies(SD, false, this);
17565 }
17566 }
17567 BS->initialFillReadyList(ReadyInsts);
17568
17569 Instruction *LastScheduledInst = BS->ScheduleEnd;
17570
17571 // Do the "real" scheduling.
17572 while (!ReadyInsts.empty()) {
17573 ScheduleData *Picked = *ReadyInsts.begin();
17574 ReadyInsts.erase(ReadyInsts.begin());
17575
17576 // Move the scheduled instruction(s) to their dedicated places, if not
17577 // there yet.
17578 for (ScheduleData *BundleMember = Picked; BundleMember;
17579 BundleMember = BundleMember->NextInBundle) {
17580 Instruction *PickedInst = BundleMember->Inst;
17581 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17582 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17583 LastScheduledInst = PickedInst;
17584 }
17585
17586 BS->schedule(Picked, ReadyInsts);
17587 }
17588
17589 // Check that we didn't break any of our invariants.
17590#ifdef EXPENSIVE_CHECKS
17591 BS->verify();
17592#endif
17593
17594#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17595 // Check that all schedulable entities got scheduled
17596 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17597 ScheduleData *SD = BS->getScheduleData(I);
17598 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17599 assert(SD->IsScheduled && "must be scheduled at this point");
17600 }
17601#endif
17602
17603 // Avoid duplicate scheduling of the block.
17604 BS->ScheduleStart = nullptr;
17605}
17606
17608 // If V is a store, just return the width of the stored value (or value
17609 // truncated just before storing) without traversing the expression tree.
17610 // This is the common case.
17611 if (auto *Store = dyn_cast<StoreInst>(V))
17612 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17613
17614 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17615 return getVectorElementSize(IEI->getOperand(1));
17616
17617 auto E = InstrElementSize.find(V);
17618 if (E != InstrElementSize.end())
17619 return E->second;
17620
17621 // If V is not a store, we can traverse the expression tree to find loads
17622 // that feed it. The type of the loaded value may indicate a more suitable
17623 // width than V's type. We want to base the vector element size on the width
17624 // of memory operations where possible.
17627 if (auto *I = dyn_cast<Instruction>(V)) {
17628 Worklist.emplace_back(I, I->getParent(), 0);
17629 Visited.insert(I);
17630 }
17631
17632 // Traverse the expression tree in bottom-up order looking for loads. If we
17633 // encounter an instruction we don't yet handle, we give up.
17634 auto Width = 0u;
17635 Value *FirstNonBool = nullptr;
17636 while (!Worklist.empty()) {
17637 auto [I, Parent, Level] = Worklist.pop_back_val();
17638
17639 // We should only be looking at scalar instructions here. If the current
17640 // instruction has a vector type, skip.
17641 auto *Ty = I->getType();
17642 if (isa<VectorType>(Ty))
17643 continue;
17644 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17645 FirstNonBool = I;
17646 if (Level > RecursionMaxDepth)
17647 continue;
17648
17649 // If the current instruction is a load, update MaxWidth to reflect the
17650 // width of the loaded value.
17651 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17652 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17653
17654 // Otherwise, we need to visit the operands of the instruction. We only
17655 // handle the interesting cases from buildTree here. If an operand is an
17656 // instruction we haven't yet visited and from the same basic block as the
17657 // user or the use is a PHI node, we add it to the worklist.
17660 for (Use &U : I->operands()) {
17661 if (auto *J = dyn_cast<Instruction>(U.get()))
17662 if (Visited.insert(J).second &&
17663 (isa<PHINode>(I) || J->getParent() == Parent)) {
17664 Worklist.emplace_back(J, J->getParent(), Level + 1);
17665 continue;
17666 }
17667 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17668 FirstNonBool = U.get();
17669 }
17670 } else {
17671 break;
17672 }
17673 }
17674
17675 // If we didn't encounter a memory access in the expression tree, or if we
17676 // gave up for some reason, just return the width of V. Otherwise, return the
17677 // maximum width we found.
17678 if (!Width) {
17679 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17680 V = FirstNonBool;
17681 Width = DL->getTypeSizeInBits(V->getType());
17682 }
17683
17684 for (Instruction *I : Visited)
17685 InstrElementSize[I] = Width;
17686
17687 return Width;
17688}
17689
17690bool BoUpSLP::collectValuesToDemote(
17691 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17693 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17694 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17695 // We can always demote constants.
17696 if (all_of(E.Scalars, IsaPred<Constant>))
17697 return true;
17698
17699 unsigned OrigBitWidth =
17700 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17701 if (OrigBitWidth == BitWidth) {
17702 MaxDepthLevel = 1;
17703 return true;
17704 }
17705
17706 // Check if the node was analyzed already and must keep its original bitwidth.
17707 if (NodesToKeepBWs.contains(E.Idx))
17708 return false;
17709
17710 // If the value is not a vectorized instruction in the expression and not used
17711 // by the insertelement instruction and not used in multiple vector nodes, it
17712 // cannot be demoted.
17713 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17714 if (isa<PoisonValue>(R))
17715 return false;
17716 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17717 });
17718 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17719 if (isa<PoisonValue>(V))
17720 return true;
17721 if (MultiNodeScalars.contains(V))
17722 return false;
17723 // For lat shuffle of sext/zext with many uses need to check the extra bit
17724 // for unsigned values, otherwise may have incorrect casting for reused
17725 // scalars.
17726 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17727 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17728 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17729 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17730 return true;
17731 }
17732 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17733 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17734 if (IsSignedNode)
17735 ++BitWidth1;
17736 if (auto *I = dyn_cast<Instruction>(V)) {
17737 APInt Mask = DB->getDemandedBits(I);
17738 unsigned BitWidth2 =
17739 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17740 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17741 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17742 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17743 break;
17744 BitWidth2 *= 2;
17745 }
17746 BitWidth1 = std::min(BitWidth1, BitWidth2);
17747 }
17748 BitWidth = std::max(BitWidth, BitWidth1);
17749 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17750 };
17751 using namespace std::placeholders;
17752 auto FinalAnalysis = [&]() {
17753 if (!IsProfitableToDemote)
17754 return false;
17755 bool Res = all_of(
17756 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17757 // Demote gathers.
17758 if (Res && E.isGather()) {
17759 // Check possible extractelement instructions bases and final vector
17760 // length.
17761 SmallPtrSet<Value *, 4> UniqueBases;
17762 for (Value *V : E.Scalars) {
17763 auto *EE = dyn_cast<ExtractElementInst>(V);
17764 if (!EE)
17765 continue;
17766 UniqueBases.insert(EE->getVectorOperand());
17767 }
17768 const unsigned VF = E.Scalars.size();
17769 Type *OrigScalarTy = E.Scalars.front()->getType();
17770 if (UniqueBases.size() <= 2 ||
17771 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17773 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17774 ToDemote.push_back(E.Idx);
17775 }
17776 return Res;
17777 };
17778 if (E.isGather() || !Visited.insert(&E).second ||
17779 any_of(E.Scalars, [&](Value *V) {
17780 return all_of(V->users(), [&](User *U) {
17781 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17782 });
17783 }))
17784 return FinalAnalysis();
17785
17786 if (any_of(E.Scalars, [&](Value *V) {
17787 return !all_of(V->users(), [=](User *U) {
17788 return getTreeEntry(U) ||
17789 (E.Idx == 0 && UserIgnoreList &&
17790 UserIgnoreList->contains(U)) ||
17791 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17792 !U->getType()->isScalableTy() &&
17793 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17794 }) && !IsPotentiallyTruncated(V, BitWidth);
17795 }))
17796 return false;
17797
17798 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17799 bool &NeedToExit) {
17800 NeedToExit = false;
17801 unsigned InitLevel = MaxDepthLevel;
17802 for (const TreeEntry *Op : Operands) {
17803 unsigned Level = InitLevel;
17804 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17805 ToDemote, Visited, NodesToKeepBWs, Level,
17806 IsProfitableToDemote, IsTruncRoot)) {
17807 if (!IsProfitableToDemote)
17808 return false;
17809 NeedToExit = true;
17810 if (!FinalAnalysis())
17811 return false;
17812 continue;
17813 }
17814 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17815 }
17816 return true;
17817 };
17818 auto AttemptCheckBitwidth =
17819 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17820 // Try all bitwidth < OrigBitWidth.
17821 NeedToExit = false;
17822 unsigned BestFailBitwidth = 0;
17823 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17824 if (Checker(BitWidth, OrigBitWidth))
17825 return true;
17826 if (BestFailBitwidth == 0 && FinalAnalysis())
17827 BestFailBitwidth = BitWidth;
17828 }
17829 if (BitWidth >= OrigBitWidth) {
17830 if (BestFailBitwidth == 0) {
17831 BitWidth = OrigBitWidth;
17832 return false;
17833 }
17834 MaxDepthLevel = 1;
17835 BitWidth = BestFailBitwidth;
17836 NeedToExit = true;
17837 return true;
17838 }
17839 return false;
17840 };
17841 auto TryProcessInstruction =
17842 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17843 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17844 if (Operands.empty()) {
17845 if (!IsTruncRoot)
17846 MaxDepthLevel = 1;
17847 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17848 std::ref(BitWidth)));
17849 } else {
17850 // Several vectorized uses? Check if we can truncate it, otherwise -
17851 // exit.
17852 if (E.UserTreeIndices.size() > 1 &&
17853 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17854 std::ref(BitWidth))))
17855 return false;
17856 bool NeedToExit = false;
17857 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17858 return false;
17859 if (NeedToExit)
17860 return true;
17861 if (!ProcessOperands(Operands, NeedToExit))
17862 return false;
17863 if (NeedToExit)
17864 return true;
17865 }
17866
17867 ++MaxDepthLevel;
17868 // Record the entry that we can demote.
17869 ToDemote.push_back(E.Idx);
17870 return IsProfitableToDemote;
17871 };
17872 switch (E.getOpcode()) {
17873
17874 // We can always demote truncations and extensions. Since truncations can
17875 // seed additional demotion, we save the truncated value.
17876 case Instruction::Trunc:
17877 if (IsProfitableToDemoteRoot)
17878 IsProfitableToDemote = true;
17879 return TryProcessInstruction(BitWidth);
17880 case Instruction::ZExt:
17881 case Instruction::SExt:
17882 IsProfitableToDemote = true;
17883 return TryProcessInstruction(BitWidth);
17884
17885 // We can demote certain binary operations if we can demote both of their
17886 // operands.
17887 case Instruction::Add:
17888 case Instruction::Sub:
17889 case Instruction::Mul:
17890 case Instruction::And:
17891 case Instruction::Or:
17892 case Instruction::Xor: {
17893 return TryProcessInstruction(
17894 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17895 }
17896 case Instruction::Freeze:
17897 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17898 case Instruction::Shl: {
17899 // If we are truncating the result of this SHL, and if it's a shift of an
17900 // inrange amount, we can always perform a SHL in a smaller type.
17901 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17902 return all_of(E.Scalars, [&](Value *V) {
17903 if (isa<PoisonValue>(V))
17904 return true;
17905 auto *I = cast<Instruction>(V);
17906 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17907 return AmtKnownBits.getMaxValue().ult(BitWidth);
17908 });
17909 };
17910 return TryProcessInstruction(
17911 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17912 }
17913 case Instruction::LShr: {
17914 // If this is a truncate of a logical shr, we can truncate it to a smaller
17915 // lshr iff we know that the bits we would otherwise be shifting in are
17916 // already zeros.
17917 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17918 return all_of(E.Scalars, [&](Value *V) {
17919 if (isa<PoisonValue>(V))
17920 return true;
17921 auto *I = cast<Instruction>(V);
17922 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17923 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17924 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17925 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17926 SimplifyQuery(*DL));
17927 });
17928 };
17929 return TryProcessInstruction(
17930 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17931 LShrChecker);
17932 }
17933 case Instruction::AShr: {
17934 // If this is a truncate of an arithmetic shr, we can truncate it to a
17935 // smaller ashr iff we know that all the bits from the sign bit of the
17936 // original type and the sign bit of the truncate type are similar.
17937 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17938 return all_of(E.Scalars, [&](Value *V) {
17939 if (isa<PoisonValue>(V))
17940 return true;
17941 auto *I = cast<Instruction>(V);
17942 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17943 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17944 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17945 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17946 nullptr, DT);
17947 });
17948 };
17949 return TryProcessInstruction(
17950 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17951 AShrChecker);
17952 }
17953 case Instruction::UDiv:
17954 case Instruction::URem: {
17955 // UDiv and URem can be truncated if all the truncated bits are zero.
17956 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17957 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17958 return all_of(E.Scalars, [&](Value *V) {
17959 auto *I = cast<Instruction>(V);
17960 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17961 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
17962 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17963 });
17964 };
17965 return TryProcessInstruction(
17966 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
17967 }
17968
17969 // We can demote selects if we can demote their true and false values.
17970 case Instruction::Select: {
17971 return TryProcessInstruction(
17972 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
17973 }
17974
17975 // We can demote phis if we can demote all their incoming operands. Note that
17976 // we don't need to worry about cycles since we ensure single use above.
17977 case Instruction::PHI: {
17978 const unsigned NumOps = E.getNumOperands();
17980 transform(seq<unsigned>(0, NumOps), Ops.begin(),
17981 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
17982
17983 return TryProcessInstruction(BitWidth, Ops);
17984 }
17985
17986 case Instruction::Call: {
17987 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
17988 if (!IC)
17989 break;
17991 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
17992 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
17993 break;
17994 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
17995 function_ref<bool(unsigned, unsigned)> CallChecker;
17996 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17997 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17998 return all_of(E.Scalars, [&](Value *V) {
17999 auto *I = cast<Instruction>(V);
18000 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18001 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18002 return MaskedValueIsZero(I->getOperand(0), Mask,
18003 SimplifyQuery(*DL)) &&
18004 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18005 }
18006 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18007 "Expected min/max intrinsics only.");
18008 unsigned SignBits = OrigBitWidth - BitWidth;
18009 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18010 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18011 nullptr, DT);
18012 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18013 nullptr, DT);
18014 return SignBits <= Op0SignBits &&
18015 ((SignBits != Op0SignBits &&
18016 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18017 MaskedValueIsZero(I->getOperand(0), Mask,
18018 SimplifyQuery(*DL))) &&
18019 SignBits <= Op1SignBits &&
18020 ((SignBits != Op1SignBits &&
18021 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18022 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18023 });
18024 };
18025 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18026 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18027 return all_of(E.Scalars, [&](Value *V) {
18028 auto *I = cast<Instruction>(V);
18029 unsigned SignBits = OrigBitWidth - BitWidth;
18030 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18031 unsigned Op0SignBits =
18032 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18033 return SignBits <= Op0SignBits &&
18034 ((SignBits != Op0SignBits &&
18035 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18036 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18037 });
18038 };
18039 if (ID != Intrinsic::abs) {
18040 Operands.push_back(getOperandEntry(&E, 1));
18041 CallChecker = CompChecker;
18042 } else {
18043 CallChecker = AbsChecker;
18044 }
18045 InstructionCost BestCost =
18046 std::numeric_limits<InstructionCost::CostType>::max();
18047 unsigned BestBitWidth = BitWidth;
18048 unsigned VF = E.Scalars.size();
18049 // Choose the best bitwidth based on cost estimations.
18050 auto Checker = [&](unsigned BitWidth, unsigned) {
18051 unsigned MinBW = PowerOf2Ceil(BitWidth);
18052 SmallVector<Type *> ArgTys =
18053 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18054 auto VecCallCosts = getVectorCallCosts(
18055 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18056 TTI, TLI, ArgTys);
18057 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18058 if (Cost < BestCost) {
18059 BestCost = Cost;
18060 BestBitWidth = BitWidth;
18061 }
18062 return false;
18063 };
18064 [[maybe_unused]] bool NeedToExit;
18065 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18066 BitWidth = BestBitWidth;
18067 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18068 }
18069
18070 // Otherwise, conservatively give up.
18071 default:
18072 break;
18073 }
18074 MaxDepthLevel = 1;
18075 return FinalAnalysis();
18076}
18077
18078static RecurKind getRdxKind(Value *V);
18079
18081 // We only attempt to truncate integer expressions.
18082 bool IsStoreOrInsertElt =
18083 VectorizableTree.front()->getOpcode() == Instruction::Store ||
18084 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
18085 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18086 ExtraBitWidthNodes.size() <= 1 &&
18087 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18088 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18089 return;
18090
18091 unsigned NodeIdx = 0;
18092 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18093 NodeIdx = 1;
18094
18095 // Ensure the roots of the vectorizable tree don't form a cycle.
18096 if (VectorizableTree[NodeIdx]->isGather() ||
18097 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18098 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18099 [NodeIdx](const EdgeInfo &EI) {
18100 return EI.UserTE->Idx > NodeIdx;
18101 })))
18102 return;
18103
18104 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
18105 // resize to the final type.
18106 bool IsTruncRoot = false;
18107 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18108 SmallVector<unsigned> RootDemotes;
18109 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18110 if (NodeIdx != 0 &&
18111 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18112 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18113 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18114 IsTruncRoot = true;
18115 RootDemotes.push_back(NodeIdx);
18116 IsProfitableToDemoteRoot = true;
18117 ++NodeIdx;
18118 }
18119
18120 // Analyzed the reduction already and not profitable - exit.
18121 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18122 return;
18123
18124 SmallVector<unsigned> ToDemote;
18125 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
18126 bool IsProfitableToDemoteRoot, unsigned Opcode,
18127 unsigned Limit, bool IsTruncRoot,
18128 bool IsSignedCmp) -> unsigned {
18129 ToDemote.clear();
18130 // Check if the root is trunc and the next node is gather/buildvector, then
18131 // keep trunc in scalars, which is free in most cases.
18132 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18133 !NodesToKeepBWs.contains(E.Idx) &&
18134 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18135 all_of(E.Scalars, [&](Value *V) {
18136 return V->hasOneUse() || isa<Constant>(V) ||
18137 (!V->hasNUsesOrMore(UsesLimit) &&
18138 none_of(V->users(), [&](User *U) {
18139 const TreeEntry *TE = getTreeEntry(U);
18140 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18141 if (TE == UserTE || !TE)
18142 return false;
18143 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18144 SelectInst>(U) ||
18145 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18146 SelectInst>(UserTE->getMainOp()))
18147 return true;
18148 unsigned UserTESz = DL->getTypeSizeInBits(
18149 UserTE->Scalars.front()->getType());
18150 auto It = MinBWs.find(TE);
18151 if (It != MinBWs.end() && It->second.first > UserTESz)
18152 return true;
18153 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18154 }));
18155 })) {
18156 ToDemote.push_back(E.Idx);
18157 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18158 auto It = MinBWs.find(UserTE);
18159 if (It != MinBWs.end())
18160 return It->second.first;
18161 unsigned MaxBitWidth =
18162 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18163 MaxBitWidth = bit_ceil(MaxBitWidth);
18164 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18165 MaxBitWidth = 8;
18166 return MaxBitWidth;
18167 }
18168
18169 unsigned VF = E.getVectorFactor();
18170 Type *ScalarTy = E.Scalars.front()->getType();
18171 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18172 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18173 if (!TreeRootIT || !Opcode)
18174 return 0u;
18175
18176 if (any_of(E.Scalars,
18177 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18178 return 0u;
18179
18180 unsigned NumParts = TTI->getNumberOfParts(
18181 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18182
18183 // The maximum bit width required to represent all the values that can be
18184 // demoted without loss of precision. It would be safe to truncate the roots
18185 // of the expression to this width.
18186 unsigned MaxBitWidth = 1u;
18187
18188 // True if the roots can be zero-extended back to their original type,
18189 // rather than sign-extended. We know that if the leading bits are not
18190 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18191 // True.
18192 // Determine if the sign bit of all the roots is known to be zero. If not,
18193 // IsKnownPositive is set to False.
18194 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18195 if (isa<PoisonValue>(R))
18196 return true;
18197 KnownBits Known = computeKnownBits(R, *DL);
18198 return Known.isNonNegative();
18199 });
18200
18201 // We first check if all the bits of the roots are demanded. If they're not,
18202 // we can truncate the roots to this narrower type.
18203 for (Value *Root : E.Scalars) {
18204 if (isa<PoisonValue>(Root))
18205 continue;
18206 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18207 TypeSize NumTypeBits =
18208 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18209 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18210 // If we can't prove that the sign bit is zero, we must add one to the
18211 // maximum bit width to account for the unknown sign bit. This preserves
18212 // the existing sign bit so we can safely sign-extend the root back to the
18213 // original type. Otherwise, if we know the sign bit is zero, we will
18214 // zero-extend the root instead.
18215 //
18216 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18217 // one to the maximum bit width will yield a larger-than-necessary
18218 // type. In general, we need to add an extra bit only if we can't
18219 // prove that the upper bit of the original type is equal to the
18220 // upper bit of the proposed smaller type. If these two bits are
18221 // the same (either zero or one) we know that sign-extending from
18222 // the smaller type will result in the same value. Here, since we
18223 // can't yet prove this, we are just making the proposed smaller
18224 // type larger to ensure correctness.
18225 if (!IsKnownPositive)
18226 ++BitWidth1;
18227
18228 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18229 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18230 MaxBitWidth =
18231 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18232 }
18233
18234 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18235 MaxBitWidth = 8;
18236
18237 // If the original type is large, but reduced type does not improve the reg
18238 // use - ignore it.
18239 if (NumParts > 1 &&
18240 NumParts ==
18242 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18243 return 0u;
18244
18245 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18246 Opcode == Instruction::SExt ||
18247 Opcode == Instruction::ZExt || NumParts > 1;
18248 // Conservatively determine if we can actually truncate the roots of the
18249 // expression. Collect the values that can be demoted in ToDemote and
18250 // additional roots that require investigating in Roots.
18252 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18253 bool NeedToDemote = IsProfitableToDemote;
18254
18255 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18256 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18257 NeedToDemote, IsTruncRoot) ||
18258 (MaxDepthLevel <= Limit &&
18259 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18260 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18261 DL->getTypeSizeInBits(TreeRootIT) /
18262 DL->getTypeSizeInBits(
18263 E.getMainOp()->getOperand(0)->getType()) >
18264 2)))))
18265 return 0u;
18266 // Round MaxBitWidth up to the next power-of-two.
18267 MaxBitWidth = bit_ceil(MaxBitWidth);
18268
18269 return MaxBitWidth;
18270 };
18271
18272 // If we can truncate the root, we must collect additional values that might
18273 // be demoted as a result. That is, those seeded by truncations we will
18274 // modify.
18275 // Add reduction ops sizes, if any.
18276 if (UserIgnoreList &&
18277 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18278 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18279 // x i1> to in)).
18280 if (all_of(*UserIgnoreList,
18281 [](Value *V) {
18282 return isa<PoisonValue>(V) ||
18283 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18284 }) &&
18285 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18286 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18287 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18288 Builder.getInt1Ty()) {
18289 ReductionBitWidth = 1;
18290 } else {
18291 for (Value *V : *UserIgnoreList) {
18292 if (isa<PoisonValue>(V))
18293 continue;
18294 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18295 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18296 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18298 ++BitWidth1;
18299 unsigned BitWidth2 = BitWidth1;
18301 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18302 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18303 }
18304 ReductionBitWidth =
18305 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18306 }
18307 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18308 ReductionBitWidth = 8;
18309
18310 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18311 }
18312 }
18313 bool IsTopRoot = NodeIdx == 0;
18314 while (NodeIdx < VectorizableTree.size() &&
18315 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18316 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18317 RootDemotes.push_back(NodeIdx);
18318 ++NodeIdx;
18319 IsTruncRoot = true;
18320 }
18321 bool IsSignedCmp = false;
18322 while (NodeIdx < VectorizableTree.size()) {
18323 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18324 unsigned Limit = 2;
18325 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
18326 if (IsTopRoot &&
18327 ReductionBitWidth ==
18328 DL->getTypeSizeInBits(
18329 VectorizableTree.front()->Scalars.front()->getType()))
18330 Limit = 3;
18331 unsigned MaxBitWidth = ComputeMaxBitWidth(
18332 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
18333 Limit, IsTruncRoot, IsSignedCmp);
18334 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18335 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18336 ReductionBitWidth = bit_ceil(MaxBitWidth);
18337 else if (MaxBitWidth == 0)
18338 ReductionBitWidth = 0;
18339 }
18340
18341 for (unsigned Idx : RootDemotes) {
18342 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18343 uint32_t OrigBitWidth =
18344 DL->getTypeSizeInBits(V->getType()->getScalarType());
18345 if (OrigBitWidth > MaxBitWidth) {
18346 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18347 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18348 }
18349 return false;
18350 }))
18351 ToDemote.push_back(Idx);
18352 }
18353 RootDemotes.clear();
18354 IsTopRoot = false;
18355 IsProfitableToDemoteRoot = true;
18356
18357 if (ExtraBitWidthNodes.empty()) {
18358 NodeIdx = VectorizableTree.size();
18359 } else {
18360 unsigned NewIdx = 0;
18361 do {
18362 NewIdx = *ExtraBitWidthNodes.begin();
18363 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18364 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18365 NodeIdx = NewIdx;
18366 IsTruncRoot =
18367 NodeIdx < VectorizableTree.size() &&
18368 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18369 [](const EdgeInfo &EI) {
18370 return EI.EdgeIdx == 0 &&
18371 EI.UserTE->getOpcode() == Instruction::Trunc &&
18372 !EI.UserTE->isAltShuffle();
18373 });
18374 IsSignedCmp =
18375 NodeIdx < VectorizableTree.size() &&
18376 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18377 [&](const EdgeInfo &EI) {
18378 return EI.UserTE->getOpcode() == Instruction::ICmp &&
18379 any_of(EI.UserTE->Scalars, [&](Value *V) {
18380 auto *IC = dyn_cast<ICmpInst>(V);
18381 return IC &&
18382 (IC->isSigned() ||
18383 !isKnownNonNegative(IC->getOperand(0),
18384 SimplifyQuery(*DL)) ||
18385 !isKnownNonNegative(IC->getOperand(1),
18386 SimplifyQuery(*DL)));
18387 });
18388 });
18389 }
18390
18391 // If the maximum bit width we compute is less than the width of the roots'
18392 // type, we can proceed with the narrowing. Otherwise, do nothing.
18393 if (MaxBitWidth == 0 ||
18394 MaxBitWidth >=
18395 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18396 ->getBitWidth()) {
18397 if (UserIgnoreList)
18398 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18399 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18400 continue;
18401 }
18402
18403 // Finally, map the values we can demote to the maximum bit with we
18404 // computed.
18405 for (unsigned Idx : ToDemote) {
18406 TreeEntry *TE = VectorizableTree[Idx].get();
18407 if (MinBWs.contains(TE))
18408 continue;
18409 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18410 if (isa<PoisonValue>(R))
18411 return false;
18412 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18413 });
18414 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18415 }
18416 }
18417}
18418
18420 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18421 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18422 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18423 auto *AA = &AM.getResult<AAManager>(F);
18424 auto *LI = &AM.getResult<LoopAnalysis>(F);
18425 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18426 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18427 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18429
18430 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18431 if (!Changed)
18432 return PreservedAnalyses::all();
18433
18436 return PA;
18437}
18438
18440 TargetTransformInfo *TTI_,
18441 TargetLibraryInfo *TLI_, AAResults *AA_,
18442 LoopInfo *LI_, DominatorTree *DT_,
18443 AssumptionCache *AC_, DemandedBits *DB_,
18446 return false;
18447 SE = SE_;
18448 TTI = TTI_;
18449 TLI = TLI_;
18450 AA = AA_;
18451 LI = LI_;
18452 DT = DT_;
18453 AC = AC_;
18454 DB = DB_;
18455 DL = &F.getDataLayout();
18456
18457 Stores.clear();
18458 GEPs.clear();
18459 bool Changed = false;
18460
18461 // If the target claims to have no vector registers don't attempt
18462 // vectorization.
18464 LLVM_DEBUG(
18465 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18466 return false;
18467 }
18468
18469 // Don't vectorize when the attribute NoImplicitFloat is used.
18470 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18471 return false;
18472
18473 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18474
18475 // Use the bottom up slp vectorizer to construct chains that start with
18476 // store instructions.
18477 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18478
18479 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18480 // delete instructions.
18481
18482 // Update DFS numbers now so that we can use them for ordering.
18483 DT->updateDFSNumbers();
18484
18485 // Scan the blocks in the function in post order.
18486 for (auto *BB : post_order(&F.getEntryBlock())) {
18487 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18488 continue;
18489
18490 // Start new block - clear the list of reduction roots.
18491 R.clearReductionData();
18492 collectSeedInstructions(BB);
18493
18494 // Vectorize trees that end at stores.
18495 if (!Stores.empty()) {
18496 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18497 << " underlying objects.\n");
18498 Changed |= vectorizeStoreChains(R);
18499 }
18500
18501 // Vectorize trees that end at reductions.
18502 Changed |= vectorizeChainsInBlock(BB, R);
18503
18504 // Vectorize the index computations of getelementptr instructions. This
18505 // is primarily intended to catch gather-like idioms ending at
18506 // non-consecutive loads.
18507 if (!GEPs.empty()) {
18508 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18509 << " underlying objects.\n");
18510 Changed |= vectorizeGEPIndices(BB, R);
18511 }
18512 }
18513
18514 if (Changed) {
18515 R.optimizeGatherSequence();
18516 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18517 }
18518 return Changed;
18519}
18520
18521std::optional<bool>
18522SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18523 unsigned Idx, unsigned MinVF,
18524 unsigned &Size) {
18525 Size = 0;
18526 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18527 << "\n");
18528 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18529 unsigned VF = Chain.size();
18530
18531 if (!has_single_bit(Sz) ||
18533 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18534 VF) ||
18535 VF < 2 || VF < MinVF) {
18536 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18537 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18538 // all vector lanes are used.
18539 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18540 return false;
18541 }
18542
18543 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18544 << "\n");
18545
18546 SetVector<Value *> ValOps;
18547 for (Value *V : Chain)
18548 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18549 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18550 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18551 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18552 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18553 bool IsAllowedSize =
18554 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18555 ValOps.size()) ||
18556 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18557 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18558 (!S.getMainOp()->isSafeToRemove() ||
18559 any_of(ValOps.getArrayRef(),
18560 [&](Value *V) {
18561 return !isa<ExtractElementInst>(V) &&
18562 (V->getNumUses() > Chain.size() ||
18563 any_of(V->users(), [&](User *U) {
18564 return !Stores.contains(U);
18565 }));
18566 }))) ||
18567 (ValOps.size() > Chain.size() / 2 && !S)) {
18568 Size = (!IsAllowedSize && S) ? 1 : 2;
18569 return false;
18570 }
18571 }
18572 if (R.isLoadCombineCandidate(Chain))
18573 return true;
18574 R.buildTree(Chain);
18575 // Check if tree tiny and store itself or its value is not vectorized.
18576 if (R.isTreeTinyAndNotFullyVectorizable()) {
18577 if (R.isGathered(Chain.front()) ||
18578 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18579 return std::nullopt;
18580 Size = R.getCanonicalGraphSize();
18581 return false;
18582 }
18583 R.reorderTopToBottom();
18584 R.reorderBottomToTop();
18585 R.transformNodes();
18586 R.buildExternalUses();
18587
18588 R.computeMinimumValueSizes();
18589
18590 Size = R.getCanonicalGraphSize();
18591 if (S && S.getOpcode() == Instruction::Load)
18592 Size = 2; // cut off masked gather small trees
18593 InstructionCost Cost = R.getTreeCost();
18594
18595 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18596 if (Cost < -SLPCostThreshold) {
18597 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18598
18599 using namespace ore;
18600
18601 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18602 cast<StoreInst>(Chain[0]))
18603 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18604 << " and with tree size "
18605 << NV("TreeSize", R.getTreeSize()));
18606
18607 R.vectorizeTree();
18608 return true;
18609 }
18610
18611 return false;
18612}
18613
18614/// Checks if the quadratic mean deviation is less than 90% of the mean size.
18615static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18616 bool First) {
18617 unsigned Num = 0;
18618 uint64_t Sum = std::accumulate(
18619 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18620 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18621 unsigned Size = First ? Val.first : Val.second;
18622 if (Size == 1)
18623 return V;
18624 ++Num;
18625 return V + Size;
18626 });
18627 if (Num == 0)
18628 return true;
18629 uint64_t Mean = Sum / Num;
18630 if (Mean == 0)
18631 return true;
18632 uint64_t Dev = std::accumulate(
18633 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18634 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18635 unsigned P = First ? Val.first : Val.second;
18636 if (P == 1)
18637 return V;
18638 return V + (P - Mean) * (P - Mean);
18639 }) /
18640 Num;
18641 return Dev * 81 / (Mean * Mean) == 0;
18642}
18643
18644bool SLPVectorizerPass::vectorizeStores(
18645 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18646 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18647 &Visited) {
18648 // We may run into multiple chains that merge into a single chain. We mark the
18649 // stores that we vectorized so that we don't visit the same store twice.
18650 BoUpSLP::ValueSet VectorizedStores;
18651 bool Changed = false;
18652
18653 struct StoreDistCompare {
18654 bool operator()(const std::pair<unsigned, int> &Op1,
18655 const std::pair<unsigned, int> &Op2) const {
18656 return Op1.second < Op2.second;
18657 }
18658 };
18659 // A set of pairs (index of store in Stores array ref, Distance of the store
18660 // address relative to base store address in units).
18661 using StoreIndexToDistSet =
18662 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18663 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18664 int PrevDist = -1;
18666 // Collect the chain into a list.
18667 for (auto [Idx, Data] : enumerate(Set)) {
18668 if (Operands.empty() || Data.second - PrevDist == 1) {
18669 Operands.push_back(Stores[Data.first]);
18670 PrevDist = Data.second;
18671 if (Idx != Set.size() - 1)
18672 continue;
18673 }
18674 auto E = make_scope_exit([&, &DataVar = Data]() {
18675 Operands.clear();
18676 Operands.push_back(Stores[DataVar.first]);
18677 PrevDist = DataVar.second;
18678 });
18679
18680 if (Operands.size() <= 1 ||
18681 !Visited
18682 .insert({Operands.front(),
18683 cast<StoreInst>(Operands.front())->getValueOperand(),
18684 Operands.back(),
18685 cast<StoreInst>(Operands.back())->getValueOperand(),
18686 Operands.size()})
18687 .second)
18688 continue;
18689
18690 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18691 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18692 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18693
18694 unsigned MaxVF =
18695 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18696 auto *Store = cast<StoreInst>(Operands[0]);
18697 Type *StoreTy = Store->getValueOperand()->getType();
18698 Type *ValueTy = StoreTy;
18699 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18700 ValueTy = Trunc->getSrcTy();
18701 unsigned MinVF = std::max<unsigned>(
18703 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18704 ValueTy)));
18705
18706 if (MaxVF < MinVF) {
18707 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18708 << ") < "
18709 << "MinVF (" << MinVF << ")\n");
18710 continue;
18711 }
18712
18713 unsigned NonPowerOf2VF = 0;
18715 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18716 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18717 // lanes are used.
18718 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18719 if (has_single_bit(CandVF + 1)) {
18720 NonPowerOf2VF = CandVF;
18721 assert(NonPowerOf2VF != MaxVF &&
18722 "Non-power-of-2 VF should not be equal to MaxVF");
18723 }
18724 }
18725
18726 unsigned MaxRegVF = MaxVF;
18727 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18728 if (MaxVF < MinVF) {
18729 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18730 << ") < "
18731 << "MinVF (" << MinVF << ")\n");
18732 continue;
18733 }
18734
18735 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18736 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18737 unsigned Size = MinVF;
18738 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18739 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18740 Size *= 2;
18741 });
18742 unsigned End = Operands.size();
18743 unsigned Repeat = 0;
18744 constexpr unsigned MaxAttempts = 4;
18746 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18747 P.first = P.second = 1;
18748 });
18750 auto IsNotVectorized = [](bool First,
18751 const std::pair<unsigned, unsigned> &P) {
18752 return First ? P.first > 0 : P.second > 0;
18753 };
18754 auto IsVectorized = [](bool First,
18755 const std::pair<unsigned, unsigned> &P) {
18756 return First ? P.first == 0 : P.second == 0;
18757 };
18758 auto VFIsProfitable = [](bool First, unsigned Size,
18759 const std::pair<unsigned, unsigned> &P) {
18760 return First ? Size >= P.first : Size >= P.second;
18761 };
18762 auto FirstSizeSame = [](unsigned Size,
18763 const std::pair<unsigned, unsigned> &P) {
18764 return Size == P.first;
18765 };
18766 while (true) {
18767 ++Repeat;
18768 bool RepeatChanged = false;
18769 bool AnyProfitableGraph = false;
18770 for (unsigned Size : CandidateVFs) {
18771 AnyProfitableGraph = false;
18772 unsigned StartIdx = std::distance(
18773 RangeSizes.begin(),
18774 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18775 std::placeholders::_1)));
18776 while (StartIdx < End) {
18777 unsigned EndIdx =
18778 std::distance(RangeSizes.begin(),
18779 find_if(RangeSizes.drop_front(StartIdx),
18780 std::bind(IsVectorized, Size >= MaxRegVF,
18781 std::placeholders::_1)));
18782 unsigned Sz = EndIdx >= End ? End : EndIdx;
18783 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18784 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18785 Size >= MaxRegVF)) {
18786 ++Cnt;
18787 continue;
18788 }
18790 assert(all_of(Slice,
18791 [&](Value *V) {
18792 return cast<StoreInst>(V)
18793 ->getValueOperand()
18794 ->getType() ==
18795 cast<StoreInst>(Slice.front())
18796 ->getValueOperand()
18797 ->getType();
18798 }) &&
18799 "Expected all operands of same type.");
18800 if (!NonSchedulable.empty()) {
18801 auto [NonSchedSizeMax, NonSchedSizeMin] =
18802 NonSchedulable.lookup(Slice.front());
18803 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18804 Cnt += NonSchedSizeMax;
18805 continue;
18806 }
18807 }
18808 unsigned TreeSize;
18809 std::optional<bool> Res =
18810 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18811 if (!Res) {
18812 NonSchedulable
18813 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18814 .first->getSecond()
18815 .second = Size;
18816 } else if (*Res) {
18817 // Mark the vectorized stores so that we don't vectorize them
18818 // again.
18819 VectorizedStores.insert(Slice.begin(), Slice.end());
18820 // Mark the vectorized stores so that we don't vectorize them
18821 // again.
18822 AnyProfitableGraph = RepeatChanged = Changed = true;
18823 // If we vectorized initial block, no need to try to vectorize
18824 // it again.
18825 for_each(RangeSizes.slice(Cnt, Size),
18826 [](std::pair<unsigned, unsigned> &P) {
18827 P.first = P.second = 0;
18828 });
18829 if (Cnt < StartIdx + MinVF) {
18830 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18831 [](std::pair<unsigned, unsigned> &P) {
18832 P.first = P.second = 0;
18833 });
18834 StartIdx = Cnt + Size;
18835 }
18836 if (Cnt > Sz - Size - MinVF) {
18837 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18838 [](std::pair<unsigned, unsigned> &P) {
18839 P.first = P.second = 0;
18840 });
18841 if (Sz == End)
18842 End = Cnt;
18843 Sz = Cnt;
18844 }
18845 Cnt += Size;
18846 continue;
18847 }
18848 if (Size > 2 && Res &&
18849 !all_of(RangeSizes.slice(Cnt, Size),
18850 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18851 std::placeholders::_1))) {
18852 Cnt += Size;
18853 continue;
18854 }
18855 // Check for the very big VFs that we're not rebuilding same
18856 // trees, just with larger number of elements.
18857 if (Size > MaxRegVF && TreeSize > 1 &&
18858 all_of(RangeSizes.slice(Cnt, Size),
18859 std::bind(FirstSizeSame, TreeSize,
18860 std::placeholders::_1))) {
18861 Cnt += Size;
18862 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18863 ++Cnt;
18864 continue;
18865 }
18866 if (TreeSize > 1)
18867 for_each(RangeSizes.slice(Cnt, Size),
18868 [&](std::pair<unsigned, unsigned> &P) {
18869 if (Size >= MaxRegVF)
18870 P.second = std::max(P.second, TreeSize);
18871 else
18872 P.first = std::max(P.first, TreeSize);
18873 });
18874 ++Cnt;
18875 AnyProfitableGraph = true;
18876 }
18877 if (StartIdx >= End)
18878 break;
18879 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18880 AnyProfitableGraph = true;
18881 StartIdx = std::distance(
18882 RangeSizes.begin(),
18883 find_if(RangeSizes.drop_front(Sz),
18884 std::bind(IsNotVectorized, Size >= MaxRegVF,
18885 std::placeholders::_1)));
18886 }
18887 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18888 break;
18889 }
18890 // All values vectorized - exit.
18891 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18892 return P.first == 0 && P.second == 0;
18893 }))
18894 break;
18895 // Check if tried all attempts or no need for the last attempts at all.
18896 if (Repeat >= MaxAttempts ||
18897 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18898 break;
18899 constexpr unsigned StoresLimit = 64;
18900 const unsigned MaxTotalNum = std::min<unsigned>(
18901 Operands.size(),
18902 static_cast<unsigned>(
18903 End -
18904 std::distance(
18905 RangeSizes.begin(),
18906 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18907 std::placeholders::_1))) +
18908 1));
18909 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18910 unsigned Limit =
18911 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18912 CandidateVFs.clear();
18913 if (bit_floor(Limit) == VF)
18914 CandidateVFs.push_back(Limit);
18915 if (VF > MaxTotalNum || VF >= StoresLimit)
18916 break;
18917 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18918 if (P.first != 0)
18919 P.first = std::max(P.second, P.first);
18920 });
18921 // Last attempt to vectorize max number of elements, if all previous
18922 // attempts were unsuccessful because of the cost issues.
18923 CandidateVFs.push_back(VF);
18924 }
18925 }
18926 };
18927
18928 // Stores pair (first: index of the store into Stores array ref, address of
18929 // which taken as base, second: sorted set of pairs {index, dist}, which are
18930 // indices of stores in the set and their store location distances relative to
18931 // the base address).
18932
18933 // Need to store the index of the very first store separately, since the set
18934 // may be reordered after the insertion and the first store may be moved. This
18935 // container allows to reduce number of calls of getPointersDiff() function.
18937 // Inserts the specified store SI with the given index Idx to the set of the
18938 // stores. If the store with the same distance is found already - stop
18939 // insertion, try to vectorize already found stores. If some stores from this
18940 // sequence were not vectorized - try to vectorize them with the new store
18941 // later. But this logic is applied only to the stores, that come before the
18942 // previous store with the same distance.
18943 // Example:
18944 // 1. store x, %p
18945 // 2. store y, %p+1
18946 // 3. store z, %p+2
18947 // 4. store a, %p
18948 // 5. store b, %p+3
18949 // - Scan this from the last to first store. The very first bunch of stores is
18950 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
18951 // vector).
18952 // - The next store in the list - #1 - has the same distance from store #5 as
18953 // the store #4.
18954 // - Try to vectorize sequence of stores 4,2,3,5.
18955 // - If all these stores are vectorized - just drop them.
18956 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
18957 // - Start new stores sequence.
18958 // The new bunch of stores is {1, {1, 0}}.
18959 // - Add the stores from previous sequence, that were not vectorized.
18960 // Here we consider the stores in the reversed order, rather they are used in
18961 // the IR (Stores are reversed already, see vectorizeStoreChains() function).
18962 // Store #3 can be added -> comes after store #4 with the same distance as
18963 // store #1.
18964 // Store #5 cannot be added - comes before store #4.
18965 // This logic allows to improve the compile time, we assume that the stores
18966 // after previous store with the same distance most likely have memory
18967 // dependencies and no need to waste compile time to try to vectorize them.
18968 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
18969 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
18970 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
18971 std::optional<int> Diff = getPointersDiff(
18972 Stores[Set.first]->getValueOperand()->getType(),
18973 Stores[Set.first]->getPointerOperand(),
18974 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
18975 /*StrictCheck=*/true);
18976 if (!Diff)
18977 continue;
18978 auto It = Set.second.find(std::make_pair(Idx, *Diff));
18979 if (It == Set.second.end()) {
18980 Set.second.emplace(Idx, *Diff);
18981 return;
18982 }
18983 // Try to vectorize the first found set to avoid duplicate analysis.
18984 TryToVectorize(Set.second);
18985 unsigned ItIdx = It->first;
18986 int ItDist = It->second;
18987 StoreIndexToDistSet PrevSet;
18988 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
18989 [&](const std::pair<unsigned, int> &Pair) {
18990 return Pair.first > ItIdx;
18991 });
18992 Set.second.clear();
18993 Set.first = Idx;
18994 Set.second.emplace(Idx, 0);
18995 // Insert stores that followed previous match to try to vectorize them
18996 // with this store.
18997 unsigned StartIdx = ItIdx + 1;
18998 SmallBitVector UsedStores(Idx - StartIdx);
18999 // Distances to previously found dup store (or this store, since they
19000 // store to the same addresses).
19001 SmallVector<int> Dists(Idx - StartIdx, 0);
19002 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19003 // Do not try to vectorize sequences, we already tried.
19004 if (VectorizedStores.contains(Stores[Pair.first]))
19005 break;
19006 unsigned BI = Pair.first - StartIdx;
19007 UsedStores.set(BI);
19008 Dists[BI] = Pair.second - ItDist;
19009 }
19010 for (unsigned I = StartIdx; I < Idx; ++I) {
19011 unsigned BI = I - StartIdx;
19012 if (UsedStores.test(BI))
19013 Set.second.emplace(I, Dists[BI]);
19014 }
19015 return;
19016 }
19017 auto &Res = SortedStores.emplace_back();
19018 Res.first = Idx;
19019 Res.second.emplace(Idx, 0);
19020 };
19021 Type *PrevValTy = nullptr;
19022 for (auto [I, SI] : enumerate(Stores)) {
19023 if (R.isDeleted(SI))
19024 continue;
19025 if (!PrevValTy)
19026 PrevValTy = SI->getValueOperand()->getType();
19027 // Check that we do not try to vectorize stores of different types.
19028 if (PrevValTy != SI->getValueOperand()->getType()) {
19029 for (auto &Set : SortedStores)
19030 TryToVectorize(Set.second);
19031 SortedStores.clear();
19032 PrevValTy = SI->getValueOperand()->getType();
19033 }
19034 FillStoresSet(I, SI);
19035 }
19036
19037 // Final vectorization attempt.
19038 for (auto &Set : SortedStores)
19039 TryToVectorize(Set.second);
19040
19041 return Changed;
19042}
19043
19044void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19045 // Initialize the collections. We will make a single pass over the block.
19046 Stores.clear();
19047 GEPs.clear();
19048
19049 // Visit the store and getelementptr instructions in BB and organize them in
19050 // Stores and GEPs according to the underlying objects of their pointer
19051 // operands.
19052 for (Instruction &I : *BB) {
19053 // Ignore store instructions that are volatile or have a pointer operand
19054 // that doesn't point to a scalar type.
19055 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19056 if (!SI->isSimple())
19057 continue;
19058 if (!isValidElementType(SI->getValueOperand()->getType()))
19059 continue;
19060 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19061 }
19062
19063 // Ignore getelementptr instructions that have more than one index, a
19064 // constant index, or a pointer operand that doesn't point to a scalar
19065 // type.
19066 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19067 if (GEP->getNumIndices() != 1)
19068 continue;
19069 Value *Idx = GEP->idx_begin()->get();
19070 if (isa<Constant>(Idx))
19071 continue;
19072 if (!isValidElementType(Idx->getType()))
19073 continue;
19074 if (GEP->getType()->isVectorTy())
19075 continue;
19076 GEPs[GEP->getPointerOperand()].push_back(GEP);
19077 }
19078 }
19079}
19080
19081bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19082 bool MaxVFOnly) {
19083 if (VL.size() < 2)
19084 return false;
19085
19086 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19087 << VL.size() << ".\n");
19088
19089 // Check that all of the parts are instructions of the same type,
19090 // we permit an alternate opcode via InstructionsState.
19091 InstructionsState S = getSameOpcode(VL, *TLI);
19092 if (!S)
19093 return false;
19094
19095 Instruction *I0 = S.getMainOp();
19096 // Make sure invalid types (including vector type) are rejected before
19097 // determining vectorization factor for scalar instructions.
19098 for (Value *V : VL) {
19099 Type *Ty = V->getType();
19100 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19101 // NOTE: the following will give user internal llvm type name, which may
19102 // not be useful.
19103 R.getORE()->emit([&]() {
19104 std::string TypeStr;
19105 llvm::raw_string_ostream rso(TypeStr);
19106 Ty->print(rso);
19107 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19108 << "Cannot SLP vectorize list: type "
19109 << TypeStr + " is unsupported by vectorizer";
19110 });
19111 return false;
19112 }
19113 }
19114
19115 unsigned Sz = R.getVectorElementSize(I0);
19116 unsigned MinVF = R.getMinVF(Sz);
19117 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19118 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19119 if (MaxVF < 2) {
19120 R.getORE()->emit([&]() {
19121 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19122 << "Cannot SLP vectorize list: vectorization factor "
19123 << "less than 2 is not supported";
19124 });
19125 return false;
19126 }
19127
19128 bool Changed = false;
19129 bool CandidateFound = false;
19130 InstructionCost MinCost = SLPCostThreshold.getValue();
19131 Type *ScalarTy = getValueType(VL[0]);
19132
19133 unsigned NextInst = 0, MaxInst = VL.size();
19134 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19135 // No actual vectorization should happen, if number of parts is the same as
19136 // provided vectorization factor (i.e. the scalar type is used for vector
19137 // code during codegen).
19138 auto *VecTy = getWidenedType(ScalarTy, VF);
19139 if (TTI->getNumberOfParts(VecTy) == VF)
19140 continue;
19141 for (unsigned I = NextInst; I < MaxInst; ++I) {
19142 unsigned ActualVF = std::min(MaxInst - I, VF);
19143
19144 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19145 continue;
19146
19147 if (MaxVFOnly && ActualVF < MaxVF)
19148 break;
19149 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
19150 break;
19151
19152 SmallVector<Value *> Ops(ActualVF, nullptr);
19153 unsigned Idx = 0;
19154 for (Value *V : VL.drop_front(I)) {
19155 // Check that a previous iteration of this loop did not delete the
19156 // Value.
19157 if (auto *Inst = dyn_cast<Instruction>(V);
19158 !Inst || !R.isDeleted(Inst)) {
19159 Ops[Idx] = V;
19160 ++Idx;
19161 if (Idx == ActualVF)
19162 break;
19163 }
19164 }
19165 // Not enough vectorizable instructions - exit.
19166 if (Idx != ActualVF)
19167 break;
19168
19169 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19170 << "\n");
19171
19172 R.buildTree(Ops);
19173 if (R.isTreeTinyAndNotFullyVectorizable())
19174 continue;
19175 R.reorderTopToBottom();
19176 R.reorderBottomToTop(
19177 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19178 !R.doesRootHaveInTreeUses());
19179 R.transformNodes();
19180 R.buildExternalUses();
19181
19182 R.computeMinimumValueSizes();
19183 InstructionCost Cost = R.getTreeCost();
19184 CandidateFound = true;
19185 MinCost = std::min(MinCost, Cost);
19186
19187 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19188 << " for VF=" << ActualVF << "\n");
19189 if (Cost < -SLPCostThreshold) {
19190 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19191 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19192 cast<Instruction>(Ops[0]))
19193 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19194 << " and with tree size "
19195 << ore::NV("TreeSize", R.getTreeSize()));
19196
19197 R.vectorizeTree();
19198 // Move to the next bundle.
19199 I += VF - 1;
19200 NextInst = I + 1;
19201 Changed = true;
19202 }
19203 }
19204 }
19205
19206 if (!Changed && CandidateFound) {
19207 R.getORE()->emit([&]() {
19208 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19209 << "List vectorization was possible but not beneficial with cost "
19210 << ore::NV("Cost", MinCost) << " >= "
19211 << ore::NV("Treshold", -SLPCostThreshold);
19212 });
19213 } else if (!Changed) {
19214 R.getORE()->emit([&]() {
19215 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19216 << "Cannot SLP vectorize list: vectorization was impossible"
19217 << " with available vectorization factors";
19218 });
19219 }
19220 return Changed;
19221}
19222
19223bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19224 if (!I)
19225 return false;
19226
19227 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19228 return false;
19229
19230 Value *P = I->getParent();
19231
19232 // Vectorize in current basic block only.
19233 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19234 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19235 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19236 R.isDeleted(Op0) || R.isDeleted(Op1))
19237 return false;
19238
19239 // First collect all possible candidates
19241 Candidates.emplace_back(Op0, Op1);
19242
19243 auto *A = dyn_cast<BinaryOperator>(Op0);
19244 auto *B = dyn_cast<BinaryOperator>(Op1);
19245 // Try to skip B.
19246 if (A && B && B->hasOneUse()) {
19247 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19248 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19249 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19250 Candidates.emplace_back(A, B0);
19251 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19252 Candidates.emplace_back(A, B1);
19253 }
19254 // Try to skip A.
19255 if (B && A && A->hasOneUse()) {
19256 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19257 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19258 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19259 Candidates.emplace_back(A0, B);
19260 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19261 Candidates.emplace_back(A1, B);
19262 }
19263
19264 if (Candidates.size() == 1)
19265 return tryToVectorizeList({Op0, Op1}, R);
19266
19267 // We have multiple options. Try to pick the single best.
19268 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19269 if (!BestCandidate)
19270 return false;
19271 return tryToVectorizeList(
19272 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19273}
19274
19275namespace {
19276
19277/// Model horizontal reductions.
19278///
19279/// A horizontal reduction is a tree of reduction instructions that has values
19280/// that can be put into a vector as its leaves. For example:
19281///
19282/// mul mul mul mul
19283/// \ / \ /
19284/// + +
19285/// \ /
19286/// +
19287/// This tree has "mul" as its leaf values and "+" as its reduction
19288/// instructions. A reduction can feed into a store or a binary operation
19289/// feeding a phi.
19290/// ...
19291/// \ /
19292/// +
19293/// |
19294/// phi +=
19295///
19296/// Or:
19297/// ...
19298/// \ /
19299/// +
19300/// |
19301/// *p =
19302///
19303class HorizontalReduction {
19304 using ReductionOpsType = SmallVector<Value *, 16>;
19305 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19306 ReductionOpsListType ReductionOps;
19307 /// List of possibly reduced values.
19309 /// Maps reduced value to the corresponding reduction operation.
19311 WeakTrackingVH ReductionRoot;
19312 /// The type of reduction operation.
19313 RecurKind RdxKind;
19314 /// Checks if the optimization of original scalar identity operations on
19315 /// matched horizontal reductions is enabled and allowed.
19316 bool IsSupportedHorRdxIdentityOp = false;
19317
19318 static bool isCmpSelMinMax(Instruction *I) {
19319 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19321 }
19322
19323 // And/or are potentially poison-safe logical patterns like:
19324 // select x, y, false
19325 // select x, true, y
19326 static bool isBoolLogicOp(Instruction *I) {
19327 return isa<SelectInst>(I) &&
19328 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19329 }
19330
19331 /// Checks if instruction is associative and can be vectorized.
19332 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19333 if (Kind == RecurKind::None)
19334 return false;
19335
19336 // Integer ops that map to select instructions or intrinsics are fine.
19338 isBoolLogicOp(I))
19339 return true;
19340
19341 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19342 // FP min/max are associative except for NaN and -0.0. We do not
19343 // have to rule out -0.0 here because the intrinsic semantics do not
19344 // specify a fixed result for it.
19345 return I->getFastMathFlags().noNaNs();
19346 }
19347
19348 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19349 return true;
19350
19351 return I->isAssociative();
19352 }
19353
19354 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19355 // Poison-safe 'or' takes the form: select X, true, Y
19356 // To make that work with the normal operand processing, we skip the
19357 // true value operand.
19358 // TODO: Change the code and data structures to handle this without a hack.
19359 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19360 return I->getOperand(2);
19361 return I->getOperand(Index);
19362 }
19363
19364 /// Creates reduction operation with the current opcode.
19365 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19366 Value *RHS, const Twine &Name, bool UseSelect) {
19367 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19368 switch (Kind) {
19369 case RecurKind::Or:
19370 if (UseSelect &&
19372 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19373 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19374 Name);
19375 case RecurKind::And:
19376 if (UseSelect &&
19378 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19379 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19380 Name);
19381 case RecurKind::Add:
19382 case RecurKind::Mul:
19383 case RecurKind::Xor:
19384 case RecurKind::FAdd:
19385 case RecurKind::FMul:
19386 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19387 Name);
19388 case RecurKind::FMax:
19389 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
19390 case RecurKind::FMin:
19391 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
19392 case RecurKind::FMaximum:
19393 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
19394 case RecurKind::FMinimum:
19395 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
19396 case RecurKind::SMax:
19397 if (UseSelect) {
19398 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
19399 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19400 }
19401 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
19402 case RecurKind::SMin:
19403 if (UseSelect) {
19404 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
19405 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19406 }
19407 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
19408 case RecurKind::UMax:
19409 if (UseSelect) {
19410 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
19411 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19412 }
19413 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
19414 case RecurKind::UMin:
19415 if (UseSelect) {
19416 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
19417 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19418 }
19419 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
19420 default:
19421 llvm_unreachable("Unknown reduction operation.");
19422 }
19423 }
19424
19425 /// Creates reduction operation with the current opcode with the IR flags
19426 /// from \p ReductionOps, dropping nuw/nsw flags.
19427 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19428 Value *RHS, const Twine &Name,
19429 const ReductionOpsListType &ReductionOps) {
19430 bool UseSelect = ReductionOps.size() == 2 ||
19431 // Logical or/and.
19432 (ReductionOps.size() == 1 &&
19433 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19434 assert((!UseSelect || ReductionOps.size() != 2 ||
19435 isa<SelectInst>(ReductionOps[1][0])) &&
19436 "Expected cmp + select pairs for reduction");
19437 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19439 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19440 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19441 /*IncludeWrapFlags=*/false);
19442 propagateIRFlags(Op, ReductionOps[1], nullptr,
19443 /*IncludeWrapFlags=*/false);
19444 return Op;
19445 }
19446 }
19447 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19448 return Op;
19449 }
19450
19451public:
19452 static RecurKind getRdxKind(Value *V) {
19453 auto *I = dyn_cast<Instruction>(V);
19454 if (!I)
19455 return RecurKind::None;
19456 if (match(I, m_Add(m_Value(), m_Value())))
19457 return RecurKind::Add;
19458 if (match(I, m_Mul(m_Value(), m_Value())))
19459 return RecurKind::Mul;
19460 if (match(I, m_And(m_Value(), m_Value())) ||
19462 return RecurKind::And;
19463 if (match(I, m_Or(m_Value(), m_Value())) ||
19465 return RecurKind::Or;
19466 if (match(I, m_Xor(m_Value(), m_Value())))
19467 return RecurKind::Xor;
19468 if (match(I, m_FAdd(m_Value(), m_Value())))
19469 return RecurKind::FAdd;
19470 if (match(I, m_FMul(m_Value(), m_Value())))
19471 return RecurKind::FMul;
19472
19473 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19474 return RecurKind::FMax;
19475 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19476 return RecurKind::FMin;
19477
19478 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19479 return RecurKind::FMaximum;
19480 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19481 return RecurKind::FMinimum;
19482 // This matches either cmp+select or intrinsics. SLP is expected to handle
19483 // either form.
19484 // TODO: If we are canonicalizing to intrinsics, we can remove several
19485 // special-case paths that deal with selects.
19486 if (match(I, m_SMax(m_Value(), m_Value())))
19487 return RecurKind::SMax;
19488 if (match(I, m_SMin(m_Value(), m_Value())))
19489 return RecurKind::SMin;
19490 if (match(I, m_UMax(m_Value(), m_Value())))
19491 return RecurKind::UMax;
19492 if (match(I, m_UMin(m_Value(), m_Value())))
19493 return RecurKind::UMin;
19494
19495 if (auto *Select = dyn_cast<SelectInst>(I)) {
19496 // Try harder: look for min/max pattern based on instructions producing
19497 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19498 // During the intermediate stages of SLP, it's very common to have
19499 // pattern like this (since optimizeGatherSequence is run only once
19500 // at the end):
19501 // %1 = extractelement <2 x i32> %a, i32 0
19502 // %2 = extractelement <2 x i32> %a, i32 1
19503 // %cond = icmp sgt i32 %1, %2
19504 // %3 = extractelement <2 x i32> %a, i32 0
19505 // %4 = extractelement <2 x i32> %a, i32 1
19506 // %select = select i1 %cond, i32 %3, i32 %4
19507 CmpPredicate Pred;
19508 Instruction *L1;
19509 Instruction *L2;
19510
19511 Value *LHS = Select->getTrueValue();
19512 Value *RHS = Select->getFalseValue();
19513 Value *Cond = Select->getCondition();
19514
19515 // TODO: Support inverse predicates.
19516 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19517 if (!isa<ExtractElementInst>(RHS) ||
19518 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19519 return RecurKind::None;
19520 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19521 if (!isa<ExtractElementInst>(LHS) ||
19522 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19523 return RecurKind::None;
19524 } else {
19525 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19526 return RecurKind::None;
19527 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19528 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19529 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19530 return RecurKind::None;
19531 }
19532
19533 switch (Pred) {
19534 default:
19535 return RecurKind::None;
19536 case CmpInst::ICMP_SGT:
19537 case CmpInst::ICMP_SGE:
19538 return RecurKind::SMax;
19539 case CmpInst::ICMP_SLT:
19540 case CmpInst::ICMP_SLE:
19541 return RecurKind::SMin;
19542 case CmpInst::ICMP_UGT:
19543 case CmpInst::ICMP_UGE:
19544 return RecurKind::UMax;
19545 case CmpInst::ICMP_ULT:
19546 case CmpInst::ICMP_ULE:
19547 return RecurKind::UMin;
19548 }
19549 }
19550 return RecurKind::None;
19551 }
19552
19553 /// Get the index of the first operand.
19554 static unsigned getFirstOperandIndex(Instruction *I) {
19555 return isCmpSelMinMax(I) ? 1 : 0;
19556 }
19557
19558private:
19559 /// Total number of operands in the reduction operation.
19560 static unsigned getNumberOfOperands(Instruction *I) {
19561 return isCmpSelMinMax(I) ? 3 : 2;
19562 }
19563
19564 /// Checks if the instruction is in basic block \p BB.
19565 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19566 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19567 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19568 auto *Sel = cast<SelectInst>(I);
19569 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19570 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19571 }
19572 return I->getParent() == BB;
19573 }
19574
19575 /// Expected number of uses for reduction operations/reduced values.
19576 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19577 if (IsCmpSelMinMax) {
19578 // SelectInst must be used twice while the condition op must have single
19579 // use only.
19580 if (auto *Sel = dyn_cast<SelectInst>(I))
19581 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19582 return I->hasNUses(2);
19583 }
19584
19585 // Arithmetic reduction operation must be used once only.
19586 return I->hasOneUse();
19587 }
19588
19589 /// Initializes the list of reduction operations.
19590 void initReductionOps(Instruction *I) {
19591 if (isCmpSelMinMax(I))
19592 ReductionOps.assign(2, ReductionOpsType());
19593 else
19594 ReductionOps.assign(1, ReductionOpsType());
19595 }
19596
19597 /// Add all reduction operations for the reduction instruction \p I.
19598 void addReductionOps(Instruction *I) {
19599 if (isCmpSelMinMax(I)) {
19600 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19601 ReductionOps[1].emplace_back(I);
19602 } else {
19603 ReductionOps[0].emplace_back(I);
19604 }
19605 }
19606
19607 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19608 int Sz = Data.size();
19609 auto *I = dyn_cast<Instruction>(Data.front());
19610 return Sz > 1 || isConstant(Data.front()) ||
19611 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19612 }
19613
19614public:
19615 HorizontalReduction() = default;
19616
19617 /// Try to find a reduction tree.
19618 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19619 ScalarEvolution &SE, const DataLayout &DL,
19620 const TargetLibraryInfo &TLI) {
19621 RdxKind = HorizontalReduction::getRdxKind(Root);
19622 if (!isVectorizable(RdxKind, Root))
19623 return false;
19624
19625 // Analyze "regular" integer/FP types for reductions - no target-specific
19626 // types or pointers.
19627 Type *Ty = Root->getType();
19628 if (!isValidElementType(Ty) || Ty->isPointerTy())
19629 return false;
19630
19631 // Though the ultimate reduction may have multiple uses, its condition must
19632 // have only single use.
19633 if (auto *Sel = dyn_cast<SelectInst>(Root))
19634 if (!Sel->getCondition()->hasOneUse())
19635 return false;
19636
19637 ReductionRoot = Root;
19638
19639 // Iterate through all the operands of the possible reduction tree and
19640 // gather all the reduced values, sorting them by their value id.
19641 BasicBlock *BB = Root->getParent();
19642 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19644 1, std::make_pair(Root, 0));
19645 // Checks if the operands of the \p TreeN instruction are also reduction
19646 // operations or should be treated as reduced values or an extra argument,
19647 // which is not part of the reduction.
19648 auto CheckOperands = [&](Instruction *TreeN,
19649 SmallVectorImpl<Value *> &PossibleReducedVals,
19650 SmallVectorImpl<Instruction *> &ReductionOps,
19651 unsigned Level) {
19652 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19653 getNumberOfOperands(TreeN)))) {
19654 Value *EdgeVal = getRdxOperand(TreeN, I);
19655 ReducedValsToOps[EdgeVal].push_back(TreeN);
19656 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19657 // If the edge is not an instruction, or it is different from the main
19658 // reduction opcode or has too many uses - possible reduced value.
19659 // Also, do not try to reduce const values, if the operation is not
19660 // foldable.
19661 if (!EdgeInst || Level > RecursionMaxDepth ||
19662 getRdxKind(EdgeInst) != RdxKind ||
19663 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19664 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19665 !isVectorizable(RdxKind, EdgeInst) ||
19666 (R.isAnalyzedReductionRoot(EdgeInst) &&
19667 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19668 PossibleReducedVals.push_back(EdgeVal);
19669 continue;
19670 }
19671 ReductionOps.push_back(EdgeInst);
19672 }
19673 };
19674 // Try to regroup reduced values so that it gets more profitable to try to
19675 // reduce them. Values are grouped by their value ids, instructions - by
19676 // instruction op id and/or alternate op id, plus do extra analysis for
19677 // loads (grouping them by the distabce between pointers) and cmp
19678 // instructions (grouping them by the predicate).
19681 8>
19682 PossibleReducedVals;
19683 initReductionOps(Root);
19685 SmallSet<size_t, 2> LoadKeyUsed;
19686
19687 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19688 Key = hash_combine(hash_value(LI->getParent()), Key);
19689 Value *Ptr =
19691 if (!LoadKeyUsed.insert(Key).second) {
19692 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19693 if (LIt != LoadsMap.end()) {
19694 for (LoadInst *RLI : LIt->second) {
19695 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19696 LI->getType(), LI->getPointerOperand(), DL, SE,
19697 /*StrictCheck=*/true))
19698 return hash_value(RLI->getPointerOperand());
19699 }
19700 for (LoadInst *RLI : LIt->second) {
19702 LI->getPointerOperand(), TLI)) {
19703 hash_code SubKey = hash_value(RLI->getPointerOperand());
19704 return SubKey;
19705 }
19706 }
19707 if (LIt->second.size() > 2) {
19708 hash_code SubKey =
19709 hash_value(LIt->second.back()->getPointerOperand());
19710 return SubKey;
19711 }
19712 }
19713 }
19714 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19715 .first->second.push_back(LI);
19716 return hash_value(LI->getPointerOperand());
19717 };
19718
19719 while (!Worklist.empty()) {
19720 auto [TreeN, Level] = Worklist.pop_back_val();
19721 SmallVector<Value *> PossibleRedVals;
19722 SmallVector<Instruction *> PossibleReductionOps;
19723 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19724 addReductionOps(TreeN);
19725 // Add reduction values. The values are sorted for better vectorization
19726 // results.
19727 for (Value *V : PossibleRedVals) {
19728 size_t Key, Idx;
19729 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19730 /*AllowAlternate=*/false);
19731 ++PossibleReducedVals[Key][Idx]
19732 .insert(std::make_pair(V, 0))
19733 .first->second;
19734 }
19735 for (Instruction *I : reverse(PossibleReductionOps))
19736 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19737 }
19738 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19739 // Sort values by the total number of values kinds to start the reduction
19740 // from the longest possible reduced values sequences.
19741 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19742 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19743 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19744 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19745 It != E; ++It) {
19746 PossibleRedValsVect.emplace_back();
19747 auto RedValsVect = It->second.takeVector();
19748 stable_sort(RedValsVect, llvm::less_second());
19749 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19750 PossibleRedValsVect.back().append(Data.second, Data.first);
19751 }
19752 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19753 return P1.size() > P2.size();
19754 });
19755 int NewIdx = -1;
19756 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19757 if (NewIdx < 0 ||
19758 (!isGoodForReduction(Data) &&
19759 (!isa<LoadInst>(Data.front()) ||
19760 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19762 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19764 cast<LoadInst>(ReducedVals[NewIdx].front())
19765 ->getPointerOperand())))) {
19766 NewIdx = ReducedVals.size();
19767 ReducedVals.emplace_back();
19768 }
19769 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19770 }
19771 }
19772 // Sort the reduced values by number of same/alternate opcode and/or pointer
19773 // operand.
19774 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19775 return P1.size() > P2.size();
19776 });
19777 return true;
19778 }
19779
19780 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19781 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19782 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19783 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19784 constexpr unsigned RegMaxNumber = 4;
19785 constexpr unsigned RedValsMaxNumber = 128;
19786 // If there are a sufficient number of reduction values, reduce
19787 // to a nearby power-of-2. We can safely generate oversized
19788 // vectors and rely on the backend to split them to legal sizes.
19789 if (unsigned NumReducedVals = std::accumulate(
19790 ReducedVals.begin(), ReducedVals.end(), 0,
19791 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19792 if (!isGoodForReduction(Vals))
19793 return Num;
19794 return Num + Vals.size();
19795 });
19796 NumReducedVals < ReductionLimit &&
19797 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19798 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19799 })) {
19800 for (ReductionOpsType &RdxOps : ReductionOps)
19801 for (Value *RdxOp : RdxOps)
19802 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19803 return nullptr;
19804 }
19805
19806 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19807 TargetFolder(DL));
19808 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19809
19810 // Track the reduced values in case if they are replaced by extractelement
19811 // because of the vectorization.
19812 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19813 ReducedVals.front().size());
19814
19815 // The compare instruction of a min/max is the insertion point for new
19816 // instructions and may be replaced with a new compare instruction.
19817 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19818 assert(isa<SelectInst>(RdxRootInst) &&
19819 "Expected min/max reduction to have select root instruction");
19820 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19821 assert(isa<Instruction>(ScalarCond) &&
19822 "Expected min/max reduction to have compare condition");
19823 return cast<Instruction>(ScalarCond);
19824 };
19825
19826 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19827 return isBoolLogicOp(cast<Instruction>(V));
19828 });
19829 // Return new VectorizedTree, based on previous value.
19830 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19831 if (VectorizedTree) {
19832 // Update the final value in the reduction.
19834 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19835 if (AnyBoolLogicOp) {
19836 auto It = ReducedValsToOps.find(VectorizedTree);
19837 auto It1 = ReducedValsToOps.find(Res);
19838 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19839 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19840 (It != ReducedValsToOps.end() &&
19841 any_of(It->getSecond(), [&](Instruction *I) {
19842 return isBoolLogicOp(I) &&
19843 getRdxOperand(I, 0) == VectorizedTree;
19844 }))) {
19845 ;
19846 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19847 (It1 != ReducedValsToOps.end() &&
19848 any_of(It1->getSecond(), [&](Instruction *I) {
19849 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19850 }))) {
19851 std::swap(VectorizedTree, Res);
19852 } else {
19853 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19854 }
19855 }
19856
19857 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19858 ReductionOps);
19859 }
19860 // Initialize the final value in the reduction.
19861 return Res;
19862 };
19863 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19864 ReductionOps.front().size());
19865 for (ReductionOpsType &RdxOps : ReductionOps)
19866 for (Value *RdxOp : RdxOps) {
19867 if (!RdxOp)
19868 continue;
19869 IgnoreList.insert(RdxOp);
19870 }
19871 // Intersect the fast-math-flags from all reduction operations.
19872 FastMathFlags RdxFMF;
19873 RdxFMF.set();
19874 for (Value *U : IgnoreList)
19875 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19876 RdxFMF &= FPMO->getFastMathFlags();
19877 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19878
19879 // Need to track reduced vals, they may be changed during vectorization of
19880 // subvectors.
19881 for (ArrayRef<Value *> Candidates : ReducedVals)
19882 for (Value *V : Candidates)
19883 TrackedVals.try_emplace(V, V);
19884
19886 Value *V) -> unsigned & {
19887 auto *It = MV.find(V);
19888 assert(It != MV.end() && "Unable to find given key.");
19889 return It->second;
19890 };
19891
19892 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19893 // List of the values that were reduced in other trees as part of gather
19894 // nodes and thus requiring extract if fully vectorized in other trees.
19895 SmallPtrSet<Value *, 4> RequiredExtract;
19896 WeakTrackingVH VectorizedTree = nullptr;
19897 bool CheckForReusedReductionOps = false;
19898 // Try to vectorize elements based on their type.
19900 for (ArrayRef<Value *> RV : ReducedVals)
19901 States.push_back(getSameOpcode(RV, TLI));
19902 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19903 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19904 InstructionsState S = States[I];
19905 SmallVector<Value *> Candidates;
19906 Candidates.reserve(2 * OrigReducedVals.size());
19907 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19908 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19909 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19910 // Check if the reduction value was not overriden by the extractelement
19911 // instruction because of the vectorization and exclude it, if it is not
19912 // compatible with other values.
19913 // Also check if the instruction was folded to constant/other value.
19914 auto *Inst = dyn_cast<Instruction>(RdxVal);
19915 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19916 (!S || !S.isOpcodeOrAlt(Inst))) ||
19917 (S && !Inst))
19918 continue;
19919 Candidates.push_back(RdxVal);
19920 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19921 }
19922 bool ShuffledExtracts = false;
19923 // Try to handle shuffled extractelements.
19924 if (S && S.getOpcode() == Instruction::ExtractElement &&
19925 !S.isAltShuffle() && I + 1 < E) {
19926 SmallVector<Value *> CommonCandidates(Candidates);
19927 for (Value *RV : ReducedVals[I + 1]) {
19928 Value *RdxVal = TrackedVals.at(RV);
19929 // Check if the reduction value was not overriden by the
19930 // extractelement instruction because of the vectorization and
19931 // exclude it, if it is not compatible with other values.
19932 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19933 if (!Inst)
19934 continue;
19935 CommonCandidates.push_back(RdxVal);
19936 TrackedToOrig.try_emplace(RdxVal, RV);
19937 }
19939 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19940 ++I;
19941 Candidates.swap(CommonCandidates);
19942 ShuffledExtracts = true;
19943 }
19944 }
19945
19946 // Emit code for constant values.
19947 if (Candidates.size() > 1 && allConstant(Candidates)) {
19948 Value *Res = Candidates.front();
19949 Value *OrigV = TrackedToOrig.at(Candidates.front());
19950 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19951 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19952 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19953 Value *OrigV = TrackedToOrig.at(VC);
19954 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19955 if (auto *ResI = dyn_cast<Instruction>(Res))
19956 V.analyzedReductionRoot(ResI);
19957 }
19958 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19959 continue;
19960 }
19961
19962 unsigned NumReducedVals = Candidates.size();
19963 if (NumReducedVals < ReductionLimit &&
19964 (NumReducedVals < 2 || !isSplat(Candidates)))
19965 continue;
19966
19967 // Check if we support repeated scalar values processing (optimization of
19968 // original scalar identity operations on matched horizontal reductions).
19969 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
19970 RdxKind != RecurKind::FMul &&
19971 RdxKind != RecurKind::FMulAdd;
19972 // Gather same values.
19973 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
19974 if (IsSupportedHorRdxIdentityOp)
19975 for (Value *V : Candidates) {
19976 Value *OrigV = TrackedToOrig.at(V);
19977 ++SameValuesCounter.try_emplace(OrigV).first->second;
19978 }
19979 // Used to check if the reduced values used same number of times. In this
19980 // case the compiler may produce better code. E.g. if reduced values are
19981 // aabbccdd (8 x values), then the first node of the tree will have a node
19982 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
19983 // Plus, the final reduction will be performed on <8 x aabbccdd>.
19984 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
19985 // x abcd) * 2.
19986 // Currently it only handles add/fadd/xor. and/or/min/max do not require
19987 // this analysis, other operations may require an extra estimation of
19988 // the profitability.
19989 bool SameScaleFactor = false;
19990 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
19991 SameValuesCounter.size() != Candidates.size();
19992 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
19993 if (OptReusedScalars) {
19994 SameScaleFactor =
19995 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
19996 RdxKind == RecurKind::Xor) &&
19997 all_of(drop_begin(SameValuesCounter),
19998 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
19999 return P.second == SameValuesCounter.front().second;
20000 });
20001 Candidates.resize(SameValuesCounter.size());
20002 transform(SameValuesCounter, Candidates.begin(),
20003 [&](const auto &P) { return TrackedVals.at(P.first); });
20004 NumReducedVals = Candidates.size();
20005 // Have a reduction of the same element.
20006 if (NumReducedVals == 1) {
20007 Value *OrigV = TrackedToOrig.at(Candidates.front());
20008 unsigned Cnt = At(SameValuesCounter, OrigV);
20009 Value *RedVal =
20010 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20011 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20012 VectorizedVals.try_emplace(OrigV, Cnt);
20013 ExternallyUsedValues.insert(OrigV);
20014 continue;
20015 }
20016 }
20017
20018 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20019 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20020 const unsigned MaxElts = std::clamp<unsigned>(
20021 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20022 RegMaxNumber * RedValsMaxNumber);
20023
20024 unsigned ReduxWidth = NumReducedVals;
20025 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20026 unsigned NumParts, NumRegs;
20027 Type *ScalarTy = Candidates.front()->getType();
20028 ReduxWidth =
20029 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20030 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20031 NumParts = TTI.getNumberOfParts(Tp);
20032 NumRegs =
20034 while (NumParts > NumRegs) {
20035 ReduxWidth = bit_floor(ReduxWidth - 1);
20036 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20037 NumParts = TTI.getNumberOfParts(Tp);
20038 NumRegs =
20040 }
20041 if (NumParts > NumRegs / 2)
20042 ReduxWidth = bit_floor(ReduxWidth);
20043 return ReduxWidth;
20044 };
20045 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20046 ReduxWidth = GetVectorFactor(ReduxWidth);
20047 ReduxWidth = std::min(ReduxWidth, MaxElts);
20048
20049 unsigned Start = 0;
20050 unsigned Pos = Start;
20051 // Restarts vectorization attempt with lower vector factor.
20052 unsigned PrevReduxWidth = ReduxWidth;
20053 bool CheckForReusedReductionOpsLocal = false;
20054 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20055 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20056 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20057 // Check if any of the reduction ops are gathered. If so, worth
20058 // trying again with less number of reduction ops.
20059 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20060 }
20061 ++Pos;
20062 if (Pos < NumReducedVals - ReduxWidth + 1)
20063 return IsAnyRedOpGathered;
20064 Pos = Start;
20065 --ReduxWidth;
20066 if (ReduxWidth > 1)
20067 ReduxWidth = GetVectorFactor(ReduxWidth);
20068 return IsAnyRedOpGathered;
20069 };
20070 bool AnyVectorized = false;
20071 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20072 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20073 ReduxWidth >= ReductionLimit) {
20074 // Dependency in tree of the reduction ops - drop this attempt, try
20075 // later.
20076 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20077 Start == 0) {
20078 CheckForReusedReductionOps = true;
20079 break;
20080 }
20081 PrevReduxWidth = ReduxWidth;
20082 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20083 // Been analyzed already - skip.
20084 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20085 (!has_single_bit(ReduxWidth) &&
20086 (IgnoredCandidates.contains(
20087 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20088 IgnoredCandidates.contains(
20089 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20090 bit_floor(ReduxWidth))))) ||
20091 V.areAnalyzedReductionVals(VL)) {
20092 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20093 continue;
20094 }
20095 // Early exit if any of the reduction values were deleted during
20096 // previous vectorization attempts.
20097 if (any_of(VL, [&V](Value *RedVal) {
20098 auto *RedValI = dyn_cast<Instruction>(RedVal);
20099 if (!RedValI)
20100 return false;
20101 return V.isDeleted(RedValI);
20102 }))
20103 break;
20104 V.buildTree(VL, IgnoreList);
20105 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20106 if (!AdjustReducedVals())
20107 V.analyzedReductionVals(VL);
20108 continue;
20109 }
20110 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20111 if (!AdjustReducedVals())
20112 V.analyzedReductionVals(VL);
20113 continue;
20114 }
20115 V.reorderTopToBottom();
20116 // No need to reorder the root node at all.
20117 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20118 // Keep extracted other reduction values, if they are used in the
20119 // vectorization trees.
20120 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20121 ExternallyUsedValues);
20122 // The reduction root is used as the insertion point for new
20123 // instructions, so set it as externally used to prevent it from being
20124 // deleted.
20125 LocalExternallyUsedValues.insert(ReductionRoot);
20126 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20127 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20128 continue;
20129 for (Value *V : ReducedVals[Cnt])
20130 if (isa<Instruction>(V))
20131 LocalExternallyUsedValues.insert(TrackedVals[V]);
20132 }
20133 if (!IsSupportedHorRdxIdentityOp) {
20134 // Number of uses of the candidates in the vector of values.
20135 assert(SameValuesCounter.empty() &&
20136 "Reused values counter map is not empty");
20137 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20138 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20139 continue;
20140 Value *V = Candidates[Cnt];
20141 Value *OrigV = TrackedToOrig.at(V);
20142 ++SameValuesCounter.try_emplace(OrigV).first->second;
20143 }
20144 }
20145 V.transformNodes();
20146 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20147 // Gather externally used values.
20149 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20150 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20151 continue;
20152 Value *RdxVal = Candidates[Cnt];
20153 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20154 RdxVal = It->second;
20155 if (!Visited.insert(RdxVal).second)
20156 continue;
20157 // Check if the scalar was vectorized as part of the vectorization
20158 // tree but not the top node.
20159 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20160 LocalExternallyUsedValues.insert(RdxVal);
20161 continue;
20162 }
20163 Value *OrigV = TrackedToOrig.at(RdxVal);
20164 unsigned NumOps =
20165 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20166 if (NumOps != ReducedValsToOps.at(OrigV).size())
20167 LocalExternallyUsedValues.insert(RdxVal);
20168 }
20169 // Do not need the list of reused scalars in regular mode anymore.
20170 if (!IsSupportedHorRdxIdentityOp)
20171 SameValuesCounter.clear();
20172 for (Value *RdxVal : VL)
20173 if (RequiredExtract.contains(RdxVal))
20174 LocalExternallyUsedValues.insert(RdxVal);
20175 V.buildExternalUses(LocalExternallyUsedValues);
20176
20177 V.computeMinimumValueSizes();
20178
20179 // Estimate cost.
20180 InstructionCost TreeCost = V.getTreeCost(VL);
20181 InstructionCost ReductionCost =
20182 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20183 InstructionCost Cost = TreeCost + ReductionCost;
20184 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20185 << " for reduction\n");
20186 if (!Cost.isValid())
20187 break;
20188 if (Cost >= -SLPCostThreshold) {
20189 V.getORE()->emit([&]() {
20190 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20191 ReducedValsToOps.at(VL[0]).front())
20192 << "Vectorizing horizontal reduction is possible "
20193 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20194 << " and threshold "
20195 << ore::NV("Threshold", -SLPCostThreshold);
20196 });
20197 if (!AdjustReducedVals()) {
20198 V.analyzedReductionVals(VL);
20199 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20200 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20201 // Add subvectors of VL to the list of the analyzed values.
20202 for (unsigned VF = getFloorFullVectorNumberOfElements(
20203 *TTI, VL.front()->getType(), ReduxWidth - 1);
20204 VF >= ReductionLimit;
20206 *TTI, VL.front()->getType(), VF - 1)) {
20207 if (has_single_bit(VF) &&
20208 V.getCanonicalGraphSize() != V.getTreeSize())
20209 continue;
20210 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20211 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20212 }
20213 }
20214 }
20215 continue;
20216 }
20217
20218 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20219 << Cost << ". (HorRdx)\n");
20220 V.getORE()->emit([&]() {
20221 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20222 ReducedValsToOps.at(VL[0]).front())
20223 << "Vectorized horizontal reduction with cost "
20224 << ore::NV("Cost", Cost) << " and with tree size "
20225 << ore::NV("TreeSize", V.getTreeSize());
20226 });
20227
20228 Builder.setFastMathFlags(RdxFMF);
20229
20230 // Emit a reduction. If the root is a select (min/max idiom), the insert
20231 // point is the compare condition of that select.
20232 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20233 Instruction *InsertPt = RdxRootInst;
20234 if (IsCmpSelMinMax)
20235 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20236
20237 // Vectorize a tree.
20238 Value *VectorizedRoot =
20239 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20240 // Update TrackedToOrig mapping, since the tracked values might be
20241 // updated.
20242 for (Value *RdxVal : Candidates) {
20243 Value *OrigVal = TrackedToOrig.at(RdxVal);
20244 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20245 if (TransformedRdxVal != RdxVal)
20246 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20247 }
20248
20249 Builder.SetInsertPoint(InsertPt);
20250
20251 // To prevent poison from leaking across what used to be sequential,
20252 // safe, scalar boolean logic operations, the reduction operand must be
20253 // frozen.
20254 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20255 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20256
20257 // Emit code to correctly handle reused reduced values, if required.
20258 if (OptReusedScalars && !SameScaleFactor) {
20259 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20260 SameValuesCounter, TrackedToOrig);
20261 }
20262
20263 Value *ReducedSubTree;
20264 Type *ScalarTy = VL.front()->getType();
20265 if (isa<FixedVectorType>(ScalarTy)) {
20266 assert(SLPReVec && "FixedVectorType is not expected.");
20267 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20268 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20269 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20270 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20271 // Do reduction for each lane.
20272 // e.g., do reduce add for
20273 // VL[0] = <4 x Ty> <a, b, c, d>
20274 // VL[1] = <4 x Ty> <e, f, g, h>
20275 // Lane[0] = <2 x Ty> <a, e>
20276 // Lane[1] = <2 x Ty> <b, f>
20277 // Lane[2] = <2 x Ty> <c, g>
20278 // Lane[3] = <2 x Ty> <d, h>
20279 // result[0] = reduce add Lane[0]
20280 // result[1] = reduce add Lane[1]
20281 // result[2] = reduce add Lane[2]
20282 // result[3] = reduce add Lane[3]
20284 createStrideMask(I, ScalarTyNumElements, VL.size());
20285 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20286 ReducedSubTree = Builder.CreateInsertElement(
20287 ReducedSubTree,
20288 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20289 }
20290 } else {
20291 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20292 RdxRootInst->getType());
20293 }
20294 if (ReducedSubTree->getType() != VL.front()->getType()) {
20295 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20296 "Expected different reduction type.");
20297 ReducedSubTree =
20298 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20299 V.isSignedMinBitwidthRootNode());
20300 }
20301
20302 // Improved analysis for add/fadd/xor reductions with same scale factor
20303 // for all operands of reductions. We can emit scalar ops for them
20304 // instead.
20305 if (OptReusedScalars && SameScaleFactor)
20306 ReducedSubTree = emitScaleForReusedOps(
20307 ReducedSubTree, Builder, SameValuesCounter.front().second);
20308
20309 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20310 // Count vectorized reduced values to exclude them from final reduction.
20311 for (Value *RdxVal : VL) {
20312 Value *OrigV = TrackedToOrig.at(RdxVal);
20313 if (IsSupportedHorRdxIdentityOp) {
20314 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20315 continue;
20316 }
20317 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20318 if (!V.isVectorized(RdxVal))
20319 RequiredExtract.insert(RdxVal);
20320 }
20321 Pos += ReduxWidth;
20322 Start = Pos;
20323 ReduxWidth = NumReducedVals - Pos;
20324 if (ReduxWidth > 1)
20325 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20326 AnyVectorized = true;
20327 }
20328 if (OptReusedScalars && !AnyVectorized) {
20329 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20330 Value *RdxVal = TrackedVals.at(P.first);
20331 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20332 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20333 VectorizedVals.try_emplace(P.first, P.second);
20334 }
20335 continue;
20336 }
20337 }
20338 if (VectorizedTree) {
20339 // Reorder operands of bool logical op in the natural order to avoid
20340 // possible problem with poison propagation. If not possible to reorder
20341 // (both operands are originally RHS), emit an extra freeze instruction
20342 // for the LHS operand.
20343 // I.e., if we have original code like this:
20344 // RedOp1 = select i1 ?, i1 LHS, i1 false
20345 // RedOp2 = select i1 RHS, i1 ?, i1 false
20346
20347 // Then, we swap LHS/RHS to create a new op that matches the poison
20348 // semantics of the original code.
20349
20350 // If we have original code like this and both values could be poison:
20351 // RedOp1 = select i1 ?, i1 LHS, i1 false
20352 // RedOp2 = select i1 ?, i1 RHS, i1 false
20353
20354 // Then, we must freeze LHS in the new op.
20355 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20356 Instruction *RedOp1,
20357 Instruction *RedOp2,
20358 bool InitStep) {
20359 if (!AnyBoolLogicOp)
20360 return;
20361 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20362 getRdxOperand(RedOp1, 0) == LHS ||
20364 return;
20365 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20366 getRdxOperand(RedOp2, 0) == RHS ||
20368 std::swap(LHS, RHS);
20369 return;
20370 }
20371 if (LHS != VectorizedTree)
20372 LHS = Builder.CreateFreeze(LHS);
20373 };
20374 // Finish the reduction.
20375 // Need to add extra arguments and not vectorized possible reduction
20376 // values.
20377 // Try to avoid dependencies between the scalar remainders after
20378 // reductions.
20379 auto FinalGen =
20381 bool InitStep) {
20382 unsigned Sz = InstVals.size();
20384 Sz % 2);
20385 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20386 Instruction *RedOp = InstVals[I + 1].first;
20387 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20388 Value *RdxVal1 = InstVals[I].second;
20389 Value *StableRdxVal1 = RdxVal1;
20390 auto It1 = TrackedVals.find(RdxVal1);
20391 if (It1 != TrackedVals.end())
20392 StableRdxVal1 = It1->second;
20393 Value *RdxVal2 = InstVals[I + 1].second;
20394 Value *StableRdxVal2 = RdxVal2;
20395 auto It2 = TrackedVals.find(RdxVal2);
20396 if (It2 != TrackedVals.end())
20397 StableRdxVal2 = It2->second;
20398 // To prevent poison from leaking across what used to be
20399 // sequential, safe, scalar boolean logic operations, the
20400 // reduction operand must be frozen.
20401 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20402 RedOp, InitStep);
20403 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20404 StableRdxVal2, "op.rdx", ReductionOps);
20405 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20406 }
20407 if (Sz % 2 == 1)
20408 ExtraReds[Sz / 2] = InstVals.back();
20409 return ExtraReds;
20410 };
20412 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20413 VectorizedTree);
20415 for (ArrayRef<Value *> Candidates : ReducedVals) {
20416 for (Value *RdxVal : Candidates) {
20417 if (!Visited.insert(RdxVal).second)
20418 continue;
20419 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20420 for (Instruction *RedOp :
20421 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20422 ExtraReductions.emplace_back(RedOp, RdxVal);
20423 }
20424 }
20425 // Iterate through all not-vectorized reduction values/extra arguments.
20426 bool InitStep = true;
20427 while (ExtraReductions.size() > 1) {
20429 FinalGen(ExtraReductions, InitStep);
20430 ExtraReductions.swap(NewReds);
20431 InitStep = false;
20432 }
20433 VectorizedTree = ExtraReductions.front().second;
20434
20435 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20436
20437 // The original scalar reduction is expected to have no remaining
20438 // uses outside the reduction tree itself. Assert that we got this
20439 // correct, replace internal uses with undef, and mark for eventual
20440 // deletion.
20441#ifndef NDEBUG
20442 SmallSet<Value *, 4> IgnoreSet;
20443 for (ArrayRef<Value *> RdxOps : ReductionOps)
20444 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20445#endif
20446 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20447 for (Value *Ignore : RdxOps) {
20448 if (!Ignore)
20449 continue;
20450#ifndef NDEBUG
20451 for (auto *U : Ignore->users()) {
20452 assert(IgnoreSet.count(U) &&
20453 "All users must be either in the reduction ops list.");
20454 }
20455#endif
20456 if (!Ignore->use_empty()) {
20457 Value *P = PoisonValue::get(Ignore->getType());
20458 Ignore->replaceAllUsesWith(P);
20459 }
20460 }
20461 V.removeInstructionsAndOperands(RdxOps);
20462 }
20463 } else if (!CheckForReusedReductionOps) {
20464 for (ReductionOpsType &RdxOps : ReductionOps)
20465 for (Value *RdxOp : RdxOps)
20466 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20467 }
20468 return VectorizedTree;
20469 }
20470
20471private:
20472 /// Calculate the cost of a reduction.
20473 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20474 ArrayRef<Value *> ReducedVals,
20475 bool IsCmpSelMinMax, FastMathFlags FMF,
20476 const BoUpSLP &R) {
20478 Type *ScalarTy = ReducedVals.front()->getType();
20479 unsigned ReduxWidth = ReducedVals.size();
20480 FixedVectorType *VectorTy = R.getReductionType();
20481 InstructionCost VectorCost = 0, ScalarCost;
20482 // If all of the reduced values are constant, the vector cost is 0, since
20483 // the reduction value can be calculated at the compile time.
20484 bool AllConsts = allConstant(ReducedVals);
20485 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20487 // Scalar cost is repeated for N-1 elements.
20488 int Cnt = ReducedVals.size();
20489 for (Value *RdxVal : ReducedVals) {
20490 if (Cnt == 1)
20491 break;
20492 --Cnt;
20493 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20494 Cost += GenCostFn();
20495 continue;
20496 }
20497 InstructionCost ScalarCost = 0;
20498 for (User *U : RdxVal->users()) {
20499 auto *RdxOp = cast<Instruction>(U);
20500 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20501 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20502 continue;
20503 }
20504 ScalarCost = InstructionCost::getInvalid();
20505 break;
20506 }
20507 if (ScalarCost.isValid())
20508 Cost += ScalarCost;
20509 else
20510 Cost += GenCostFn();
20511 }
20512 return Cost;
20513 };
20514 switch (RdxKind) {
20515 case RecurKind::Add:
20516 case RecurKind::Mul:
20517 case RecurKind::Or:
20518 case RecurKind::And:
20519 case RecurKind::Xor:
20520 case RecurKind::FAdd:
20521 case RecurKind::FMul: {
20522 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20523 if (!AllConsts) {
20524 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20525 assert(SLPReVec && "FixedVectorType is not expected.");
20526 unsigned ScalarTyNumElements = VecTy->getNumElements();
20527 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20528 VectorCost += TTI->getShuffleCost(
20529 TTI::SK_PermuteSingleSrc, VectorTy,
20530 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20531 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20532 CostKind);
20533 }
20534 VectorCost += TTI->getScalarizationOverhead(
20535 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20536 /*Extract*/ false, TTI::TCK_RecipThroughput);
20537 } else {
20538 Type *RedTy = VectorTy->getElementType();
20539 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20540 std::make_pair(RedTy, true));
20541 if (RType == RedTy) {
20542 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20543 FMF, CostKind);
20544 } else {
20545 VectorCost = TTI->getExtendedReductionCost(
20546 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20547 FMF, CostKind);
20548 }
20549 }
20550 }
20551 ScalarCost = EvaluateScalarCost([&]() {
20552 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20553 });
20554 break;
20555 }
20556 case RecurKind::FMax:
20557 case RecurKind::FMin:
20558 case RecurKind::FMaximum:
20559 case RecurKind::FMinimum:
20560 case RecurKind::SMax:
20561 case RecurKind::SMin:
20562 case RecurKind::UMax:
20563 case RecurKind::UMin: {
20565 if (!AllConsts)
20566 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20567 ScalarCost = EvaluateScalarCost([&]() {
20568 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20569 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20570 });
20571 break;
20572 }
20573 default:
20574 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20575 }
20576
20577 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20578 << " for reduction of " << shortBundleName(ReducedVals)
20579 << " (It is a splitting reduction)\n");
20580 return VectorCost - ScalarCost;
20581 }
20582
20583 /// Emit a horizontal reduction of the vectorized value.
20584 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20585 const TargetTransformInfo *TTI, Type *DestTy) {
20586 assert(VectorizedValue && "Need to have a vectorized tree node");
20587 assert(RdxKind != RecurKind::FMulAdd &&
20588 "A call to the llvm.fmuladd intrinsic is not handled yet");
20589
20590 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20591 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20592 RdxKind == RecurKind::Add &&
20593 DestTy->getScalarType() != FTy->getScalarType()) {
20594 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20595 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
20596 Value *V = Builder.CreateBitCast(
20597 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20598 ++NumVectorInstructions;
20599 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20600 }
20601 ++NumVectorInstructions;
20602 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20603 }
20604
20605 /// Emits optimized code for unique scalar value reused \p Cnt times.
20606 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20607 unsigned Cnt) {
20608 assert(IsSupportedHorRdxIdentityOp &&
20609 "The optimization of matched scalar identity horizontal reductions "
20610 "must be supported.");
20611 if (Cnt == 1)
20612 return VectorizedValue;
20613 switch (RdxKind) {
20614 case RecurKind::Add: {
20615 // res = mul vv, n
20616 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20617 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20618 << VectorizedValue << ". (HorRdx)\n");
20619 return Builder.CreateMul(VectorizedValue, Scale);
20620 }
20621 case RecurKind::Xor: {
20622 // res = n % 2 ? 0 : vv
20623 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20624 << ". (HorRdx)\n");
20625 if (Cnt % 2 == 0)
20626 return Constant::getNullValue(VectorizedValue->getType());
20627 return VectorizedValue;
20628 }
20629 case RecurKind::FAdd: {
20630 // res = fmul v, n
20631 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20632 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20633 << VectorizedValue << ". (HorRdx)\n");
20634 return Builder.CreateFMul(VectorizedValue, Scale);
20635 }
20636 case RecurKind::And:
20637 case RecurKind::Or:
20638 case RecurKind::SMax:
20639 case RecurKind::SMin:
20640 case RecurKind::UMax:
20641 case RecurKind::UMin:
20642 case RecurKind::FMax:
20643 case RecurKind::FMin:
20644 case RecurKind::FMaximum:
20645 case RecurKind::FMinimum:
20646 // res = vv
20647 return VectorizedValue;
20648 case RecurKind::Mul:
20649 case RecurKind::FMul:
20650 case RecurKind::FMulAdd:
20651 case RecurKind::IAnyOf:
20652 case RecurKind::FAnyOf:
20653 case RecurKind::IFindLastIV:
20654 case RecurKind::FFindLastIV:
20655 case RecurKind::None:
20656 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20657 }
20658 return nullptr;
20659 }
20660
20661 /// Emits actual operation for the scalar identity values, found during
20662 /// horizontal reduction analysis.
20663 Value *
20664 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20665 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20666 const DenseMap<Value *, Value *> &TrackedToOrig) {
20667 assert(IsSupportedHorRdxIdentityOp &&
20668 "The optimization of matched scalar identity horizontal reductions "
20669 "must be supported.");
20670 ArrayRef<Value *> VL = R.getRootNodeScalars();
20671 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20672 if (VTy->getElementType() != VL.front()->getType()) {
20673 VectorizedValue = Builder.CreateIntCast(
20674 VectorizedValue,
20675 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20676 R.isSignedMinBitwidthRootNode());
20677 }
20678 switch (RdxKind) {
20679 case RecurKind::Add: {
20680 // root = mul prev_root, <1, 1, n, 1>
20682 for (Value *V : VL) {
20683 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20684 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20685 }
20686 auto *Scale = ConstantVector::get(Vals);
20687 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20688 << VectorizedValue << ". (HorRdx)\n");
20689 return Builder.CreateMul(VectorizedValue, Scale);
20690 }
20691 case RecurKind::And:
20692 case RecurKind::Or:
20693 // No need for multiple or/and(s).
20694 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20695 << ". (HorRdx)\n");
20696 return VectorizedValue;
20697 case RecurKind::SMax:
20698 case RecurKind::SMin:
20699 case RecurKind::UMax:
20700 case RecurKind::UMin:
20701 case RecurKind::FMax:
20702 case RecurKind::FMin:
20703 case RecurKind::FMaximum:
20704 case RecurKind::FMinimum:
20705 // No need for multiple min/max(s) of the same value.
20706 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20707 << ". (HorRdx)\n");
20708 return VectorizedValue;
20709 case RecurKind::Xor: {
20710 // Replace values with even number of repeats with 0, since
20711 // x xor x = 0.
20712 // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
20713 // 7>, if elements 4th and 6th elements have even number of repeats.
20715 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20717 std::iota(Mask.begin(), Mask.end(), 0);
20718 bool NeedShuffle = false;
20719 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20720 Value *V = VL[I];
20721 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20722 if (Cnt % 2 == 0) {
20723 Mask[I] = VF;
20724 NeedShuffle = true;
20725 }
20726 }
20727 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20728 : Mask) dbgs()
20729 << I << " ";
20730 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20731 if (NeedShuffle)
20732 VectorizedValue = Builder.CreateShuffleVector(
20733 VectorizedValue,
20734 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20735 return VectorizedValue;
20736 }
20737 case RecurKind::FAdd: {
20738 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20740 for (Value *V : VL) {
20741 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20742 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20743 }
20744 auto *Scale = ConstantVector::get(Vals);
20745 return Builder.CreateFMul(VectorizedValue, Scale);
20746 }
20747 case RecurKind::Mul:
20748 case RecurKind::FMul:
20749 case RecurKind::FMulAdd:
20750 case RecurKind::IAnyOf:
20751 case RecurKind::FAnyOf:
20752 case RecurKind::IFindLastIV:
20753 case RecurKind::FFindLastIV:
20754 case RecurKind::None:
20755 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20756 }
20757 return nullptr;
20758 }
20759};
20760} // end anonymous namespace
20761
20762/// Gets recurrence kind from the specified value.
20764 return HorizontalReduction::getRdxKind(V);
20765}
20766static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20767 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20768 return cast<FixedVectorType>(IE->getType())->getNumElements();
20769
20770 unsigned AggregateSize = 1;
20771 auto *IV = cast<InsertValueInst>(InsertInst);
20772 Type *CurrentType = IV->getType();
20773 do {
20774 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20775 for (auto *Elt : ST->elements())
20776 if (Elt != ST->getElementType(0)) // check homogeneity
20777 return std::nullopt;
20778 AggregateSize *= ST->getNumElements();
20779 CurrentType = ST->getElementType(0);
20780 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20781 AggregateSize *= AT->getNumElements();
20782 CurrentType = AT->getElementType();
20783 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20784 AggregateSize *= VT->getNumElements();
20785 return AggregateSize;
20786 } else if (CurrentType->isSingleValueType()) {
20787 return AggregateSize;
20788 } else {
20789 return std::nullopt;
20790 }
20791 } while (true);
20792}
20793
20794static void findBuildAggregate_rec(Instruction *LastInsertInst,
20796 SmallVectorImpl<Value *> &BuildVectorOpds,
20797 SmallVectorImpl<Value *> &InsertElts,
20798 unsigned OperandOffset, const BoUpSLP &R) {
20799 do {
20800 Value *InsertedOperand = LastInsertInst->getOperand(1);
20801 std::optional<unsigned> OperandIndex =
20802 getElementIndex(LastInsertInst, OperandOffset);
20803 if (!OperandIndex || R.isDeleted(LastInsertInst))
20804 return;
20805 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20806 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20807 BuildVectorOpds, InsertElts, *OperandIndex, R);
20808
20809 } else {
20810 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20811 InsertElts[*OperandIndex] = LastInsertInst;
20812 }
20813 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20814 } while (LastInsertInst != nullptr &&
20815 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20816 LastInsertInst->hasOneUse());
20817}
20818
20819/// Recognize construction of vectors like
20820/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20821/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20822/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20823/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20824/// starting from the last insertelement or insertvalue instruction.
20825///
20826/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20827/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20828/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20829///
20830/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20831///
20832/// \return true if it matches.
20833static bool findBuildAggregate(Instruction *LastInsertInst,
20835 SmallVectorImpl<Value *> &BuildVectorOpds,
20836 SmallVectorImpl<Value *> &InsertElts,
20837 const BoUpSLP &R) {
20838
20839 assert((isa<InsertElementInst>(LastInsertInst) ||
20840 isa<InsertValueInst>(LastInsertInst)) &&
20841 "Expected insertelement or insertvalue instruction!");
20842
20843 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20844 "Expected empty result vectors!");
20845
20846 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20847 if (!AggregateSize)
20848 return false;
20849 BuildVectorOpds.resize(*AggregateSize);
20850 InsertElts.resize(*AggregateSize);
20851
20852 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20853 R);
20854 llvm::erase(BuildVectorOpds, nullptr);
20855 llvm::erase(InsertElts, nullptr);
20856 if (BuildVectorOpds.size() >= 2)
20857 return true;
20858
20859 return false;
20860}
20861
20862/// Try and get a reduction instruction from a phi node.
20863///
20864/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20865/// if they come from either \p ParentBB or a containing loop latch.
20866///
20867/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20868/// if not possible.
20870 BasicBlock *ParentBB, LoopInfo *LI) {
20871 // There are situations where the reduction value is not dominated by the
20872 // reduction phi. Vectorizing such cases has been reported to cause
20873 // miscompiles. See PR25787.
20874 auto DominatedReduxValue = [&](Value *R) {
20875 return isa<Instruction>(R) &&
20876 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20877 };
20878
20879 Instruction *Rdx = nullptr;
20880
20881 // Return the incoming value if it comes from the same BB as the phi node.
20882 if (P->getIncomingBlock(0) == ParentBB) {
20883 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20884 } else if (P->getIncomingBlock(1) == ParentBB) {
20885 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20886 }
20887
20888 if (Rdx && DominatedReduxValue(Rdx))
20889 return Rdx;
20890
20891 // Otherwise, check whether we have a loop latch to look at.
20892 Loop *BBL = LI->getLoopFor(ParentBB);
20893 if (!BBL)
20894 return nullptr;
20895 BasicBlock *BBLatch = BBL->getLoopLatch();
20896 if (!BBLatch)
20897 return nullptr;
20898
20899 // There is a loop latch, return the incoming value if it comes from
20900 // that. This reduction pattern occasionally turns up.
20901 if (P->getIncomingBlock(0) == BBLatch) {
20902 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20903 } else if (P->getIncomingBlock(1) == BBLatch) {
20904 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20905 }
20906
20907 if (Rdx && DominatedReduxValue(Rdx))
20908 return Rdx;
20909
20910 return nullptr;
20911}
20912
20913static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20914 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20915 return true;
20916 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20917 return true;
20918 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20919 return true;
20920 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20921 return true;
20922 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20923 return true;
20924 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20925 return true;
20926 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20927 return true;
20928 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20929 return true;
20930 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20931 return true;
20932 return false;
20933}
20934
20935/// We could have an initial reduction that is not an add.
20936/// r *= v1 + v2 + v3 + v4
20937/// In such a case start looking for a tree rooted in the first '+'.
20938/// \Returns the new root if found, which may be nullptr if not an instruction.
20940 Instruction *Root) {
20941 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20942 isa<IntrinsicInst>(Root)) &&
20943 "Expected binop, select, or intrinsic for reduction matching");
20944 Value *LHS =
20945 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20946 Value *RHS =
20947 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20948 if (LHS == Phi)
20949 return dyn_cast<Instruction>(RHS);
20950 if (RHS == Phi)
20951 return dyn_cast<Instruction>(LHS);
20952 return nullptr;
20953}
20954
20955/// \p Returns the first operand of \p I that does not match \p Phi. If
20956/// operand is not an instruction it returns nullptr.
20958 Value *Op0 = nullptr;
20959 Value *Op1 = nullptr;
20960 if (!matchRdxBop(I, Op0, Op1))
20961 return nullptr;
20962 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
20963}
20964
20965/// \Returns true if \p I is a candidate instruction for reduction vectorization.
20967 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
20968 Value *B0 = nullptr, *B1 = nullptr;
20969 bool IsBinop = matchRdxBop(I, B0, B1);
20970 return IsBinop || IsSelect;
20971}
20972
20973bool SLPVectorizerPass::vectorizeHorReduction(
20974 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
20975 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
20976 if (!ShouldVectorizeHor)
20977 return false;
20978 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
20979
20980 if (Root->getParent() != BB || isa<PHINode>(Root))
20981 return false;
20982
20983 // If we can find a secondary reduction root, use that instead.
20984 auto SelectRoot = [&]() {
20985 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
20986 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
20987 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
20988 return NewRoot;
20989 return Root;
20990 };
20991
20992 // Start analysis starting from Root instruction. If horizontal reduction is
20993 // found, try to vectorize it. If it is not a horizontal reduction or
20994 // vectorization is not possible or not effective, and currently analyzed
20995 // instruction is a binary operation, try to vectorize the operands, using
20996 // pre-order DFS traversal order. If the operands were not vectorized, repeat
20997 // the same procedure considering each operand as a possible root of the
20998 // horizontal reduction.
20999 // Interrupt the process if the Root instruction itself was vectorized or all
21000 // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
21001 // If a horizintal reduction was not matched or vectorized we collect
21002 // instructions for possible later attempts for vectorization.
21003 std::queue<std::pair<Instruction *, unsigned>> Stack;
21004 Stack.emplace(SelectRoot(), 0);
21005 SmallPtrSet<Value *, 8> VisitedInstrs;
21006 bool Res = false;
21007 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21008 if (R.isAnalyzedReductionRoot(Inst))
21009 return nullptr;
21010 if (!isReductionCandidate(Inst))
21011 return nullptr;
21012 HorizontalReduction HorRdx;
21013 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21014 return nullptr;
21015 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21016 };
21017 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21018 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21019 FutureSeed = getNonPhiOperand(Root, P);
21020 if (!FutureSeed)
21021 return false;
21022 }
21023 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21024 // analysis is done separately.
21025 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21026 PostponedInsts.push_back(FutureSeed);
21027 return true;
21028 };
21029
21030 while (!Stack.empty()) {
21031 Instruction *Inst;
21032 unsigned Level;
21033 std::tie(Inst, Level) = Stack.front();
21034 Stack.pop();
21035 // Do not try to analyze instruction that has already been vectorized.
21036 // This may happen when we vectorize instruction operands on a previous
21037 // iteration while stack was populated before that happened.
21038 if (R.isDeleted(Inst))
21039 continue;
21040 if (Value *VectorizedV = TryToReduce(Inst)) {
21041 Res = true;
21042 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21043 // Try to find another reduction.
21044 Stack.emplace(I, Level);
21045 continue;
21046 }
21047 if (R.isDeleted(Inst))
21048 continue;
21049 } else {
21050 // We could not vectorize `Inst` so try to use it as a future seed.
21051 if (!TryAppendToPostponedInsts(Inst)) {
21052 assert(Stack.empty() && "Expected empty stack");
21053 break;
21054 }
21055 }
21056
21057 // Try to vectorize operands.
21058 // Continue analysis for the instruction from the same basic block only to
21059 // save compile time.
21060 if (++Level < RecursionMaxDepth)
21061 for (auto *Op : Inst->operand_values())
21062 if (VisitedInstrs.insert(Op).second)
21063 if (auto *I = dyn_cast<Instruction>(Op))
21064 // Do not try to vectorize CmpInst operands, this is done
21065 // separately.
21066 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21067 !R.isDeleted(I) && I->getParent() == BB)
21068 Stack.emplace(I, Level);
21069 }
21070 return Res;
21071}
21072
21073bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21074 BasicBlock *BB, BoUpSLP &R) {
21075 SmallVector<WeakTrackingVH> PostponedInsts;
21076 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21077 Res |= tryToVectorize(PostponedInsts, R);
21078 return Res;
21079}
21080
21081bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21082 BoUpSLP &R) {
21083 bool Res = false;
21084 for (Value *V : Insts)
21085 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21086 Res |= tryToVectorize(Inst, R);
21087 return Res;
21088}
21089
21090bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21091 BasicBlock *BB, BoUpSLP &R,
21092 bool MaxVFOnly) {
21093 if (!R.canMapToVector(IVI->getType()))
21094 return false;
21095
21096 SmallVector<Value *, 16> BuildVectorOpds;
21097 SmallVector<Value *, 16> BuildVectorInsts;
21098 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21099 return false;
21100
21101 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21102 R.getORE()->emit([&]() {
21103 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21104 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21105 "trying reduction first.";
21106 });
21107 return false;
21108 }
21109 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21110 // Aggregate value is unlikely to be processed in vector register.
21111 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21112}
21113
21114bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21115 BasicBlock *BB, BoUpSLP &R,
21116 bool MaxVFOnly) {
21117 SmallVector<Value *, 16> BuildVectorInsts;
21118 SmallVector<Value *, 16> BuildVectorOpds;
21120 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21121 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21122 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21123 return false;
21124
21125 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21126 R.getORE()->emit([&]() {
21127 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21128 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21129 "trying reduction first.";
21130 });
21131 return false;
21132 }
21133 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21134 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21135}
21136
21137template <typename T>
21139 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21140 function_ref<bool(T *, T *)> AreCompatible,
21141 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21142 bool MaxVFOnly, BoUpSLP &R) {
21143 bool Changed = false;
21144 // Sort by type, parent, operands.
21145 stable_sort(Incoming, Comparator);
21146
21147 // Try to vectorize elements base on their type.
21148 SmallVector<T *> Candidates;
21150 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21151 VL.clear()) {
21152 // Look for the next elements with the same type, parent and operand
21153 // kinds.
21154 auto *I = dyn_cast<Instruction>(*IncIt);
21155 if (!I || R.isDeleted(I)) {
21156 ++IncIt;
21157 continue;
21158 }
21159 auto *SameTypeIt = IncIt;
21160 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21161 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21162 AreCompatible(*SameTypeIt, *IncIt))) {
21163 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21164 ++SameTypeIt;
21165 if (I && !R.isDeleted(I))
21166 VL.push_back(cast<T>(I));
21167 }
21168
21169 // Try to vectorize them.
21170 unsigned NumElts = VL.size();
21171 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21172 << NumElts << ")\n");
21173 // The vectorization is a 3-state attempt:
21174 // 1. Try to vectorize instructions with the same/alternate opcodes with the
21175 // size of maximal register at first.
21176 // 2. Try to vectorize remaining instructions with the same type, if
21177 // possible. This may result in the better vectorization results rather than
21178 // if we try just to vectorize instructions with the same/alternate opcodes.
21179 // 3. Final attempt to try to vectorize all instructions with the
21180 // same/alternate ops only, this may result in some extra final
21181 // vectorization.
21182 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21183 // Success start over because instructions might have been changed.
21184 Changed = true;
21185 VL.swap(Candidates);
21186 Candidates.clear();
21187 for (T *V : VL) {
21188 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21189 Candidates.push_back(V);
21190 }
21191 } else {
21192 /// \Returns the minimum number of elements that we will attempt to
21193 /// vectorize.
21194 auto GetMinNumElements = [&R](Value *V) {
21195 unsigned EltSize = R.getVectorElementSize(V);
21196 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21197 };
21198 if (NumElts < GetMinNumElements(*IncIt) &&
21199 (Candidates.empty() ||
21200 Candidates.front()->getType() == (*IncIt)->getType())) {
21201 for (T *V : VL) {
21202 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21203 Candidates.push_back(V);
21204 }
21205 }
21206 }
21207 // Final attempt to vectorize instructions with the same types.
21208 if (Candidates.size() > 1 &&
21209 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21210 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21211 // Success start over because instructions might have been changed.
21212 Changed = true;
21213 } else if (MaxVFOnly) {
21214 // Try to vectorize using small vectors.
21216 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21217 VL.clear()) {
21218 auto *I = dyn_cast<Instruction>(*It);
21219 if (!I || R.isDeleted(I)) {
21220 ++It;
21221 continue;
21222 }
21223 auto *SameTypeIt = It;
21224 while (SameTypeIt != End &&
21225 (!isa<Instruction>(*SameTypeIt) ||
21226 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21227 AreCompatible(*SameTypeIt, *It))) {
21228 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21229 ++SameTypeIt;
21230 if (I && !R.isDeleted(I))
21231 VL.push_back(cast<T>(I));
21232 }
21233 unsigned NumElts = VL.size();
21234 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21235 /*MaxVFOnly=*/false))
21236 Changed = true;
21237 It = SameTypeIt;
21238 }
21239 }
21240 Candidates.clear();
21241 }
21242
21243 // Start over at the next instruction of a different type (or the end).
21244 IncIt = SameTypeIt;
21245 }
21246 return Changed;
21247}
21248
21249/// Compare two cmp instructions. If IsCompatibility is true, function returns
21250/// true if 2 cmps have same/swapped predicates and mos compatible corresponding
21251/// operands. If IsCompatibility is false, function implements strict weak
21252/// ordering relation between two cmp instructions, returning true if the first
21253/// instruction is "less" than the second, i.e. its predicate is less than the
21254/// predicate of the second or the operands IDs are less than the operands IDs
21255/// of the second cmp instruction.
21256template <bool IsCompatibility>
21257static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21258 const DominatorTree &DT) {
21259 assert(isValidElementType(V->getType()) &&
21260 isValidElementType(V2->getType()) &&
21261 "Expected valid element types only.");
21262 if (V == V2)
21263 return IsCompatibility;
21264 auto *CI1 = cast<CmpInst>(V);
21265 auto *CI2 = cast<CmpInst>(V2);
21266 if (CI1->getOperand(0)->getType()->getTypeID() <
21267 CI2->getOperand(0)->getType()->getTypeID())
21268 return !IsCompatibility;
21269 if (CI1->getOperand(0)->getType()->getTypeID() >
21270 CI2->getOperand(0)->getType()->getTypeID())
21271 return false;
21272 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21274 return !IsCompatibility;
21275 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21277 return false;
21278 CmpInst::Predicate Pred1 = CI1->getPredicate();
21279 CmpInst::Predicate Pred2 = CI2->getPredicate();
21282 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21283 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21284 if (BasePred1 < BasePred2)
21285 return !IsCompatibility;
21286 if (BasePred1 > BasePred2)
21287 return false;
21288 // Compare operands.
21289 bool CI1Preds = Pred1 == BasePred1;
21290 bool CI2Preds = Pred2 == BasePred1;
21291 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21292 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21293 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21294 if (Op1 == Op2)
21295 continue;
21296 if (Op1->getValueID() < Op2->getValueID())
21297 return !IsCompatibility;
21298 if (Op1->getValueID() > Op2->getValueID())
21299 return false;
21300 if (auto *I1 = dyn_cast<Instruction>(Op1))
21301 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21302 if (IsCompatibility) {
21303 if (I1->getParent() != I2->getParent())
21304 return false;
21305 } else {
21306 // Try to compare nodes with same parent.
21307 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21308 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21309 if (!NodeI1)
21310 return NodeI2 != nullptr;
21311 if (!NodeI2)
21312 return false;
21313 assert((NodeI1 == NodeI2) ==
21314 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21315 "Different nodes should have different DFS numbers");
21316 if (NodeI1 != NodeI2)
21317 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21318 }
21319 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21320 if (S && (IsCompatibility || !S.isAltShuffle()))
21321 continue;
21322 if (IsCompatibility)
21323 return false;
21324 if (I1->getOpcode() != I2->getOpcode())
21325 return I1->getOpcode() < I2->getOpcode();
21326 }
21327 }
21328 return IsCompatibility;
21329}
21330
21331template <typename ItT>
21332bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21333 BasicBlock *BB, BoUpSLP &R) {
21334 bool Changed = false;
21335 // Try to find reductions first.
21336 for (CmpInst *I : CmpInsts) {
21337 if (R.isDeleted(I))
21338 continue;
21339 for (Value *Op : I->operands())
21340 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21341 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21342 if (R.isDeleted(I))
21343 break;
21344 }
21345 }
21346 // Try to vectorize operands as vector bundles.
21347 for (CmpInst *I : CmpInsts) {
21348 if (R.isDeleted(I))
21349 continue;
21350 Changed |= tryToVectorize(I, R);
21351 }
21352 // Try to vectorize list of compares.
21353 // Sort by type, compare predicate, etc.
21354 auto CompareSorter = [&](Value *V, Value *V2) {
21355 if (V == V2)
21356 return false;
21357 return compareCmp<false>(V, V2, *TLI, *DT);
21358 };
21359
21360 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21361 if (V1 == V2)
21362 return true;
21363 return compareCmp<true>(V1, V2, *TLI, *DT);
21364 };
21365
21367 for (Instruction *V : CmpInsts)
21368 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21369 Vals.push_back(V);
21370 if (Vals.size() <= 1)
21371 return Changed;
21372 Changed |= tryToVectorizeSequence<Value>(
21373 Vals, CompareSorter, AreCompatibleCompares,
21374 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21375 // Exclude possible reductions from other blocks.
21376 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21377 return any_of(V->users(), [V](User *U) {
21378 auto *Select = dyn_cast<SelectInst>(U);
21379 return Select &&
21380 Select->getParent() != cast<Instruction>(V)->getParent();
21381 });
21382 });
21383 if (ArePossiblyReducedInOtherBlock)
21384 return false;
21385 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21386 },
21387 /*MaxVFOnly=*/true, R);
21388 return Changed;
21389}
21390
21391bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21392 BasicBlock *BB, BoUpSLP &R) {
21393 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21394 "This function only accepts Insert instructions");
21395 bool OpsChanged = false;
21396 SmallVector<WeakTrackingVH> PostponedInsts;
21397 for (auto *I : reverse(Instructions)) {
21398 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21399 if (R.isDeleted(I) || isa<CmpInst>(I))
21400 continue;
21401 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21402 OpsChanged |=
21403 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21404 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21405 OpsChanged |=
21406 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21407 }
21408 // pass2 - try to vectorize reductions only
21409 if (R.isDeleted(I))
21410 continue;
21411 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21412 if (R.isDeleted(I) || isa<CmpInst>(I))
21413 continue;
21414 // pass3 - try to match and vectorize a buildvector sequence.
21415 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21416 OpsChanged |=
21417 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21418 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21419 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21420 /*MaxVFOnly=*/false);
21421 }
21422 }
21423 // Now try to vectorize postponed instructions.
21424 OpsChanged |= tryToVectorize(PostponedInsts, R);
21425
21426 Instructions.clear();
21427 return OpsChanged;
21428}
21429
21430bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21431 bool Changed = false;
21433 SmallPtrSet<Value *, 16> VisitedInstrs;
21434 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21435 // node. Allows better to identify the chains that can be vectorized in the
21436 // better way.
21438 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21440 isValidElementType(V2->getType()) &&
21441 "Expected vectorizable types only.");
21442 // It is fine to compare type IDs here, since we expect only vectorizable
21443 // types, like ints, floats and pointers, we don't care about other type.
21444 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21445 return true;
21446 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21447 return false;
21448 if (V1->getType()->getScalarSizeInBits() <
21449 V2->getType()->getScalarSizeInBits())
21450 return true;
21451 if (V1->getType()->getScalarSizeInBits() >
21452 V2->getType()->getScalarSizeInBits())
21453 return false;
21454 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21455 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21456 if (Opcodes1.size() < Opcodes2.size())
21457 return true;
21458 if (Opcodes1.size() > Opcodes2.size())
21459 return false;
21460 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21461 {
21462 // Instructions come first.
21463 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21464 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21465 if (I1 && I2) {
21466 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21467 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21468 if (!NodeI1)
21469 return NodeI2 != nullptr;
21470 if (!NodeI2)
21471 return false;
21472 assert((NodeI1 == NodeI2) ==
21473 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21474 "Different nodes should have different DFS numbers");
21475 if (NodeI1 != NodeI2)
21476 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21477 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21478 if (S && !S.isAltShuffle())
21479 continue;
21480 return I1->getOpcode() < I2->getOpcode();
21481 }
21482 if (I1)
21483 return true;
21484 if (I2)
21485 return false;
21486 }
21487 {
21488 // Non-undef constants come next.
21489 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21490 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21491 if (C1 && C2)
21492 continue;
21493 if (C1)
21494 return true;
21495 if (C2)
21496 return false;
21497 }
21498 bool U1 = isa<UndefValue>(Opcodes1[I]);
21499 bool U2 = isa<UndefValue>(Opcodes2[I]);
21500 {
21501 // Non-constant non-instructions come next.
21502 if (!U1 && !U2) {
21503 auto ValID1 = Opcodes1[I]->getValueID();
21504 auto ValID2 = Opcodes2[I]->getValueID();
21505 if (ValID1 == ValID2)
21506 continue;
21507 if (ValID1 < ValID2)
21508 return true;
21509 if (ValID1 > ValID2)
21510 return false;
21511 }
21512 if (!U1)
21513 return true;
21514 if (!U2)
21515 return false;
21516 }
21517 // Undefs come last.
21518 assert(U1 && U2 && "The only thing left should be undef & undef.");
21519 }
21520 return false;
21521 };
21522 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21523 if (V1 == V2)
21524 return true;
21525 if (V1->getType() != V2->getType())
21526 return false;
21527 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21528 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21529 if (Opcodes1.size() != Opcodes2.size())
21530 return false;
21531 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21532 // Undefs are compatible with any other value.
21533 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21534 continue;
21535 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21536 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21537 if (R.isDeleted(I1) || R.isDeleted(I2))
21538 return false;
21539 if (I1->getParent() != I2->getParent())
21540 return false;
21541 if (getSameOpcode({I1, I2}, *TLI))
21542 continue;
21543 return false;
21544 }
21545 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21546 continue;
21547 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21548 return false;
21549 }
21550 return true;
21551 };
21552
21553 bool HaveVectorizedPhiNodes = false;
21554 do {
21555 // Collect the incoming values from the PHIs.
21556 Incoming.clear();
21557 for (Instruction &I : *BB) {
21558 auto *P = dyn_cast<PHINode>(&I);
21559 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21560 break;
21561
21562 // No need to analyze deleted, vectorized and non-vectorizable
21563 // instructions.
21564 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21565 isValidElementType(P->getType()))
21566 Incoming.push_back(P);
21567 }
21568
21569 if (Incoming.size() <= 1)
21570 break;
21571
21572 // Find the corresponding non-phi nodes for better matching when trying to
21573 // build the tree.
21574 for (Value *V : Incoming) {
21575 SmallVectorImpl<Value *> &Opcodes =
21576 PHIToOpcodes.try_emplace(V).first->getSecond();
21577 if (!Opcodes.empty())
21578 continue;
21579 SmallVector<Value *, 4> Nodes(1, V);
21581 while (!Nodes.empty()) {
21582 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21583 if (!Visited.insert(PHI).second)
21584 continue;
21585 for (Value *V : PHI->incoming_values()) {
21586 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21587 Nodes.push_back(PHI1);
21588 continue;
21589 }
21590 Opcodes.emplace_back(V);
21591 }
21592 }
21593 }
21594
21595 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21596 Incoming, PHICompare, AreCompatiblePHIs,
21597 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21598 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21599 },
21600 /*MaxVFOnly=*/true, R);
21601 Changed |= HaveVectorizedPhiNodes;
21602 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21603 auto *PHI = dyn_cast<PHINode>(P.first);
21604 return !PHI || R.isDeleted(PHI);
21605 }))
21606 PHIToOpcodes.clear();
21607 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21608 } while (HaveVectorizedPhiNodes);
21609
21610 VisitedInstrs.clear();
21611
21612 InstSetVector PostProcessInserts;
21613 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21614 // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true
21615 // also vectorizes `PostProcessCmps`.
21616 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21617 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21618 if (VectorizeCmps) {
21619 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21620 PostProcessCmps.clear();
21621 }
21622 PostProcessInserts.clear();
21623 return Changed;
21624 };
21625 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21626 auto IsInPostProcessInstrs = [&](Instruction *I) {
21627 if (auto *Cmp = dyn_cast<CmpInst>(I))
21628 return PostProcessCmps.contains(Cmp);
21629 return isa<InsertElementInst, InsertValueInst>(I) &&
21630 PostProcessInserts.contains(I);
21631 };
21632 // Returns true if `I` is an instruction without users, like terminator, or
21633 // function call with ignored return value, store. Ignore unused instructions
21634 // (basing on instruction type, except for CallInst and InvokeInst).
21635 auto HasNoUsers = [](Instruction *I) {
21636 return I->use_empty() &&
21637 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21638 };
21639 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21640 // Skip instructions with scalable type. The num of elements is unknown at
21641 // compile-time for scalable type.
21642 if (isa<ScalableVectorType>(It->getType()))
21643 continue;
21644
21645 // Skip instructions marked for the deletion.
21646 if (R.isDeleted(&*It))
21647 continue;
21648 // We may go through BB multiple times so skip the one we have checked.
21649 if (!VisitedInstrs.insert(&*It).second) {
21650 if (HasNoUsers(&*It) &&
21651 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21652 // We would like to start over since some instructions are deleted
21653 // and the iterator may become invalid value.
21654 Changed = true;
21655 It = BB->begin();
21656 E = BB->end();
21657 }
21658 continue;
21659 }
21660
21661 if (isa<DbgInfoIntrinsic>(It))
21662 continue;
21663
21664 // Try to vectorize reductions that use PHINodes.
21665 if (PHINode *P = dyn_cast<PHINode>(It)) {
21666 // Check that the PHI is a reduction PHI.
21667 if (P->getNumIncomingValues() == 2) {
21668 // Try to match and vectorize a horizontal reduction.
21669 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21670 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21671 Changed = true;
21672 It = BB->begin();
21673 E = BB->end();
21674 continue;
21675 }
21676 }
21677 // Try to vectorize the incoming values of the PHI, to catch reductions
21678 // that feed into PHIs.
21679 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21680 // Skip if the incoming block is the current BB for now. Also, bypass
21681 // unreachable IR for efficiency and to avoid crashing.
21682 // TODO: Collect the skipped incoming values and try to vectorize them
21683 // after processing BB.
21684 if (BB == P->getIncomingBlock(I) ||
21685 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21686 continue;
21687
21688 // Postponed instructions should not be vectorized here, delay their
21689 // vectorization.
21690 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21691 PI && !IsInPostProcessInstrs(PI)) {
21692 bool Res =
21693 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21694 Changed |= Res;
21695 if (Res && R.isDeleted(P)) {
21696 It = BB->begin();
21697 E = BB->end();
21698 break;
21699 }
21700 }
21701 }
21702 continue;
21703 }
21704
21705 if (HasNoUsers(&*It)) {
21706 bool OpsChanged = false;
21707 auto *SI = dyn_cast<StoreInst>(It);
21708 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21709 if (SI) {
21710 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21711 // Try to vectorize chain in store, if this is the only store to the
21712 // address in the block.
21713 // TODO: This is just a temporarily solution to save compile time. Need
21714 // to investigate if we can safely turn on slp-vectorize-hor-store
21715 // instead to allow lookup for reduction chains in all non-vectorized
21716 // stores (need to check side effects and compile time).
21717 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21718 SI->getValueOperand()->hasOneUse();
21719 }
21720 if (TryToVectorizeRoot) {
21721 for (auto *V : It->operand_values()) {
21722 // Postponed instructions should not be vectorized here, delay their
21723 // vectorization.
21724 if (auto *VI = dyn_cast<Instruction>(V);
21725 VI && !IsInPostProcessInstrs(VI))
21726 // Try to match and vectorize a horizontal reduction.
21727 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21728 }
21729 }
21730 // Start vectorization of post-process list of instructions from the
21731 // top-tree instructions to try to vectorize as many instructions as
21732 // possible.
21733 OpsChanged |=
21734 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21735 if (OpsChanged) {
21736 // We would like to start over since some instructions are deleted
21737 // and the iterator may become invalid value.
21738 Changed = true;
21739 It = BB->begin();
21740 E = BB->end();
21741 continue;
21742 }
21743 }
21744
21745 if (isa<InsertElementInst, InsertValueInst>(It))
21746 PostProcessInserts.insert(&*It);
21747 else if (isa<CmpInst>(It))
21748 PostProcessCmps.insert(cast<CmpInst>(&*It));
21749 }
21750
21751 return Changed;
21752}
21753
21754bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21755 auto Changed = false;
21756 for (auto &Entry : GEPs) {
21757 // If the getelementptr list has fewer than two elements, there's nothing
21758 // to do.
21759 if (Entry.second.size() < 2)
21760 continue;
21761
21762 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21763 << Entry.second.size() << ".\n");
21764
21765 // Process the GEP list in chunks suitable for the target's supported
21766 // vector size. If a vector register can't hold 1 element, we are done. We
21767 // are trying to vectorize the index computations, so the maximum number of
21768 // elements is based on the size of the index expression, rather than the
21769 // size of the GEP itself (the target's pointer size).
21770 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21771 return !R.isDeleted(GEP);
21772 });
21773 if (It == Entry.second.end())
21774 continue;
21775 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21776 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21777 if (MaxVecRegSize < EltSize)
21778 continue;
21779
21780 unsigned MaxElts = MaxVecRegSize / EltSize;
21781 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21782 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21783 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21784
21785 // Initialize a set a candidate getelementptrs. Note that we use a
21786 // SetVector here to preserve program order. If the index computations
21787 // are vectorizable and begin with loads, we want to minimize the chance
21788 // of having to reorder them later.
21789 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21790
21791 // Some of the candidates may have already been vectorized after we
21792 // initially collected them or their index is optimized to constant value.
21793 // If so, they are marked as deleted, so remove them from the set of
21794 // candidates.
21795 Candidates.remove_if([&R](Value *I) {
21796 return R.isDeleted(cast<Instruction>(I)) ||
21797 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21798 });
21799
21800 // Remove from the set of candidates all pairs of getelementptrs with
21801 // constant differences. Such getelementptrs are likely not good
21802 // candidates for vectorization in a bottom-up phase since one can be
21803 // computed from the other. We also ensure all candidate getelementptr
21804 // indices are unique.
21805 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21806 auto *GEPI = GEPList[I];
21807 if (!Candidates.count(GEPI))
21808 continue;
21809 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21810 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21811 auto *GEPJ = GEPList[J];
21812 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21813 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21814 Candidates.remove(GEPI);
21815 Candidates.remove(GEPJ);
21816 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21817 Candidates.remove(GEPJ);
21818 }
21819 }
21820 }
21821
21822 // We break out of the above computation as soon as we know there are
21823 // fewer than two candidates remaining.
21824 if (Candidates.size() < 2)
21825 continue;
21826
21827 // Add the single, non-constant index of each candidate to the bundle. We
21828 // ensured the indices met these constraints when we originally collected
21829 // the getelementptrs.
21830 SmallVector<Value *, 16> Bundle(Candidates.size());
21831 auto BundleIndex = 0u;
21832 for (auto *V : Candidates) {
21833 auto *GEP = cast<GetElementPtrInst>(V);
21834 auto *GEPIdx = GEP->idx_begin()->get();
21835 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21836 Bundle[BundleIndex++] = GEPIdx;
21837 }
21838
21839 // Try and vectorize the indices. We are currently only interested in
21840 // gather-like cases of the form:
21841 //
21842 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21843 //
21844 // where the loads of "a", the loads of "b", and the subtractions can be
21845 // performed in parallel. It's likely that detecting this pattern in a
21846 // bottom-up phase will be simpler and less costly than building a
21847 // full-blown top-down phase beginning at the consecutive loads.
21848 Changed |= tryToVectorizeList(Bundle, R);
21849 }
21850 }
21851 return Changed;
21852}
21853
21854bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21855 bool Changed = false;
21856 // Sort by type, base pointers and values operand. Value operands must be
21857 // compatible (have the same opcode, same parent), otherwise it is
21858 // definitely not profitable to try to vectorize them.
21859 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21860 if (V->getValueOperand()->getType()->getTypeID() <
21861 V2->getValueOperand()->getType()->getTypeID())
21862 return true;
21863 if (V->getValueOperand()->getType()->getTypeID() >
21864 V2->getValueOperand()->getType()->getTypeID())
21865 return false;
21866 if (V->getPointerOperandType()->getTypeID() <
21867 V2->getPointerOperandType()->getTypeID())
21868 return true;
21869 if (V->getPointerOperandType()->getTypeID() >
21870 V2->getPointerOperandType()->getTypeID())
21871 return false;
21872 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21873 V2->getValueOperand()->getType()->getScalarSizeInBits())
21874 return true;
21875 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21876 V2->getValueOperand()->getType()->getScalarSizeInBits())
21877 return false;
21878 // UndefValues are compatible with all other values.
21879 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21880 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21882 DT->getNode(I1->getParent());
21884 DT->getNode(I2->getParent());
21885 assert(NodeI1 && "Should only process reachable instructions");
21886 assert(NodeI2 && "Should only process reachable instructions");
21887 assert((NodeI1 == NodeI2) ==
21888 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21889 "Different nodes should have different DFS numbers");
21890 if (NodeI1 != NodeI2)
21891 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21892 return I1->getOpcode() < I2->getOpcode();
21893 }
21894 return V->getValueOperand()->getValueID() <
21895 V2->getValueOperand()->getValueID();
21896 };
21897
21898 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21899 if (V1 == V2)
21900 return true;
21901 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21902 return false;
21903 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21904 return false;
21905 // Undefs are compatible with any other value.
21906 if (isa<UndefValue>(V1->getValueOperand()) ||
21907 isa<UndefValue>(V2->getValueOperand()))
21908 return true;
21909 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21910 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21911 if (I1->getParent() != I2->getParent())
21912 return false;
21913 return getSameOpcode({I1, I2}, *TLI).valid();
21914 }
21915 if (isa<Constant>(V1->getValueOperand()) &&
21916 isa<Constant>(V2->getValueOperand()))
21917 return true;
21918 return V1->getValueOperand()->getValueID() ==
21919 V2->getValueOperand()->getValueID();
21920 };
21921
21922 // Attempt to sort and vectorize each of the store-groups.
21924 for (auto &Pair : Stores) {
21925 if (Pair.second.size() < 2)
21926 continue;
21927
21928 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21929 << Pair.second.size() << ".\n");
21930
21931 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21932 continue;
21933
21934 // Reverse stores to do bottom-to-top analysis. This is important if the
21935 // values are stores to the same addresses several times, in this case need
21936 // to follow the stores order (reversed to meet the memory dependecies).
21937 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21938 Pair.second.rend());
21939 Changed |= tryToVectorizeSequence<StoreInst>(
21940 ReversedStores, StoreSorter, AreCompatibleStores,
21941 [&](ArrayRef<StoreInst *> Candidates, bool) {
21942 return vectorizeStores(Candidates, R, Attempted);
21943 },
21944 /*MaxVFOnly=*/false, R);
21945 }
21946 return Changed;
21947}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1978
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1873
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2115
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1972
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1207
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1285
unsigned arg_size() const
Definition: InstrTypes.h:1292
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1502
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1969
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:988
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:108
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2280
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1066
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2505
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:525
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1074
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2493
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:553
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2288
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1809
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:480
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:189
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2568
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2180
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:188
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:325
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:234
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1868
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:505
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:861
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1755
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:500
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2398
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2146
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2272
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:871
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2527
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:485
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2443
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1665
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:183
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2296
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2219
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:194
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1828
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1608
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1398
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2699
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:283
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:763
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true is the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
\Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cummulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the opearnd at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not schedule.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:250
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2136
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.