LLVM 20.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107
108#define SV_NAME "slp-vectorizer"
109#define DEBUG_TYPE "SLP"
110
111STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
112
113DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
114 "Controls which SLP graphs should be vectorized.");
115
116static cl::opt<bool>
117 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
118 cl::desc("Run the SLP vectorization passes"));
119
120static cl::opt<bool>
121 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
122 cl::desc("Enable vectorization for wider vector utilization"));
123
124static cl::opt<int>
126 cl::desc("Only vectorize if you gain more than this "
127 "number "));
128
130 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
131 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
132 "heuristics and makes vectorization decision via cost modeling."));
133
134static cl::opt<bool>
135ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
136 cl::desc("Attempt to vectorize horizontal reductions"));
137
139 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
140 cl::desc(
141 "Attempt to vectorize horizontal reductions feeding into a store"));
142
143static cl::opt<int>
145 cl::desc("Attempt to vectorize for this register size in bits"));
146
149 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
150
151/// Limits the size of scheduling regions in a block.
152/// It avoids long compile times for _very_ large blocks where vector
153/// instructions are spread over a wide range.
154/// This limit is way higher than needed by real-world functions.
155static cl::opt<int>
156ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
157 cl::desc("Limit the size of the SLP scheduling region per block"));
158
160 "slp-min-reg-size", cl::init(128), cl::Hidden,
161 cl::desc("Attempt to vectorize for this register size in bits"));
162
164 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
165 cl::desc("Limit the recursion depth when building a vectorizable tree"));
166
168 "slp-min-tree-size", cl::init(3), cl::Hidden,
169 cl::desc("Only vectorize small trees if they are fully vectorizable"));
170
171// The maximum depth that the look-ahead score heuristic will explore.
172// The higher this value, the higher the compilation time overhead.
174 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
175 cl::desc("The maximum look-ahead depth for operand reordering scores"));
176
177// The maximum depth that the look-ahead score heuristic will explore
178// when it is probing among candidates for vectorization tree roots.
179// The higher this value, the higher the compilation time overhead, but unlike
180// the similar limit for operand ordering this is used less frequently, hence
181// the impact of a higher value is less noticeable.
183 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
184 cl::desc("The maximum look-ahead depth for searching best rooting option"));
185
187 "slp-min-strided-loads", cl::init(2), cl::Hidden,
188 cl::desc("The minimum number of loads, which should be considered strided, "
189 "if the stride is > 1 or is runtime value"));
190
192 "slp-max-stride", cl::init(8), cl::Hidden,
193 cl::desc("The maximum stride, considered to be profitable."));
194
195static cl::opt<bool>
196 ViewSLPTree("view-slp-tree", cl::Hidden,
197 cl::desc("Display the SLP trees with Graphviz"));
198
200 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
201 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
202
203// Limit the number of alias checks. The limit is chosen so that
204// it has no negative effect on the llvm benchmarks.
205static const unsigned AliasedCheckLimit = 10;
206
207// Limit of the number of uses for potentially transformed instructions/values,
208// used in checks to avoid compile-time explosion.
209static constexpr int UsesLimit = 64;
210
211// Another limit for the alias checks: The maximum distance between load/store
212// instructions where alias checks are done.
213// This limit is useful for very large basic blocks.
214static const unsigned MaxMemDepDistance = 160;
215
216/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
217/// regions to be handled.
218static const int MinScheduleRegionSize = 16;
219
220/// Maximum allowed number of operands in the PHI nodes.
221static const unsigned MaxPHINumOperands = 128;
222
223/// Predicate for the element types that the SLP vectorizer supports.
224///
225/// The most important thing to filter here are types which are invalid in LLVM
226/// vectors. We also filter target specific types which have absolutely no
227/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
228/// avoids spending time checking the cost model and realizing that they will
229/// be inevitably scalarized.
230static bool isValidElementType(Type *Ty) {
231 // TODO: Support ScalableVectorType.
232 if (SLPReVec && isa<FixedVectorType>(Ty))
233 Ty = Ty->getScalarType();
234 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
235 !Ty->isPPC_FP128Ty();
236}
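// For illustration: i32, float, and ptr pass this predicate, while x86_fp80,
// ppc_fp128, and (without -slp-revec) a fixed vector such as <4 x i32> do not;
// with -slp-revec the fixed vector is first reduced to its scalar type i32.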
237
238/// Returns the type of the given value/instruction \p V. If it is a store,
239/// returns the type of its value operand; for Cmp - the type of the compare
240/// operands; and for insertelement - the type of the inserted operand.
241/// Otherwise, just the type of the value is returned.
243 if (auto *SI = dyn_cast<StoreInst>(V))
244 return SI->getValueOperand()->getType();
245 if (auto *CI = dyn_cast<CmpInst>(V))
246 return CI->getOperand(0)->getType();
247 if (auto *IE = dyn_cast<InsertElementInst>(V))
248 return IE->getOperand(1)->getType();
249 return V->getType();
250}
251
252/// \returns the number of elements for Ty.
253static unsigned getNumElements(Type *Ty) {
254 assert(!isa<ScalableVectorType>(Ty) &&
255 "ScalableVectorType is not supported.");
256 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
257 return VecTy->getNumElements();
258 return 1;
259}
260
261/// \returns the vector type of ScalarTy based on vectorization factor.
262static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
263 return FixedVectorType::get(ScalarTy->getScalarType(),
264 VF * getNumElements(ScalarTy));
265}
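// A brief sketch of the intent: for scalar types getWidenedType(i32, 4) yields
// <4 x i32>; under REVEC, where ScalarTy may itself be a vector,
// getWidenedType(<2 x float>, 4) yields <8 x float> (4 * 2 elements of the
// element type float).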
266
267/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
268/// which forms a type that \p TTI splits into whole vector types during
269/// legalization.
271 Type *Ty, unsigned Sz) {
272 if (!isValidElementType(Ty))
273 return bit_ceil(Sz);
274 // Find the number of elements, which forms full vectors.
275 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
276 if (NumParts == 0 || NumParts >= Sz)
277 return bit_ceil(Sz);
278 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279}
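// Worked example for the helper above, assuming a target where <6 x i32>
// legalizes into 2 parts (e.g. 128-bit registers): NumParts = 2, so the result
// is bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8 elements, i.e. two whole
// <4 x i32> registers.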
280
281/// Returns the number of elements of the given type \p Ty, not greater than \p
282/// Sz, which forms a type that \p TTI splits into whole vector types during
283/// legalization.
284static unsigned
286 unsigned Sz) {
287 if (!isValidElementType(Ty))
288 return bit_floor(Sz);
289 // Find the number of elements, which forms full vectors.
290 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291 if (NumParts == 0 || NumParts >= Sz)
292 return bit_floor(Sz);
293 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294 if (RegVF > Sz)
295 return bit_floor(Sz);
296 return (Sz / RegVF) * RegVF;
297}
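// Worked example for the floor variant, under the same 128-bit assumption: for
// Sz = 7 i32 elements, NumParts = 2, RegVF = bit_ceil(divideCeil(7, 2)) = 4,
// and the result is (7 / 4) * 4 = 4, i.e. the largest element count not
// exceeding 7 that still fills whole registers.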
298
299static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
300 SmallVectorImpl<int> &Mask) {
301 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
302 // But an element has a different meaning for SLP (scalar) and REVEC
303 // (vector). We need to expand Mask into masks which shufflevector can use
304 // directly.
305 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
306 for (unsigned I : seq<unsigned>(Mask.size()))
307 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
308 I * VecTyNumElements, VecTyNumElements)))
309 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
310 : Mask[I] * VecTyNumElements + J;
311 Mask.swap(NewMask);
312}
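// For example, with VecTyNumElements = 2 a scalar-level mask [1, 0] expands to
// the element-level mask [2, 3, 0, 1]; a poison entry expands to
// VecTyNumElements poison entries.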
313
314/// \returns the number of groups of shufflevectors.
315/// A group has the following features:
316/// 1. All of the values in a group are shufflevectors.
317/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
318/// 3. Together, the masks of the shufflevectors use all of the elements of the source.
319/// e.g., it is 1 group (%0)
320/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
321/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
322/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
323/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
324/// it is 2 groups (%3 and %4)
325/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
326/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
327/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
328/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
329/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
330/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
331/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
332/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
333/// it is 0 groups
334/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
335/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
336/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
337/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
338static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
339 if (VL.empty())
340 return 0;
341 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
342 return 0;
343 auto *SV = cast<ShuffleVectorInst>(VL.front());
344 unsigned SVNumElements =
345 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
346 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
347 if (SVNumElements % ShuffleMaskSize != 0)
348 return 0;
349 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
350 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
351 return 0;
352 unsigned NumGroup = 0;
353 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
354 auto *SV = cast<ShuffleVectorInst>(VL[I]);
355 Value *Src = SV->getOperand(0);
356 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
357 SmallBitVector ExpectedIndex(GroupSize);
358 if (!all_of(Group, [&](Value *V) {
359 auto *SV = cast<ShuffleVectorInst>(V);
360 // From the same source.
361 if (SV->getOperand(0) != Src)
362 return false;
363 int Index;
364 if (!SV->isExtractSubvectorMask(Index))
365 return false;
366 ExpectedIndex.set(Index / ShuffleMaskSize);
367 return true;
368 }))
369 return 0;
370 if (!ExpectedIndex.all())
371 return 0;
372 ++NumGroup;
373 }
374 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
375 return NumGroup;
376}
377
378/// \returns a shufflevector mask which is used to vectorize shufflevectors
379/// e.g.,
380/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
381/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
382/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
383/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
384/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
385/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
386/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
387/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
388/// the result is
389/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
391 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
392 auto *SV = cast<ShuffleVectorInst>(VL.front());
393 unsigned SVNumElements =
394 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
395 SmallVector<int> Mask;
396 unsigned AccumulateLength = 0;
397 for (Value *V : VL) {
398 auto *SV = cast<ShuffleVectorInst>(V);
399 for (int M : SV->getShuffleMask())
400 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
401 : AccumulateLength + M);
402 AccumulateLength += SVNumElements;
403 }
404 return Mask;
405}
406
407/// \returns True if the value is a constant (but not globals/constant
408/// expressions).
409static bool isConstant(Value *V) {
410 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
411}
412
413/// Checks if \p V is one of the vector-like instructions, i.e. undef,
414/// insertelement/extractelement with constant indices for a fixed vector type,
415/// or an extractvalue instruction.
417 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
418 !isa<ExtractValueInst, UndefValue>(V))
419 return false;
420 auto *I = dyn_cast<Instruction>(V);
421 if (!I || isa<ExtractValueInst>(I))
422 return true;
423 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
424 return false;
425 if (isa<ExtractElementInst>(I))
426 return isConstant(I->getOperand(1));
427 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
428 return isConstant(I->getOperand(2));
429}
430
431/// Returns power-of-2 number of elements in a single register (part), given the
432/// total number of elements \p Size and number of registers (parts) \p
433/// NumParts.
434static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
435 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
436}
437
438/// Returns correct remaining number of elements, considering total amount \p
439/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
440/// and current register (part) \p Part.
441static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
442 unsigned Part) {
443 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
444}
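// A small worked example for the two helpers above: with Size = 6 and
// NumParts = 2, getPartNumElems(6, 2) = min(6, bit_ceil(3)) = 4, and then
// getNumElems(6, 4, /*Part=*/0) = 4 while getNumElems(6, 4, /*Part=*/1) = 2
// for the tail register.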
445
446#if !defined(NDEBUG)
447/// Print a short descriptor of the instruction bundle suitable for debug output.
448static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
449 std::string Result;
450 raw_string_ostream OS(Result);
451 if (Idx >= 0)
452 OS << "Idx: " << Idx << ", ";
453 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
454 return Result;
455}
456#endif
457
458/// \returns true if all of the instructions in \p VL are in the same block or
459/// false otherwise.
461 auto *It = find_if(VL, IsaPred<Instruction>);
462 if (It == VL.end())
463 return false;
464 Instruction *I0 = cast<Instruction>(*It);
465 if (all_of(VL, isVectorLikeInstWithConstOps))
466 return true;
467
468 BasicBlock *BB = I0->getParent();
469 for (Value *V : iterator_range(It, VL.end())) {
470 if (isa<PoisonValue>(V))
471 continue;
472 auto *II = dyn_cast<Instruction>(V);
473 if (!II)
474 return false;
475
476 if (BB != II->getParent())
477 return false;
478 }
479 return true;
480}
481
482/// \returns True if all of the values in \p VL are constants (but not
483/// globals/constant expressions).
485 // Constant expressions and globals can't be vectorized like normal integer/FP
486 // constants.
487 return all_of(VL, isConstant);
488}
489
490/// \returns True if all of the values in \p VL are identical or some of them
491/// are UndefValue.
492static bool isSplat(ArrayRef<Value *> VL) {
493 Value *FirstNonUndef = nullptr;
494 for (Value *V : VL) {
495 if (isa<UndefValue>(V))
496 continue;
497 if (!FirstNonUndef) {
498 FirstNonUndef = V;
499 continue;
500 }
501 if (V != FirstNonUndef)
502 return false;
503 }
504 return FirstNonUndef != nullptr;
505}
506
507/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
509 if (auto *Cmp = dyn_cast<CmpInst>(I))
510 return Cmp->isCommutative();
511 if (auto *BO = dyn_cast<BinaryOperator>(I))
512 return BO->isCommutative() ||
513 (BO->getOpcode() == Instruction::Sub &&
514 !BO->hasNUsesOrMore(UsesLimit) &&
515 all_of(
516 BO->uses(),
517 [](const Use &U) {
518 // Commutative, if icmp eq/ne sub, 0
519 CmpPredicate Pred;
520 if (match(U.getUser(),
521 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
522 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
523 return true;
524 // Commutative, if abs(sub nsw, true) or abs(sub, false).
525 ConstantInt *Flag;
526 return match(U.getUser(),
527 m_Intrinsic<Intrinsic::abs>(
528 m_Specific(U.get()), m_ConstantInt(Flag))) &&
529 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
530 Flag->isOne());
531 })) ||
532 (BO->getOpcode() == Instruction::FSub &&
533 !BO->hasNUsesOrMore(UsesLimit) &&
534 all_of(BO->uses(), [](const Use &U) {
535 return match(U.getUser(),
536 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
537 }));
538 return I->isCommutative();
539}
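// Illustrative IR for the Sub special case handled above: in
// %d = sub i32 %a, %b
// %c = icmp eq i32 %d, 0
// the sub is treated as commutative, because icmp eq (a - b), 0 is equivalent
// to icmp eq (b - a), 0; the same holds when the only user is @llvm.abs with
// the appropriate flags.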
540
541template <typename T>
542static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
543 unsigned Offset) {
544 static_assert(std::is_same_v<T, InsertElementInst> ||
545 std::is_same_v<T, ExtractElementInst>,
546 "unsupported T");
547 int Index = Offset;
548 if (const auto *IE = dyn_cast<T>(Inst)) {
549 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
550 if (!VT)
551 return std::nullopt;
552 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
553 if (!CI)
554 return std::nullopt;
555 if (CI->getValue().uge(VT->getNumElements()))
556 return std::nullopt;
557 Index *= VT->getNumElements();
558 Index += CI->getZExtValue();
559 return Index;
560 }
561 return std::nullopt;
562}
563
564/// \returns inserting or extracting index of InsertElement, ExtractElement or
565/// InsertValue instruction, using Offset as base offset for index.
566/// \returns std::nullopt if the index is not an immediate.
567static std::optional<unsigned> getElementIndex(const Value *Inst,
568 unsigned Offset = 0) {
569 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
570 return Index;
571 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
572 return Index;
573
574 int Index = Offset;
575
576 const auto *IV = dyn_cast<InsertValueInst>(Inst);
577 if (!IV)
578 return std::nullopt;
579
580 Type *CurrentType = IV->getType();
581 for (unsigned I : IV->indices()) {
582 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
583 Index *= ST->getNumElements();
584 CurrentType = ST->getElementType(I);
585 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
586 Index *= AT->getNumElements();
587 CurrentType = AT->getElementType();
588 } else {
589 return std::nullopt;
590 }
591 Index += I;
592 }
593 return Index;
594}
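// Worked example for the insertvalue path: for an aggregate of type
// {[2 x i32], [2 x i32]} and indices (1, 0), the index is flattened as
// 0 * 2 + 1 = 1 for the outer struct, then 1 * 2 + 0 = 2 for the inner array,
// so the function returns 2.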
595
596namespace {
597/// Specifies the way the mask should be analyzed for undefs/poisonous elements
598/// in the shuffle mask.
599enum class UseMask {
600 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
601 ///< check for the mask elements for the first argument (mask
602 ///< indices are in range [0:VF)).
603 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
604 ///< for the mask elements for the second argument (mask indices
605 ///< are in range [VF:2*VF))
606 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
607 ///< future shuffle elements and mark them as ones as being used
608 ///< in future. Non-undef elements are considered as unused since
609 ///< they're already marked as used in the mask.
610};
611} // namespace
612
613/// Prepares a use bitset for the given mask either for the first argument or
614/// for the second.
616 UseMask MaskArg) {
617 SmallBitVector UseMask(VF, true);
618 for (auto [Idx, Value] : enumerate(Mask)) {
619 if (Value == PoisonMaskElem) {
620 if (MaskArg == UseMask::UndefsAsMask)
621 UseMask.reset(Idx);
622 continue;
623 }
624 if (MaskArg == UseMask::FirstArg && Value < VF)
625 UseMask.reset(Value);
626 else if (MaskArg == UseMask::SecondArg && Value >= VF)
627 UseMask.reset(Value - VF);
628 }
629 return UseMask;
630}
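// For example, with VF = 4, Mask = {0, 2, poison, 5} and UseMask::FirstArg the
// result has bits 0 and 2 cleared (those lanes of the first argument are
// consumed); with UseMask::SecondArg only the index 5 matters and bit
// 5 - VF = 1 is cleared.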
631
632/// Checks if the given value is actually an undefined constant vector.
633/// Also, if the \p UseMask is not empty, tries to check if the non-masked
634/// elements actually mask the insertelement buildvector, if any.
635template <bool IsPoisonOnly = false>
637 const SmallBitVector &UseMask = {}) {
638 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
639 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
640 if (isa<T>(V))
641 return Res;
642 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
643 if (!VecTy)
644 return Res.reset();
645 auto *C = dyn_cast<Constant>(V);
646 if (!C) {
647 if (!UseMask.empty()) {
648 const Value *Base = V;
649 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
650 Base = II->getOperand(0);
651 if (isa<T>(II->getOperand(1)))
652 continue;
653 std::optional<unsigned> Idx = getElementIndex(II);
654 if (!Idx) {
655 Res.reset();
656 return Res;
657 }
658 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
659 Res.reset(*Idx);
660 }
661 // TODO: Add analysis for shuffles here too.
662 if (V == Base) {
663 Res.reset();
664 } else {
665 SmallBitVector SubMask(UseMask.size(), false);
666 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
667 }
668 } else {
669 Res.reset();
670 }
671 return Res;
672 }
673 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
674 if (Constant *Elem = C->getAggregateElement(I))
675 if (!isa<T>(Elem) &&
676 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
677 Res.reset(I);
678 }
679 return Res;
680}
681
682/// Checks if the vector of instructions can be represented as a shuffle, like:
683/// %x0 = extractelement <4 x i8> %x, i32 0
684/// %x3 = extractelement <4 x i8> %x, i32 3
685/// %y1 = extractelement <4 x i8> %y, i32 1
686/// %y2 = extractelement <4 x i8> %y, i32 2
687/// %x0x0 = mul i8 %x0, %x0
688/// %x3x3 = mul i8 %x3, %x3
689/// %y1y1 = mul i8 %y1, %y1
690/// %y2y2 = mul i8 %y2, %y2
691/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
692/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
693/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
694/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
695/// ret <4 x i8> %ins4
696/// can be transformed into:
697/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
698/// i32 6>
699/// %2 = mul <4 x i8> %1, %1
700/// ret <4 x i8> %2
701/// Mask will return the Shuffle Mask equivalent to the extracted elements.
702/// TODO: Can we split off and reuse the shuffle mask detection from
703/// ShuffleVectorInst/getShuffleCost?
704static std::optional<TargetTransformInfo::ShuffleKind>
706 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
707 if (It == VL.end())
708 return std::nullopt;
709 unsigned Size =
710 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
711 auto *EI = dyn_cast<ExtractElementInst>(V);
712 if (!EI)
713 return S;
714 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
715 if (!VTy)
716 return S;
717 return std::max(S, VTy->getNumElements());
718 });
719
720 Value *Vec1 = nullptr;
721 Value *Vec2 = nullptr;
722 bool HasNonUndefVec = any_of(VL, [](Value *V) {
723 auto *EE = dyn_cast<ExtractElementInst>(V);
724 if (!EE)
725 return false;
726 Value *Vec = EE->getVectorOperand();
727 if (isa<UndefValue>(Vec))
728 return false;
729 return isGuaranteedNotToBePoison(Vec);
730 });
731 enum ShuffleMode { Unknown, Select, Permute };
732 ShuffleMode CommonShuffleMode = Unknown;
733 Mask.assign(VL.size(), PoisonMaskElem);
734 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
735 // Undef can be represented as an undef element in a vector.
736 if (isa<UndefValue>(VL[I]))
737 continue;
738 auto *EI = cast<ExtractElementInst>(VL[I]);
739 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
740 return std::nullopt;
741 auto *Vec = EI->getVectorOperand();
742 // We can extractelement from undef or poison vector.
743 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
744 continue;
745 // All vector operands must have the same number of vector elements.
746 if (isa<UndefValue>(Vec)) {
747 Mask[I] = I;
748 } else {
749 if (isa<UndefValue>(EI->getIndexOperand()))
750 continue;
751 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
752 if (!Idx)
753 return std::nullopt;
754 // Undefined behavior if Idx is negative or >= Size.
755 if (Idx->getValue().uge(Size))
756 continue;
757 unsigned IntIdx = Idx->getValue().getZExtValue();
758 Mask[I] = IntIdx;
759 }
760 if (isUndefVector(Vec).all() && HasNonUndefVec)
761 continue;
762 // For correct shuffling we have to have at most 2 different vector operands
763 // in all extractelement instructions.
764 if (!Vec1 || Vec1 == Vec) {
765 Vec1 = Vec;
766 } else if (!Vec2 || Vec2 == Vec) {
767 Vec2 = Vec;
768 Mask[I] += Size;
769 } else {
770 return std::nullopt;
771 }
772 if (CommonShuffleMode == Permute)
773 continue;
774 // If the extract index is not the same as the operation number, it is a
775 // permutation.
776 if (Mask[I] % Size != I) {
777 CommonShuffleMode = Permute;
778 continue;
779 }
780 CommonShuffleMode = Select;
781 }
782 // If we're not crossing lanes in different vectors, consider it as blending.
783 if (CommonShuffleMode == Select && Vec2)
784 return TargetTransformInfo::SK_Select;
785 // If Vec2 was never used, we have a permutation of a single vector, otherwise
786 // we have a permutation of 2 vectors.
787 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
788 : TargetTransformInfo::SK_PermuteSingleSrc;
789}
790
791/// \returns True if Extract{Value,Element} instruction extracts element Idx.
792static std::optional<unsigned> getExtractIndex(Instruction *E) {
793 unsigned Opcode = E->getOpcode();
794 assert((Opcode == Instruction::ExtractElement ||
795 Opcode == Instruction::ExtractValue) &&
796 "Expected extractelement or extractvalue instruction.");
797 if (Opcode == Instruction::ExtractElement) {
798 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
799 if (!CI)
800 return std::nullopt;
801 return CI->getZExtValue();
802 }
803 auto *EI = cast<ExtractValueInst>(E);
804 if (EI->getNumIndices() != 1)
805 return std::nullopt;
806 return *EI->idx_begin();
807}
808
809namespace {
810
811/// Main data required for vectorization of instructions.
812class InstructionsState {
813 /// The main/alternate instruction. MainOp is also VL0.
814 Instruction *MainOp = nullptr;
815 Instruction *AltOp = nullptr;
816
817public:
818 Instruction *getMainOp() const { return MainOp; }
819
820 Instruction *getAltOp() const { return AltOp; }
821
822 /// The main/alternate opcodes for the list of instructions.
823 unsigned getOpcode() const {
824 return MainOp ? MainOp->getOpcode() : 0;
825 }
826
827 unsigned getAltOpcode() const {
828 return AltOp ? AltOp->getOpcode() : 0;
829 }
830
831 /// Some of the instructions in the list have alternate opcodes.
832 bool isAltShuffle() const { return AltOp != MainOp; }
833
834 bool isOpcodeOrAlt(Instruction *I) const {
835 unsigned CheckedOpcode = I->getOpcode();
836 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
837 }
838
839 InstructionsState() = delete;
840 InstructionsState(Instruction *MainOp, Instruction *AltOp)
841 : MainOp(MainOp), AltOp(AltOp) {}
842 static InstructionsState invalid() { return {nullptr, nullptr}; }
843};
844
845} // end anonymous namespace
846
847/// \returns true if \p Opcode is allowed as part of the main/alternate
848/// instruction for SLP vectorization.
849///
850/// Example of unsupported opcode is SDIV that can potentially cause UB if the
851/// "shuffled out" lane would result in division by zero.
852static bool isValidForAlternation(unsigned Opcode) {
853 if (Instruction::isIntDivRem(Opcode))
854 return false;
855
856 return true;
857}
858
859static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
860 const TargetLibraryInfo &TLI);
861
862/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
863/// compatible instructions or constants, or just some other regular values.
864static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
865 Value *Op1, const TargetLibraryInfo &TLI) {
866 return (isConstant(BaseOp0) && isConstant(Op0)) ||
867 (isConstant(BaseOp1) && isConstant(Op1)) ||
868 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
869 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
870 BaseOp0 == Op0 || BaseOp1 == Op1 ||
871 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
872 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
873}
874
875/// \returns true if a compare instruction \p CI has similar "look" and
876/// same predicate as \p BaseCI, "as is" or with its operands and predicate
877/// swapped, false otherwise.
878static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
879 const TargetLibraryInfo &TLI) {
880 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
881 "Assessing comparisons of different types?");
882 CmpInst::Predicate BasePred = BaseCI->getPredicate();
883 CmpInst::Predicate Pred = CI->getPredicate();
884 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(BasePred);
885
886 Value *BaseOp0 = BaseCI->getOperand(0);
887 Value *BaseOp1 = BaseCI->getOperand(1);
888 Value *Op0 = CI->getOperand(0);
889 Value *Op1 = CI->getOperand(1);
890
891 return (BasePred == Pred &&
892 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
893 (BasePred == SwappedPred &&
894 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
895}
896
897/// \returns analysis of the Instructions in \p VL described in
898/// InstructionsState, i.e. the Opcode with which we suppose the whole list
899/// could be vectorized, even if its structure is diverse.
900static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
901 const TargetLibraryInfo &TLI) {
902 // Make sure these are all Instructions.
903 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
904 return InstructionsState::invalid();
905
906 auto *It = find_if(VL, IsaPred<Instruction>);
907 if (It == VL.end())
908 return InstructionsState::invalid();
909
910 Value *V = *It;
911 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
912 if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
913 (VL.size() == 2 && InstCnt < 2))
914 return InstructionsState::invalid();
915
916 bool IsCastOp = isa<CastInst>(V);
917 bool IsBinOp = isa<BinaryOperator>(V);
918 bool IsCmpOp = isa<CmpInst>(V);
919 CmpInst::Predicate BasePred =
920 IsCmpOp ? cast<CmpInst>(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE;
921 unsigned Opcode = cast<Instruction>(V)->getOpcode();
922 unsigned AltOpcode = Opcode;
923 unsigned AltIndex = std::distance(VL.begin(), It);
924
925 bool SwappedPredsCompatible = [&]() {
926 if (!IsCmpOp)
927 return false;
928 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
929 UniquePreds.insert(BasePred);
930 UniqueNonSwappedPreds.insert(BasePred);
931 for (Value *V : VL) {
932 auto *I = dyn_cast<CmpInst>(V);
933 if (!I)
934 return false;
935 CmpInst::Predicate CurrentPred = I->getPredicate();
936 CmpInst::Predicate SwappedCurrentPred =
937 CmpInst::getSwappedPredicate(CurrentPred);
938 UniqueNonSwappedPreds.insert(CurrentPred);
939 if (!UniquePreds.contains(CurrentPred) &&
940 !UniquePreds.contains(SwappedCurrentPred))
941 UniquePreds.insert(CurrentPred);
942 }
943 // If the total number of predicates is > 2, but only 2 remain once swapped
944 // predicates are considered compatible, treat swappable predicates as
945 // compatible opcodes rather than as alternates.
946 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
947 }();
948 // Check for one alternate opcode from another BinaryOperator.
949 // TODO - generalize to support all operators (types, calls etc.).
950 auto *IBase = cast<Instruction>(V);
951 Intrinsic::ID BaseID = 0;
952 SmallVector<VFInfo> BaseMappings;
953 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
954 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
955 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
956 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
957 return InstructionsState::invalid();
958 }
959 bool AnyPoison = InstCnt != VL.size();
960 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
961 auto *I = dyn_cast<Instruction>(VL[Cnt]);
962 if (!I)
963 continue;
964
965 // Cannot combine poison and divisions.
966 // TODO: do some smart analysis of the CallInsts to exclude divide-like
967 // intrinsics/functions only.
968 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
969 return InstructionsState::invalid();
970 unsigned InstOpcode = I->getOpcode();
971 if (IsBinOp && isa<BinaryOperator>(I)) {
972 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
973 continue;
974 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
975 isValidForAlternation(Opcode)) {
976 AltOpcode = InstOpcode;
977 AltIndex = Cnt;
978 continue;
979 }
980 } else if (IsCastOp && isa<CastInst>(I)) {
981 Value *Op0 = IBase->getOperand(0);
982 Type *Ty0 = Op0->getType();
983 Value *Op1 = I->getOperand(0);
984 Type *Ty1 = Op1->getType();
985 if (Ty0 == Ty1) {
986 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
987 continue;
988 if (Opcode == AltOpcode) {
989 assert(isValidForAlternation(Opcode) &&
990 isValidForAlternation(InstOpcode) &&
991 "Cast isn't safe for alternation, logic needs to be updated!");
992 AltOpcode = InstOpcode;
993 AltIndex = Cnt;
994 continue;
995 }
996 }
997 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
998 auto *BaseInst = cast<CmpInst>(V);
999 Type *Ty0 = BaseInst->getOperand(0)->getType();
1000 Type *Ty1 = Inst->getOperand(0)->getType();
1001 if (Ty0 == Ty1) {
1002 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1003 assert(InstOpcode == AltOpcode &&
1004 "Alternate instructions are only supported by BinaryOperator "
1005 "and CastInst.");
1006 // Check for compatible operands. If the corresponding operands are not
1007 // compatible - need to perform alternate vectorization.
1008 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1009 CmpInst::Predicate SwappedCurrentPred =
1010 CmpInst::getSwappedPredicate(CurrentPred);
1011
1012 if ((E == 2 || SwappedPredsCompatible) &&
1013 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1014 continue;
1015
1016 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1017 continue;
1018 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
1019 if (AltIndex) {
1020 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1021 continue;
1022 } else if (BasePred != CurrentPred) {
1023 assert(
1024 isValidForAlternation(InstOpcode) &&
1025 "CmpInst isn't safe for alternation, logic needs to be updated!");
1026 AltIndex = Cnt;
1027 continue;
1028 }
1029 CmpInst::Predicate AltPred = AltInst->getPredicate();
1030 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1031 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1032 continue;
1033 }
1034 } else if (InstOpcode == Opcode) {
1035 assert(InstOpcode == AltOpcode &&
1036 "Alternate instructions are only supported by BinaryOperator and "
1037 "CastInst.");
1038 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1039 if (Gep->getNumOperands() != 2 ||
1040 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
1041 return InstructionsState::invalid();
1042 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1044 return InstructionsState::invalid();
1045 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1046 auto *BaseLI = cast<LoadInst>(IBase);
1047 if (!LI->isSimple() || !BaseLI->isSimple())
1048 return InstructionsState::invalid();
1049 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1050 auto *CallBase = cast<CallInst>(IBase);
1051 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1052 return InstructionsState::invalid();
1053 if (Call->hasOperandBundles() &&
1054 (!CallBase->hasOperandBundles() ||
1055 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1056 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1057 CallBase->op_begin() +
1058 CallBase->getBundleOperandsStartIndex())))
1059 return InstructionsState::invalid();
1060 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1061 if (ID != BaseID)
1062 return InstructionsState::invalid();
1063 if (!ID) {
1064 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1065 if (Mappings.size() != BaseMappings.size() ||
1066 Mappings.front().ISA != BaseMappings.front().ISA ||
1067 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1068 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1069 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1070 Mappings.front().Shape.Parameters !=
1071 BaseMappings.front().Shape.Parameters)
1072 return InstructionsState::invalid();
1073 }
1074 }
1075 continue;
1076 }
1077 return InstructionsState::invalid();
1078 }
1079
1080 return InstructionsState(cast<Instruction>(V),
1081 cast<Instruction>(VL[AltIndex]));
1082}
1083
1084/// \returns true if all of the values in \p VL have the same type or false
1085/// otherwise.
1087 Type *Ty = VL.front()->getType();
1088 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1089}
1090
1091/// \returns True if an in-tree use also needs an extract. This refers to a
1092/// possible scalar operand in a vectorized instruction.
1093static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1094 TargetLibraryInfo *TLI,
1095 const TargetTransformInfo *TTI) {
1096 if (!UserInst)
1097 return false;
1098 unsigned Opcode = UserInst->getOpcode();
1099 switch (Opcode) {
1100 case Instruction::Load: {
1101 LoadInst *LI = cast<LoadInst>(UserInst);
1102 return (LI->getPointerOperand() == Scalar);
1103 }
1104 case Instruction::Store: {
1105 StoreInst *SI = cast<StoreInst>(UserInst);
1106 return (SI->getPointerOperand() == Scalar);
1107 }
1108 case Instruction::Call: {
1109 CallInst *CI = cast<CallInst>(UserInst);
1110 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1111 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1112 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1113 Arg.value().get() == Scalar;
1114 });
1115 }
1116 default:
1117 return false;
1118 }
1119}
1120
1121/// \returns the AA location that is being accessed by the instruction.
1123 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1124 return MemoryLocation::get(SI);
1125 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1126 return MemoryLocation::get(LI);
1127 return MemoryLocation();
1128}
1129
1130/// \returns True if the instruction is not a volatile or atomic load/store.
1131static bool isSimple(Instruction *I) {
1132 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1133 return LI->isSimple();
1134 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1135 return SI->isSimple();
1136 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1137 return !MI->isVolatile();
1138 return true;
1139}
1140
1141/// Shuffles \p Mask in accordance with the given \p SubMask.
1142/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1143/// one but two input vectors.
1144static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1145 bool ExtendingManyInputs = false) {
1146 if (SubMask.empty())
1147 return;
1148 assert(
1149 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1150 // Check if input scalars were extended to match the size of other node.
1151 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1152 "SubMask with many inputs support must be larger than the mask.");
1153 if (Mask.empty()) {
1154 Mask.append(SubMask.begin(), SubMask.end());
1155 return;
1156 }
1157 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1158 int TermValue = std::min(Mask.size(), SubMask.size());
1159 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1160 if (SubMask[I] == PoisonMaskElem ||
1161 (!ExtendingManyInputs &&
1162 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1163 continue;
1164 NewMask[I] = Mask[SubMask[I]];
1165 }
1166 Mask.swap(NewMask);
1167}
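// Worked example (without ExtendingManyInputs): if the accumulated Mask is
// {2, 3, 0, 1} and SubMask is {1, 0, 3, 2}, the combined mask becomes
// {Mask[1], Mask[0], Mask[3], Mask[2]} = {3, 2, 1, 0}.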
1168
1169/// Order may have elements assigned the special value (size), which is out of
1170/// bounds. Such indices only appear in places which correspond to undef values
1171/// (see canReuseExtract for details) and are used to prevent undef values from
1172/// affecting the operand ordering.
1173/// The first loop below simply finds all unused indices and then the next loop
1174/// nest assigns these indices to the undef value positions.
1175/// As an example below Order has two undef positions and they have assigned
1176/// values 3 and 7 respectively:
1177/// before: 6 9 5 4 9 2 1 0
1178/// after: 6 3 5 4 7 2 1 0
1180 const unsigned Sz = Order.size();
1181 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1182 SmallBitVector MaskedIndices(Sz);
1183 for (unsigned I = 0; I < Sz; ++I) {
1184 if (Order[I] < Sz)
1185 UnusedIndices.reset(Order[I]);
1186 else
1187 MaskedIndices.set(I);
1188 }
1189 if (MaskedIndices.none())
1190 return;
1191 assert(UnusedIndices.count() == MaskedIndices.count() &&
1192 "Non-synced masked/available indices.");
1193 int Idx = UnusedIndices.find_first();
1194 int MIdx = MaskedIndices.find_first();
1195 while (MIdx >= 0) {
1196 assert(Idx >= 0 && "Indices must be synced.");
1197 Order[MIdx] = Idx;
1198 Idx = UnusedIndices.find_next(Idx);
1199 MIdx = MaskedIndices.find_next(MIdx);
1200 }
1201}
1202
1203/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1204/// Opcode1.
1206 unsigned Opcode1) {
1207 Type *ScalarTy = VL[0]->getType();
1208 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1209 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1210 for (unsigned Lane : seq<unsigned>(VL.size())) {
1211 if (isa<PoisonValue>(VL[Lane]))
1212 continue;
1213 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1214 OpcodeMask.set(Lane * ScalarTyNumElements,
1215 Lane * ScalarTyNumElements + ScalarTyNumElements);
1216 }
1217 return OpcodeMask;
1218}
1219
1220namespace llvm {
1221
1223 SmallVectorImpl<int> &Mask) {
1224 Mask.clear();
1225 const unsigned E = Indices.size();
1226 Mask.resize(E, PoisonMaskElem);
1227 for (unsigned I = 0; I < E; ++I)
1228 Mask[Indices[I]] = I;
1229}
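// For example, Indices = {2, 0, 1} produces Mask = {1, 2, 0}, since the loop
// sets Mask[Indices[I]] = I for each position I.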
1230
1231/// Reorders the list of scalars in accordance with the given \p Mask.
1233 ArrayRef<int> Mask) {
1234 assert(!Mask.empty() && "Expected non-empty mask.");
1235 SmallVector<Value *> Prev(Scalars.size(),
1236 PoisonValue::get(Scalars.front()->getType()));
1237 Prev.swap(Scalars);
1238 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1239 if (Mask[I] != PoisonMaskElem)
1240 Scalars[Mask[I]] = Prev[I];
1241}
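// For example, Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a},
// because each element I of the original list moves to position Mask[I].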
1242
1243/// Checks if the provided value does not require scheduling. It does not
1244/// require scheduling if this is not an instruction or it is an instruction
1245/// that does not read/write memory and all operands are either not instructions
1246/// or phi nodes or instructions from different blocks.
1248 auto *I = dyn_cast<Instruction>(V);
1249 if (!I)
1250 return true;
1251 return !mayHaveNonDefUseDependency(*I) &&
1252 all_of(I->operands(), [I](Value *V) {
1253 auto *IO = dyn_cast<Instruction>(V);
1254 if (!IO)
1255 return true;
1256 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1257 });
1258}
1259
1260/// Checks if the provided value does not require scheduling. It does not
1261/// require scheduling if this is not an instruction or it is an instruction
1262/// that does not read/write memory and all users are phi nodes or instructions
1263/// from the different blocks.
1264static bool isUsedOutsideBlock(Value *V) {
1265 auto *I = dyn_cast<Instruction>(V);
1266 if (!I)
1267 return true;
1268 // Limits the number of uses to save compile time.
1269 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1270 all_of(I->users(), [I](User *U) {
1271 auto *IU = dyn_cast<Instruction>(U);
1272 if (!IU)
1273 return true;
1274 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1275 });
1276}
1277
1278/// Checks if the specified value does not require scheduling. It does not
1279/// require scheduling if all operands and all users do not need to be scheduled
1280/// in the current basic block.
1283}
1284
1285/// Checks if the specified array of instructions does not require scheduling.
1286/// This is the case if either all instructions have operands that do not
1287/// require scheduling, or all of their users do not require scheduling since
1288/// they are phis or in other basic blocks.
1290 return !VL.empty() &&
1292}
1293
1294/// Returns true if the widened type of \p Ty elements with size \p Sz represents
1295/// a full vector type, i.e. adding an extra element results in extra parts upon type
1296/// legalization.
1298 unsigned Sz) {
1299 if (Sz <= 1)
1300 return false;
1301 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1302 return false;
1303 if (has_single_bit(Sz))
1304 return true;
1305 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1306 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1307 Sz % NumParts == 0;
1308}
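// Sketch of the intent, again assuming 128-bit registers and i32 elements:
// Sz = 8 is a power of two, so it is accepted immediately; Sz = 12 gives
// NumParts = 3 with 4 elements per part, which is also accepted; Sz = 6 gives
// NumParts = 2, but 6 / 2 = 3 elements per part is not a power of two, so it
// is rejected.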
1309
1310namespace slpvectorizer {
1311
1312/// Bottom Up SLP Vectorizer.
1313class BoUpSLP {
1314 struct TreeEntry;
1315 struct ScheduleData;
1318
1319public:
1320 /// Tracks the state we can represent the loads in the given sequence.
1321 enum class LoadsState {
1322 Gather,
1323 Vectorize,
1326 };
1327
1334
1336 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1339 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1340 AC(AC), DB(DB), DL(DL), ORE(ORE),
1341 Builder(Se->getContext(), TargetFolder(*DL)) {
1342 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1343 // Use the vector register size specified by the target unless overridden
1344 // by a command-line option.
1345 // TODO: It would be better to limit the vectorization factor based on
1346 // data type rather than just register size. For example, x86 AVX has
1347 // 256-bit registers, but it does not support integer operations
1348 // at that width (that requires AVX2).
1349 if (MaxVectorRegSizeOption.getNumOccurrences())
1350 MaxVecRegSize = MaxVectorRegSizeOption;
1351 else
1352 MaxVecRegSize =
1354 .getFixedValue();
1355
1356 if (MinVectorRegSizeOption.getNumOccurrences())
1357 MinVecRegSize = MinVectorRegSizeOption;
1358 else
1359 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1360 }
1361
1362 /// Vectorize the tree that starts with the elements in \p VL.
1363 /// Returns the vectorized root.
1365
1366 /// Vectorize the tree but with the list of externally used values \p
1367 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1368 /// generated extractvalue instructions.
1369 Value *
1370 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1371 Instruction *ReductionRoot = nullptr);
1372
1373 /// \returns the cost incurred by unwanted spills and fills, caused by
1374 /// holding live values over call sites.
1376
1377 /// \returns the vectorization cost of the subtree that starts at \p VL.
1378 /// A negative number means that this is profitable.
1379 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1380
1381 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1382 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1383 void buildTree(ArrayRef<Value *> Roots,
1384 const SmallDenseSet<Value *> &UserIgnoreLst);
1385
1386 /// Construct a vectorizable tree that starts at \p Roots.
1387 void buildTree(ArrayRef<Value *> Roots);
1388
1389 /// Returns whether the root node has in-tree uses.
1391 return !VectorizableTree.empty() &&
1392 !VectorizableTree.front()->UserTreeIndices.empty();
1393 }
1394
1395 /// Return the scalars of the root node.
1397 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1398 return VectorizableTree.front()->Scalars;
1399 }
1400
1401 /// Returns the type/is-signed info for the root node in the graph without
1402 /// casting.
1403 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1404 const TreeEntry &Root = *VectorizableTree.front().get();
1405 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1406 !Root.Scalars.front()->getType()->isIntegerTy())
1407 return std::nullopt;
1408 auto It = MinBWs.find(&Root);
1409 if (It != MinBWs.end())
1410 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1411 It->second.first),
1412 It->second.second);
1413 if (Root.getOpcode() == Instruction::ZExt ||
1414 Root.getOpcode() == Instruction::SExt)
1415 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1416 Root.getOpcode() == Instruction::SExt);
1417 return std::nullopt;
1418 }
1419
1420 /// Checks if the root graph node can be emitted with narrower bitwidth at
1421 /// codegen and returns it signedness, if so.
1423 return MinBWs.at(VectorizableTree.front().get()).second;
1424 }
1425
1426 /// Returns reduction type after minbitdth analysis.
1428 if (ReductionBitWidth == 0 ||
1429 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1430 ReductionBitWidth >=
1431 DL->getTypeSizeInBits(
1432 VectorizableTree.front()->Scalars.front()->getType()))
1433 return getWidenedType(
1434 VectorizableTree.front()->Scalars.front()->getType(),
1435 VectorizableTree.front()->getVectorFactor());
1436 return getWidenedType(
1438 VectorizableTree.front()->Scalars.front()->getContext(),
1439 ReductionBitWidth),
1440 VectorizableTree.front()->getVectorFactor());
1441 }
1442
1443 /// Builds external uses of the vectorized scalars, i.e. the list of
1444 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1445 /// ExternallyUsedValues contains an additional list of external uses to handle
1446 /// vectorization of reductions.
1447 void
1448 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1449
1450 /// Transforms graph nodes to target specific representations, if profitable.
1451 void transformNodes();
1452
1453 /// Clear the internal data structures that are created by 'buildTree'.
1454 void deleteTree() {
1455 VectorizableTree.clear();
1456 ScalarToTreeEntry.clear();
1457 MultiNodeScalars.clear();
1458 MustGather.clear();
1459 NonScheduledFirst.clear();
1460 EntryToLastInstruction.clear();
1461 LoadEntriesToVectorize.clear();
1462 IsGraphTransformMode = false;
1463 GatheredLoadsEntriesFirst.reset();
1464 ExternalUses.clear();
1465 ExternalUsesAsOriginalScalar.clear();
1466 for (auto &Iter : BlocksSchedules) {
1467 BlockScheduling *BS = Iter.second.get();
1468 BS->clear();
1469 }
1470 MinBWs.clear();
1471 ReductionBitWidth = 0;
1472 BaseGraphSize = 1;
1473 CastMaxMinBWSizes.reset();
1474 ExtraBitWidthNodes.clear();
1475 InstrElementSize.clear();
1476 UserIgnoreList = nullptr;
1477 PostponedGathers.clear();
1478 ValueToGatherNodes.clear();
1479 }
1480
1481 unsigned getTreeSize() const { return VectorizableTree.size(); }
1482
1483 /// Returns the base graph size, before any transformations.
1484 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1485
1486 /// Perform LICM and CSE on the newly generated gather sequences.
1488
1489 /// Does this non-empty order represent an identity order? Identity
1490 /// should be represented as an empty order, so this is used to
1491 /// decide if we can canonicalize a computed order. Undef elements
1492 /// (represented as size) are ignored.
1494 assert(!Order.empty() && "expected non-empty order");
1495 const unsigned Sz = Order.size();
1496 return all_of(enumerate(Order), [&](const auto &P) {
1497 return P.value() == P.index() || P.value() == Sz;
1498 });
1499 }
1500
1501 /// Checks if the specified gather tree entry \p TE can be represented as a
1502 /// shuffled vector entry + (possibly) permutation with other gathers. It
1503 /// implements the checks only for possibly ordered scalars (Loads,
1504 /// ExtractElement, ExtractValue), which can be part of the graph.
1505 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1506
1507 /// Sort loads into increasing pointers offsets to allow greater clustering.
1508 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1509
1510 /// Gets reordering data for the given tree entry. If the entry is vectorized
1511 /// - just return ReorderIndices, otherwise check if the scalars can be
1512 /// reordered and return the most optimal order.
1513 /// \return std::nullopt if ordering is not important, empty order, if
1514 /// identity order is important, or the actual order.
1515 /// \param TopToBottom If true, include the order of vectorized stores and
1516 /// insertelement nodes, otherwise skip them.
1517 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1518 bool TopToBottom);
1519
1520 /// Reorders the current graph to the most profitable order starting from the
1521 /// root node to the leaf nodes. The best order is chosen only from the nodes
1522 /// of the same size (vectorization factor). Smaller nodes are considered
1523 /// parts of a subgraph with a smaller VF and are reordered independently. We
1524 /// can do this because we still need to extend smaller nodes to the wider VF,
1525 /// and we can merge reordering shuffles with the widening shuffles.
1526 void reorderTopToBottom();
1527
1528 /// Reorders the current graph to the most profitable order starting from
1529 /// the leaves to the root. It allows rotating small subgraphs and reduces the
1530 /// number of reshuffles if the leaf nodes use the same order. In this case we
1531 /// can merge the orders and just shuffle the user node instead of shuffling its
1532 /// operands. Plus, even if the leaf nodes have different orders, it allows
1533 /// sinking the reordering in the graph closer to the root node and merging it
1534 /// later during analysis.
1535 void reorderBottomToTop(bool IgnoreReorder = false);
1536
1537 /// \return The vector element size in bits to use when vectorizing the
1538 /// expression tree ending at \p V. If V is a store, the size is the width of
1539 /// the stored value. Otherwise, the size is the width of the largest loaded
1540 /// value reaching V. This method is used by the vectorizer to calculate
1541 /// vectorization factors.
1542 unsigned getVectorElementSize(Value *V);
1543
1544 /// Compute the minimum type sizes required to represent the entries in a
1545 /// vectorizable tree.
1547
1548 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1549 unsigned getMaxVecRegSize() const {
1550 return MaxVecRegSize;
1551 }
1552
1553 // \returns minimum vector register size as set by cl::opt.
1554 unsigned getMinVecRegSize() const {
1555 return MinVecRegSize;
1556 }
1557
1558 unsigned getMinVF(unsigned Sz) const {
1559 return std::max(2U, getMinVecRegSize() / Sz);
1560 }
1561
1562 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1563 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1564 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1565 return MaxVF ? MaxVF : UINT_MAX;
1566 }
1567
1568 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1569 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1570 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1571 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1572 ///
1573 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1574 unsigned canMapToVector(Type *T) const;
1575
1576 /// \returns True if the VectorizableTree is both tiny and not fully
1577 /// vectorizable. We do not vectorize such trees.
1578 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1579
1580 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1581 /// It may happen if all gather nodes are loads and they cannot be
1582 /// "clusterized". In this case even subgraphs cannot be vectorized more
1583 /// effectively than the base graph.
1584 bool isTreeNotExtendable() const;
1585
1586 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1587 /// can be load combined in the backend. Load combining may not be allowed in
1588 /// the IR optimizer, so we do not want to alter the pattern. For example,
1589 /// partially transforming a scalar bswap() pattern into vector code is
1590 /// effectively impossible for the backend to undo.
1591 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1592 /// may not be necessary.
1593 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1594
1595 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1596 /// can be load combined in the backend. Load combining may not be allowed in
1597 /// the IR optimizer, so we do not want to alter the pattern. For example,
1598 /// partially transforming a scalar bswap() pattern into vector code is
1599 /// effectively impossible for the backend to undo.
1600 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1601 /// may not be necessary.
1602 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1603
1604 /// Checks if the given array of loads can be represented as a vectorized
1605 /// load, a scatter, or just a simple gather.
1606 /// \param VL list of loads.
1607 /// \param VL0 main load value.
1608 /// \param Order returned order of load instructions.
1609 /// \param PointerOps returned list of pointer operands.
1610 /// \param BestVF return best vector factor, if the recursive check found
1611 /// better vectorization sequences than a masked gather.
1612 /// \param TryRecursiveCheck used to check if a long masked gather can be
1613 /// represented as a series of loads/insert-subvector operations, if profitable.
1616 SmallVectorImpl<Value *> &PointerOps,
1617 unsigned *BestVF = nullptr,
1618 bool TryRecursiveCheck = true) const;
1619
1620 /// Registers a non-vectorizable sequence of loads.
1621 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1622 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1623 }
1624
1625 /// Checks if the given sequence of loads is already known to be non-vectorizable.
1626 template <typename T>
1628 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1629 }
1630
1632
1633 /// This structure holds any data we need about the edges being traversed
1634 /// during buildTree_rec(). We keep track of:
1635 /// (i) the user TreeEntry index, and
1636 /// (ii) the index of the edge.
1637 struct EdgeInfo {
1638 EdgeInfo() = default;
1639 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1641 /// The user TreeEntry.
1642 TreeEntry *UserTE = nullptr;
1643 /// The operand index of the use.
1644 unsigned EdgeIdx = UINT_MAX;
1645#ifndef NDEBUG
1647 const BoUpSLP::EdgeInfo &EI) {
1648 EI.dump(OS);
1649 return OS;
1650 }
1651 /// Debug print.
1652 void dump(raw_ostream &OS) const {
1653 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1654 << " EdgeIdx:" << EdgeIdx << "}";
1655 }
1656 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1657#endif
1658 bool operator == (const EdgeInfo &Other) const {
1659 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1660 }
1661 };
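  // A hypothetical example of how this is used: an EdgeInfo{UserTE, /*EdgeIdx=*/1}
  // recorded during buildTree_rec() says that the scalars currently being
  // built feed operand index 1 of the instructions in the user tree entry.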
1662
1663 /// A helper class used for scoring candidates for two consecutive lanes.
1665 const TargetLibraryInfo &TLI;
1666 const DataLayout &DL;
1667 ScalarEvolution &SE;
1668 const BoUpSLP &R;
1669 int NumLanes; // Total number of lanes (aka vectorization factor).
1670 int MaxLevel; // The maximum recursion depth for accumulating score.
1671
1672 public:
1674 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1675 int MaxLevel)
1676 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1677 MaxLevel(MaxLevel) {}
1678
1679 // The hard-coded scores listed here are not very important, though they
1680 // should be higher for better matches to improve the resulting cost. When
1681 // computing the scores of matching one sub-tree with another, we are
1682 // basically counting the number of values that are matching. So even if all
1683 // scores are set to 1, we would still get a decent matching result.
1684 // However, sometimes we have to break ties. For example we may have to
1685 // choose between matching loads vs matching opcodes. This is what these
1686 // scores are helping us with: they provide the order of preference. Also,
1687 // this is important if the scalar is externally used or used in another
1688 // tree entry node in a different lane.
1689
1690 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1691 static const int ScoreConsecutiveLoads = 4;
1692 /// The same load multiple times. This should have a better score than
1693 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1694 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1695 /// for a vector load and 1.0 for a broadcast.
1696 static const int ScoreSplatLoads = 3;
1697 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1698 static const int ScoreReversedLoads = 3;
1699 /// A load candidate for masked gather.
1700 static const int ScoreMaskedGatherCandidate = 1;
1701 /// ExtractElementInst from same vector and consecutive indexes.
1702 static const int ScoreConsecutiveExtracts = 4;
1703 /// ExtractElementInst from same vector and reversed indices.
1704 static const int ScoreReversedExtracts = 3;
1705 /// Constants.
1706 static const int ScoreConstants = 2;
1707 /// Instructions with the same opcode.
1708 static const int ScoreSameOpcode = 2;
1709 /// Instructions with alternate opcodes (e.g., add + sub).
1710 static const int ScoreAltOpcodes = 1;
1711 /// Identical instructions (a.k.a. splat or broadcast).
1712 static const int ScoreSplat = 1;
1713 /// Matching with an undef is preferable to failing.
1714 static const int ScoreUndef = 1;
1715 /// Score for failing to find a decent match.
1716 static const int ScoreFail = 0;
1717 /// Score if all users are vectorized.
1718 static const int ScoreAllUserVectorized = 1;
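  // A hypothetical tie-break example using the constants above: two loads
  // from A[i] and A[i+1] score ScoreConsecutiveLoads (4), two unrelated adds
  // score ScoreSameOpcode (2), and an add paired with a sub from an alternate
  // sequence scores ScoreAltOpcodes (1), so the consecutive loads are
  // preferred when candidates compete for the same pair of lanes.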
1719
1720 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1721 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1722 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1723 /// MainAltOps.
1725 ArrayRef<Value *> MainAltOps) const {
1726 if (!isValidElementType(V1->getType()) ||
1727 !isValidElementType(V2->getType()))
1729
1730 if (V1 == V2) {
1731 if (isa<LoadInst>(V1)) {
1732 // Returns true if the users of V1 and V2 won't need to be extracted.
1733 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1734 // Bail out if we have too many uses to save compilation time.
1735 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1736 return false;
1737
1738 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1739 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1740 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1741 });
1742 };
1743 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1744 };
1745 // A broadcast of a load can be cheaper on some targets.
1746 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1747 ElementCount::getFixed(NumLanes)) &&
1748 ((int)V1->getNumUses() == NumLanes ||
1749 AllUsersAreInternal(V1, V2)))
1751 }
1753 }
1754
1755 auto CheckSameEntryOrFail = [&]() {
1756 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1757 TE1 && TE1 == R.getTreeEntry(V2))
1760 };
1761
1762 auto *LI1 = dyn_cast<LoadInst>(V1);
1763 auto *LI2 = dyn_cast<LoadInst>(V2);
1764 if (LI1 && LI2) {
1765 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1766 !LI2->isSimple())
1767 return CheckSameEntryOrFail();
1768
1769 std::optional<int> Dist = getPointersDiff(
1770 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1771 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1772 if (!Dist || *Dist == 0) {
1773 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1774 getUnderlyingObject(LI2->getPointerOperand()) &&
1775 R.TTI->isLegalMaskedGather(
1776 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1778 return CheckSameEntryOrFail();
1779 }
1780 // The distance is too large - still may be profitable to use masked
1781 // loads/gathers.
1782 if (std::abs(*Dist) > NumLanes / 2)
1784 // This still will detect consecutive loads, but we might have "holes"
1785 // in some cases. It is ok for non-power-2 vectorization and may produce
1786 // better results. It should not affect current vectorization.
1789 }
1790
1791 auto *C1 = dyn_cast<Constant>(V1);
1792 auto *C2 = dyn_cast<Constant>(V2);
1793 if (C1 && C2)
1795
1796 // Extracts from consecutive indexes of the same vector score better, as
1797 // the extracts could be optimized away.
1798 Value *EV1;
1799 ConstantInt *Ex1Idx;
1800 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1801 // Undefs are always profitable for extractelements.
1802 // Compiler can easily combine poison and extractelement <non-poison> or
1803 // undef and extractelement <poison>. But combining undef +
1804 // extractelement <non-poison-but-may-produce-poison> requires some
1805 // extra operations.
1806 if (isa<UndefValue>(V2))
1807 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1810 Value *EV2 = nullptr;
1811 ConstantInt *Ex2Idx = nullptr;
1812 if (match(V2,
1814 m_Undef())))) {
1815 // Undefs are always profitable for extractelements.
1816 if (!Ex2Idx)
1818 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1820 if (EV2 == EV1) {
1821 int Idx1 = Ex1Idx->getZExtValue();
1822 int Idx2 = Ex2Idx->getZExtValue();
1823 int Dist = Idx2 - Idx1;
1824 // The distance is too large - still may be profitable to use
1825 // shuffles.
1826 if (std::abs(Dist) == 0)
1828 if (std::abs(Dist) > NumLanes / 2)
1832 }
1834 }
1835 return CheckSameEntryOrFail();
1836 }
1837
1838 auto *I1 = dyn_cast<Instruction>(V1);
1839 auto *I2 = dyn_cast<Instruction>(V2);
1840 if (I1 && I2) {
1841 if (I1->getParent() != I2->getParent())
1842 return CheckSameEntryOrFail();
1843 SmallVector<Value *, 4> Ops(MainAltOps);
1844 Ops.push_back(I1);
1845 Ops.push_back(I2);
1846 InstructionsState S = getSameOpcode(Ops, TLI);
1847 // Note: Only consider instructions with <= 2 operands to avoid
1848 // complexity explosion.
1849 if (S.getOpcode() &&
1850 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1851 !S.isAltShuffle()) &&
1852 all_of(Ops, [&S](Value *V) {
1853 return isa<PoisonValue>(V) ||
1854 cast<Instruction>(V)->getNumOperands() ==
1855 S.getMainOp()->getNumOperands();
1856 }))
1857 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1859 }
1860
1861 if (I1 && isa<PoisonValue>(V2))
1863
1864 if (isa<UndefValue>(V2))
1866
1867 return CheckSameEntryOrFail();
1868 }
1869
1870 /// Go through the operands of \p LHS and \p RHS recursively until
1871 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1872 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1873 /// of \p U1 and \p U2), except at the beginning of the recursion where
1874 /// these are set to nullptr.
1875 ///
1876 /// For example:
1877 /// \verbatim
1878 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1879 /// \ / \ / \ / \ /
1880 /// + + + +
1881 /// G1 G2 G3 G4
1882 /// \endverbatim
1883 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1884 /// each level recursively, accumulating the score. It starts from matching
1885 /// the additions at level 0, then moves on to the loads (level 1). The
1886 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1887 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1888 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1889 /// Please note that the order of the operands does not matter, as we
1890 /// evaluate the score of all profitable combinations of operands. In
1891 /// other words the score of G1 and G4 is the same as G1 and G2. This
1892 /// heuristic is based on ideas described in:
1893 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1894 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1895 /// Luís F. W. Góes
1897 Instruction *U2, int CurrLevel,
1898 ArrayRef<Value *> MainAltOps) const {
1899
1900 // Get the shallow score of V1 and V2.
1901 int ShallowScoreAtThisLevel =
1902 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1903
1904 // If we have reached MaxLevel,
1905 // or if LHS and RHS are not instructions,
1906 // or if they are SPLAT,
1907 // or if they are not consecutive,
1908 // or if it is already profitable to vectorize the loads or
1909 // extractelements, return the current score early.
1910 auto *I1 = dyn_cast<Instruction>(LHS);
1911 auto *I2 = dyn_cast<Instruction>(RHS);
1912 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1913 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1914 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1915 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1916 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1917 ShallowScoreAtThisLevel))
1918 return ShallowScoreAtThisLevel;
1919 assert(I1 && I2 && "Should have early exited.");
1920
1921 // Contains the I2 operand indexes that got matched with I1 operands.
1922 SmallSet<unsigned, 4> Op2Used;
1923
1924 // Recursion towards the operands of I1 and I2. We are trying all possible
1925 // operand pairs, and keeping track of the best score.
1926 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1927 OpIdx1 != NumOperands1; ++OpIdx1) {
1928 // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
1929 int MaxTmpScore = 0;
1930 unsigned MaxOpIdx2 = 0;
1931 bool FoundBest = false;
1932 // If I2 is commutative try all combinations.
1933 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1934 unsigned ToIdx = isCommutative(I2)
1935 ? I2->getNumOperands()
1936 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1937 assert(FromIdx <= ToIdx && "Bad index");
1938 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1939 // Skip operands already paired with OpIdx1.
1940 if (Op2Used.count(OpIdx2))
1941 continue;
1942 // Recursively calculate the cost at each level
1943 int TmpScore =
1944 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1945 I1, I2, CurrLevel + 1, {});
1946 // Look for the best score.
1947 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1948 TmpScore > MaxTmpScore) {
1949 MaxTmpScore = TmpScore;
1950 MaxOpIdx2 = OpIdx2;
1951 FoundBest = true;
1952 }
1953 }
1954 if (FoundBest) {
1955 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1956 Op2Used.insert(MaxOpIdx2);
1957 ShallowScoreAtThisLevel += MaxTmpScore;
1958 }
1959 }
1960 return ShallowScoreAtThisLevel;
1961 }
1962 };
1963 /// A helper data structure to hold the operands of a vector of instructions.
1964 /// This supports a fixed vector length for all operand vectors.
1966 /// For each operand we need (i) the value, and (ii) the opcode that it
1967 /// would be attached to if the expression was in a left-linearized form.
1968 /// This is required to avoid illegal operand reordering.
1969 /// For example:
1970 /// \verbatim
1971 /// 0 Op1
1972 /// |/
1973 /// Op1 Op2 Linearized + Op2
1974 /// \ / ----------> |/
1975 /// - -
1976 ///
1977 /// Op1 - Op2 (0 + Op1) - Op2
1978 /// \endverbatim
1979 ///
1980 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1981 ///
1982 /// Another way to think of this is to track all the operations across the
1983 /// path from the operand all the way to the root of the tree and to
1984 /// calculate the operation that corresponds to this path. For example, the
1985 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1986 /// corresponding operation is a '-' (which matches the one in the
1987 /// linearized tree, as shown above).
1988 ///
1989 /// For lack of a better term, we refer to this operation as Accumulated
1990 /// Path Operation (APO).
1991 struct OperandData {
1992 OperandData() = default;
1993 OperandData(Value *V, bool APO, bool IsUsed)
1994 : V(V), APO(APO), IsUsed(IsUsed) {}
1995 /// The operand value.
1996 Value *V = nullptr;
1997 /// TreeEntries only allow a single opcode, or an alternate sequence of
1998 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1999 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2000 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2001 /// (e.g., Add/Mul)
2002 bool APO = false;
2003 /// Helper data for the reordering function.
2004 bool IsUsed = false;
2005 };
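  // A small illustrative example (values invented): for the two-lane bundle
  // {a0 - b0, a1 - b1}, operand 0 holds {a0, a1} with APO == false (the LHS
  // of a subtraction maps to a '+' in the left-linearized form), while
  // operand 1 holds {b0, b1} with APO == true, so b0/b1 may only be swapped
  // with operands that are also attached to a '-'.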
2006
2007 /// During operand reordering, we are trying to select the operand at a lane
2008 /// that best matches the operand at the neighboring lane. Our
2009 /// selection is based on the type of value we are looking for. For example,
2010 /// if the neighboring lane has a load, we need to look for a load that is
2011 /// accessing a consecutive address. These strategies are summarized in the
2012 /// 'ReorderingMode' enumerator.
2013 enum class ReorderingMode {
2014 Load, ///< Matching loads to consecutive memory addresses
2015 Opcode, ///< Matching instructions based on opcode (same or alternate)
2016 Constant, ///< Matching constants
2017 Splat, ///< Matching the same instruction multiple times (broadcast)
2018 Failed, ///< We failed to create a vectorizable group
2019 };
2020
2022
2023 /// A vector of operand vectors.
2025 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2026 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2027 unsigned ArgSize = 0;
2028
2029 const TargetLibraryInfo &TLI;
2030 const DataLayout &DL;
2031 ScalarEvolution &SE;
2032 const BoUpSLP &R;
2033 const Loop *L = nullptr;
2034
2035 /// \returns the operand data at \p OpIdx and \p Lane.
2036 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2037 return OpsVec[OpIdx][Lane];
2038 }
2039
2040 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2041 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2042 return OpsVec[OpIdx][Lane];
2043 }
2044
2045 /// Clears the used flag for all entries.
2046 void clearUsed() {
2047 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2048 OpIdx != NumOperands; ++OpIdx)
2049 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2050 ++Lane)
2051 OpsVec[OpIdx][Lane].IsUsed = false;
2052 }
2053
2054 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2055 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2056 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2057 }
2058
2059 /// \param Lane lane of the operands under analysis.
2060 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2061 /// the best candidate.
2062 /// \param Idx operand index of the current candidate value.
2063 /// \returns The additional score due to possible broadcasting of the
2064 /// elements in the lane. It is more profitable to have a power-of-2 number
2065 /// of unique elements in the lane, since such a lane will be vectorized with
2066 /// higher probability after removing duplicates. Currently the SLP vectorizer
2067 /// supports only vectorization of a power-of-2 number of unique scalars.
2068 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2069 const SmallBitVector &UsedLanes) const {
2070 Value *IdxLaneV = getData(Idx, Lane).V;
2071 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2072 isa<ExtractElementInst>(IdxLaneV))
2073 return 0;
2075 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2076 if (Ln == Lane)
2077 continue;
2078 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2079 if (!isa<Instruction>(OpIdxLnV))
2080 return 0;
2081 Uniques.try_emplace(OpIdxLnV, Ln);
2082 }
2083 unsigned UniquesCount = Uniques.size();
2084 auto IdxIt = Uniques.find(IdxLaneV);
2085 unsigned UniquesCntWithIdxLaneV =
2086 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2087 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2088 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2089 unsigned UniquesCntWithOpIdxLaneV =
2090 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2091 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2092 return 0;
2093 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2094 UniquesCntWithOpIdxLaneV,
2095 UniquesCntWithOpIdxLaneV -
2096 bit_floor(UniquesCntWithOpIdxLaneV)) -
2097 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2098 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2099 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2100 }
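  // An illustrative trace with invented counts: suppose 4 lanes where the
  // other three lanes already contain 3 unique instructions, the current
  // value at OpIdx is one of them (count stays 3) and the candidate at Idx
  // is new (count becomes 4). Since 3 != 4, the result is
  //   min(bit_ceil(3) - 3, 3 - bit_floor(3)) - (bit_ceil(4) - 4)
  //     = min(1, 1) - 0 = 1.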
2101
2102 /// \param Lane lane of the operands under analysis.
2103 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2104 /// the best candidate.
2105 /// \param Idx operand index of the current candidate value.
2106 /// \returns The additional score for the scalar whose users are all
2107 /// vectorized.
2108 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2109 Value *IdxLaneV = getData(Idx, Lane).V;
2110 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2111 // Do not care about number of uses for vector-like instructions
2112 // (extractelement/extractvalue with constant indices), they are extracts
2113 // themselves and already externally used. Vectorization of such
2114 // instructions does not add extra extractelement instruction, just may
2115 // remove it.
2116 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2117 isVectorLikeInstWithConstOps(OpIdxLaneV))
2119 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2120 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2121 return 0;
2122 return R.areAllUsersVectorized(IdxLaneI)
2124 : 0;
2125 }
2126
2127 /// Score scaling factor for fully compatible instructions but with a
2128 /// different number of external uses. Allows better selection of the
2129 /// instructions with fewer external uses.
2130 static const int ScoreScaleFactor = 10;
2131
2132 /// \Returns the look-ahead score, which tells us how much the sub-trees
2133 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2134 /// score. This helps break ties in an informed way when we cannot decide on
2135 /// the order of the operands by just considering the immediate
2136 /// predecessors.
2137 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2138 int Lane, unsigned OpIdx, unsigned Idx,
2139 bool &IsUsed, const SmallBitVector &UsedLanes) {
2140 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2142 // Keep track of the instruction stack as we recurse into the operands
2143 // during the look-ahead score exploration.
2144 int Score =
2145 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2146 /*CurrLevel=*/1, MainAltOps);
2147 if (Score) {
2148 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2149 if (Score <= -SplatScore) {
2150 // Failed score.
2151 Score = 0;
2152 } else {
2153 Score += SplatScore;
2154 // Scale the score to see the difference between different operands
2155 // and similar operands that differ only in whether all of their uses
2156 // are vectorized. It does not affect the actual selection of the best
2157 // compatible operand in general, it just allows selecting the
2158 // operand whose uses are all vectorized.
2159 Score *= ScoreScaleFactor;
2160 Score += getExternalUseScore(Lane, OpIdx, Idx);
2161 IsUsed = true;
2162 }
2163 }
2164 return Score;
2165 }
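  // Illustrative numbers only: if the recursive look-ahead score is 4
  // (e.g. consecutive loads) and getSplatScore() returns 0, the final value
  // is (4 + 0) * ScoreScaleFactor + getExternalUseScore(...), i.e. 40 plus
  // at most ScoreAllUserVectorized, so the external-use bonus can only break
  // ties between otherwise equally matched operands.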
2166
2167 /// Best defined scores per lanes between the passes. Used to choose the
2168 /// best operand (with the highest score) between the passes.
2169 /// The key - {Operand Index, Lane}.
2170 /// The value - the best score between the passes for the lane and the
2171 /// operand.
2173 BestScoresPerLanes;
2174
2175 // Search all operands in Ops[*][Lane] for the one that best matches
2176 // Ops[OpIdx][LastLane] and return its operand index.
2177 // If no good match can be found, return std::nullopt.
2178 std::optional<unsigned>
2179 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2180 ArrayRef<ReorderingMode> ReorderingModes,
2181 ArrayRef<Value *> MainAltOps,
2182 const SmallBitVector &UsedLanes) {
2183 unsigned NumOperands = getNumOperands();
2184
2185 // The operand of the previous lane at OpIdx.
2186 Value *OpLastLane = getData(OpIdx, LastLane).V;
2187
2188 // Our strategy mode for OpIdx.
2189 ReorderingMode RMode = ReorderingModes[OpIdx];
2190 if (RMode == ReorderingMode::Failed)
2191 return std::nullopt;
2192
2193 // The linearized opcode of the operand at OpIdx, Lane.
2194 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2195
2196 // The best operand index and its score.
2197 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2198 // are using the score to differentiate between the two.
2199 struct BestOpData {
2200 std::optional<unsigned> Idx;
2201 unsigned Score = 0;
2202 } BestOp;
2203 BestOp.Score =
2204 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2205 .first->second;
2206
2207 // Track if the operand must be marked as used. If the operand is set to
2208 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2209 // we may want to re-estimate the operands again on the following iterations.
2210 bool IsUsed = RMode == ReorderingMode::Splat ||
2211 RMode == ReorderingMode::Constant ||
2212 RMode == ReorderingMode::Load;
2213 // Iterate through all unused operands and look for the best.
2214 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2215 // Get the operand at Idx and Lane.
2216 OperandData &OpData = getData(Idx, Lane);
2217 Value *Op = OpData.V;
2218 bool OpAPO = OpData.APO;
2219
2220 // Skip already selected operands.
2221 if (OpData.IsUsed)
2222 continue;
2223
2224 // Skip if we are trying to move the operand to a position with a
2225 // different opcode in the linearized tree form. This would break the
2226 // semantics.
2227 if (OpAPO != OpIdxAPO)
2228 continue;
2229
2230 // Look for an operand that matches the current mode.
2231 switch (RMode) {
2232 case ReorderingMode::Load:
2233 case ReorderingMode::Opcode: {
2234 bool LeftToRight = Lane > LastLane;
2235 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2236 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2237 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2238 OpIdx, Idx, IsUsed, UsedLanes);
2239 if (Score > static_cast<int>(BestOp.Score) ||
2240 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2241 Idx == OpIdx)) {
2242 BestOp.Idx = Idx;
2243 BestOp.Score = Score;
2244 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2245 }
2246 break;
2247 }
2248 case ReorderingMode::Constant:
2249 if (isa<Constant>(Op) ||
2250 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2251 BestOp.Idx = Idx;
2252 if (isa<Constant>(Op)) {
2254 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2256 }
2257 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2258 IsUsed = false;
2259 }
2260 break;
2261 case ReorderingMode::Splat:
2262 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2263 IsUsed = Op == OpLastLane;
2264 if (Op == OpLastLane) {
2265 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2266 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2268 }
2269 BestOp.Idx = Idx;
2270 }
2271 break;
2272 case ReorderingMode::Failed:
2273 llvm_unreachable("Not expected Failed reordering mode.");
2274 }
2275 }
2276
2277 if (BestOp.Idx) {
2278 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2279 return BestOp.Idx;
2280 }
2281 // If we could not find a good match return std::nullopt.
2282 return std::nullopt;
2283 }
2284
2285 /// Helper for reorderOperandVecs.
2286 /// \returns the lane that we should start reordering from. This is the one
2287 /// that has the least number of operands that can freely move about, or is
2288 /// less profitable to reorder because it already has the most optimal set of operands.
2289 unsigned getBestLaneToStartReordering() const {
2290 unsigned Min = UINT_MAX;
2291 unsigned SameOpNumber = 0;
2292 // std::pair<unsigned, unsigned> is used to implement a simple voting
2293 // algorithm and choose the lane with the least number of operands that
2294 // can freely move about, or that is less profitable to reorder because it
2295 // already has the most optimal set of operands. The first unsigned is a counter for
2296 // voting, the second unsigned is the counter of lanes with instructions
2297 // with same/alternate opcodes and same parent basic block.
2299 // Try to be closer to the original results, if we have multiple lanes
2300 // with same cost. If 2 lanes have the same cost, use the one with the
2301 // highest index.
2302 for (int I = getNumLanes(); I > 0; --I) {
2303 unsigned Lane = I - 1;
2304 OperandsOrderData NumFreeOpsHash =
2305 getMaxNumOperandsThatCanBeReordered(Lane);
2306 // Compare the number of operands that can move and choose the one with
2307 // the least number.
2308 if (NumFreeOpsHash.NumOfAPOs < Min) {
2309 Min = NumFreeOpsHash.NumOfAPOs;
2310 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2311 HashMap.clear();
2312 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2313 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2314 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2315 // Select the most optimal lane in terms of number of operands that
2316 // should be moved around.
2317 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2321 auto [It, Inserted] =
2322 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2323 if (!Inserted)
2324 ++It->second.first;
2325 }
2326 }
2327 // Select the lane with the minimum counter.
2328 unsigned BestLane = 0;
2329 unsigned CntMin = UINT_MAX;
2330 for (const auto &Data : reverse(HashMap)) {
2331 if (Data.second.first < CntMin) {
2332 CntMin = Data.second.first;
2333 BestLane = Data.second.second;
2334 }
2335 }
2336 return BestLane;
2337 }
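  // A hypothetical two-operand vote: in an 'add' lane both operands have
  // APO == false, so NumOfAPOs == 2, while in a 'sub' lane the RHS has
  // APO == true, giving NumOfAPOs == max(1, 1) == 1. The 'sub' lane therefore
  // wins and reordering starts there, matching the example in reorder() below.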
2338
2339 /// Data structure that helps to reorder operands.
2340 struct OperandsOrderData {
2341 /// The best number of operands with the same APOs, which can be
2342 /// reordered.
2343 unsigned NumOfAPOs = UINT_MAX;
2344 /// Number of operands with the same/alternate instruction opcode and
2345 /// parent.
2346 unsigned NumOpsWithSameOpcodeParent = 0;
2347 /// Hash for the actual operands ordering.
2348 /// Used to count operands, actually their position id and opcode
2349 /// value. It is used in the voting mechanism to find the lane with the
2350 /// least number of operands that can freely move about, or that is less
2351 /// profitable to reorder because it already has the most optimal set of
2352 /// operands. Could be replaced with a SmallVector<unsigned> instead, but the
2353 /// hash code is faster and requires less memory.
2354 unsigned Hash = 0;
2355 };
2356 /// \returns the maximum number of operands that are allowed to be reordered
2357 /// for \p Lane and the number of compatible instructions (with the same
2358 /// parent/opcode). This is used as a heuristic for selecting the first lane
2359 /// to start operand reordering.
2360 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2361 unsigned CntTrue = 0;
2362 unsigned NumOperands = getNumOperands();
2363 // Operands with the same APO can be reordered. We therefore need to count
2364 // how many of them we have for each APO, like this: Cnt[APO] = x.
2365 // Since we only have two APOs, namely true and false, we can avoid using
2366 // a map. Instead we can simply count the number of operands that
2367 // correspond to one of them (in this case the 'true' APO), and calculate
2368 // the other by subtracting it from the total number of operands.
2369 // Operands with the same instruction opcode and parent are more
2370 // profitable since we don't need to move them in many cases, with a high
2371 // probability such lane already can be vectorized effectively.
2372 bool AllUndefs = true;
2373 unsigned NumOpsWithSameOpcodeParent = 0;
2374 Instruction *OpcodeI = nullptr;
2375 BasicBlock *Parent = nullptr;
2376 unsigned Hash = 0;
2377 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2378 const OperandData &OpData = getData(OpIdx, Lane);
2379 if (OpData.APO)
2380 ++CntTrue;
2381 // Use Boyer-Moore majority voting for finding the majority opcode and
2382 // the number of times it occurs.
2383 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2384 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2385 I->getParent() != Parent) {
2386 if (NumOpsWithSameOpcodeParent == 0) {
2387 NumOpsWithSameOpcodeParent = 1;
2388 OpcodeI = I;
2389 Parent = I->getParent();
2390 } else {
2391 --NumOpsWithSameOpcodeParent;
2392 }
2393 } else {
2394 ++NumOpsWithSameOpcodeParent;
2395 }
2396 }
2397 Hash = hash_combine(
2398 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2399 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2400 }
2401 if (AllUndefs)
2402 return {};
2403 OperandsOrderData Data;
2404 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2405 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2406 Data.Hash = Hash;
2407 return Data;
2408 }
2409
2410 /// Go through the instructions in VL and append their operands.
2411 void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
2412 assert(!VL.empty() && "Bad VL");
2413 assert((empty() || VL.size() == getNumLanes()) &&
2414 "Expected same number of lanes");
2415 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2416 // arguments to the intrinsic produces the same result.
2417 constexpr unsigned IntrinsicNumOperands = 2;
2418 unsigned NumOperands = VL0->getNumOperands();
2419 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2420 OpsVec.resize(NumOperands);
2421 unsigned NumLanes = VL.size();
2422 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2423 OpsVec[OpIdx].resize(NumLanes);
2424 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2425 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2426 "Expected instruction or poison value");
2427 // Our tree has just 3 nodes: the root and two operands.
2428 // It is therefore trivial to get the APO. We only need to check the
2429 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2430 // RHS operand. The LHS operand of both add and sub is never attached
2431 // to an inversese operation in the linearized form, therefore its APO
2432 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2433
2434 // Since operand reordering is performed on groups of commutative
2435 // operations or alternating sequences (e.g., +, -), we can safely
2436 // tell the inverse operations by checking commutativity.
2437 if (isa<PoisonValue>(VL[Lane])) {
2438 OpsVec[OpIdx][Lane] = {
2439 PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
2440 false};
2441 continue;
2442 }
2443 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2444 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2445 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2446 APO, false};
2447 }
2448 }
2449 }
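  // Layout sketch for a hypothetical bundle: for VL = {a0 + b0, a1 + b1}
  // this fills OpsVec[0] = {a0, a1} and OpsVec[1] = {b0, b1}, i.e. OpsVec is
  // indexed as OpsVec[OpIdx][Lane], which is exactly what getData() and
  // getValue() above assume.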
2450
2451 /// \returns the number of operands.
2452 unsigned getNumOperands() const { return ArgSize; }
2453
2454 /// \returns the number of lanes.
2455 unsigned getNumLanes() const { return OpsVec[0].size(); }
2456
2457 /// \returns the operand value at \p OpIdx and \p Lane.
2458 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2459 return getData(OpIdx, Lane).V;
2460 }
2461
2462 /// \returns true if the data structure is empty.
2463 bool empty() const { return OpsVec.empty(); }
2464
2465 /// Clears the data.
2466 void clear() { OpsVec.clear(); }
2467
2468 /// \Returns true if there are enough operands identical to \p Op to fill
2469 /// the whole vector, possibly mixed with constants or loop-invariant values.
2470 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2471 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2472 assert(Op == getValue(OpIdx, Lane) &&
2473 "Op is expected to be getValue(OpIdx, Lane).");
2474 // Small number of loads - try load matching.
2475 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2476 return false;
2477 bool OpAPO = getData(OpIdx, Lane).APO;
2478 bool IsInvariant = L && L->isLoopInvariant(Op);
2479 unsigned Cnt = 0;
2480 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2481 if (Ln == Lane)
2482 continue;
2483 // This is set to true if we found a candidate for broadcast at Lane.
2484 bool FoundCandidate = false;
2485 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2486 OperandData &Data = getData(OpI, Ln);
2487 if (Data.APO != OpAPO || Data.IsUsed)
2488 continue;
2489 Value *OpILane = getValue(OpI, Lane);
2490 bool IsConstantOp = isa<Constant>(OpILane);
2491 // Consider the broadcast candidate if:
2492 // 1. Same value is found in one of the operands.
2493 if (Data.V == Op ||
2494 // 2. The operand in the given lane is not constant but there is a
2495 // constant operand in another lane (which can be moved to the
2496 // given lane). In this case we can represent it as a simple
2497 // permutation of constant and broadcast.
2498 (!IsConstantOp &&
2499 ((Lns > 2 && isa<Constant>(Data.V)) ||
2500 // 2.1. If we have only 2 lanes, need to check that value in the
2501 // next lane does not build same opcode sequence.
2502 (Lns == 2 &&
2503 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2504 .getOpcode() &&
2505 isa<Constant>(Data.V)))) ||
2506 // 3. The operand in the current lane is loop invariant (can be
2507 // hoisted out) and another operand is also a loop invariant
2508 // (though not a constant). In this case the whole vector can be
2509 // hoisted out.
2510 // FIXME: need to teach the cost model about this case for better
2511 // estimation.
2512 (IsInvariant && !isa<Constant>(Data.V) &&
2513 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2514 L->isLoopInvariant(Data.V))) {
2515 FoundCandidate = true;
2516 Data.IsUsed = Data.V == Op;
2517 if (Data.V == Op)
2518 ++Cnt;
2519 break;
2520 }
2521 }
2522 if (!FoundCandidate)
2523 return false;
2524 }
2525 return getNumLanes() == 2 || Cnt > 1;
2526 }
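  // For instance (operands invented): if the same value %x already sits at
  // this operand index in every other lane, shouldBroadcast() returns true
  // and reorder() below switches this operand index to ReorderingMode::Splat.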
2527
2528 /// Checks if there is at least one operand in a lane other than \p Lane
2529 /// that is compatible with the operand \p Op.
2530 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2531 assert(Op == getValue(OpIdx, Lane) &&
2532 "Op is expected to be getValue(OpIdx, Lane).");
2533 bool OpAPO = getData(OpIdx, Lane).APO;
2534 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2535 if (Ln == Lane)
2536 continue;
2537 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2538 const OperandData &Data = getData(OpI, Ln);
2539 if (Data.APO != OpAPO || Data.IsUsed)
2540 return true;
2541 Value *OpILn = getValue(OpI, Ln);
2542 return (L && L->isLoopInvariant(OpILn)) ||
2543 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2544 allSameBlock({Op, OpILn}));
2545 }))
2546 return true;
2547 }
2548 return false;
2549 }
2550
2551 public:
2552 /// Initialize with all the operands of the instruction vector \p RootVL.
2554 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2555 L(R.LI->getLoopFor((VL0->getParent()))) {
2556 // Append all the operands of RootVL.
2557 appendOperandsOfVL(RootVL, VL0);
2558 }
2559
2560 /// \Returns a value vector with the operands across all lanes for the
2561 /// operand at \p OpIdx.
2562 ValueList getVL(unsigned OpIdx) const {
2563 ValueList OpVL(OpsVec[OpIdx].size());
2564 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2565 "Expected same num of lanes across all operands");
2566 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2567 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2568 return OpVL;
2569 }
2570
2571 // Performs operand reordering for 2 or more operands.
2572 // The original operands are in OrigOps[OpIdx][Lane].
2573 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2574 void reorder() {
2575 unsigned NumOperands = getNumOperands();
2576 unsigned NumLanes = getNumLanes();
2577 // Each operand has its own mode. We are using this mode to help us select
2578 // the instructions for each lane, so that they match best with the ones
2579 // we have selected so far.
2580 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2581
2582 // This is a greedy single-pass algorithm. We are going over each lane
2583 // once and deciding on the best order right away with no back-tracking.
2584 // However, in order to increase its effectiveness, we start with the lane
2585 // that has operands that can move the least. For example, given the
2586 // following lanes:
2587 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2588 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2589 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2590 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2591 // we will start at Lane 1, since the operands of the subtraction cannot
2592 // be reordered. Then we will visit the rest of the lanes in a circular
2593 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2594
2595 // Find the first lane that we will start our search from.
2596 unsigned FirstLane = getBestLaneToStartReordering();
2597
2598 // Initialize the modes.
2599 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2600 Value *OpLane0 = getValue(OpIdx, FirstLane);
2601 // Keep track if we have instructions with all the same opcode on one
2602 // side.
2603 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2604 // Check if OpLane0 should be broadcast.
2605 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2606 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2607 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2608 else if (isa<LoadInst>(OpILane0))
2609 ReorderingModes[OpIdx] = ReorderingMode::Load;
2610 else
2611 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2612 } else if (isa<Constant>(OpLane0)) {
2613 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2614 } else if (isa<Argument>(OpLane0)) {
2615 // Our best hope is a Splat. It may save some cost in some cases.
2616 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2617 } else {
2618 llvm_unreachable("Unexpected value kind.");
2619 }
2620 }
2621
2622 // Check that we don't have the same operands. There is no need to reorder
2623 // if the operands are just a perfect diamond or a shuffled diamond match.
2624 // Do not do it only for possible broadcasts or a non-power-of-2 number of
2625 // scalars (just for now).
2626 auto &&SkipReordering = [this]() {
2627 SmallPtrSet<Value *, 4> UniqueValues;
2628 ArrayRef<OperandData> Op0 = OpsVec.front();
2629 for (const OperandData &Data : Op0)
2630 UniqueValues.insert(Data.V);
2632 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2633 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2634 return !UniqueValues.contains(Data.V);
2635 }))
2636 return false;
2637 }
2638 // TODO: Check if we can remove a check for non-power-2 number of
2639 // scalars after full support of non-power-2 vectorization.
2640 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2641 };
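  // For example (values invented): if the first operand column already
  // contains {a, b, c, d} and every other column only reuses those values,
  // UniqueValues.size() is 4, a power of two other than 2, so
  // SkipReordering() returns true and the per-lane passes below are skipped.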
2642
2643 // If the initial strategy fails for any of the operand indexes, then we
2644 // perform reordering again in a second pass. This helps avoid assigning
2645 // high priority to the failed strategy, and should improve reordering for
2646 // the non-failed operand indexes.
2647 for (int Pass = 0; Pass != 2; ++Pass) {
2648 // Check if there is no need to reorder the operands, since they are a
2649 // perfect or shuffled diamond match.
2650 // Need to do it to avoid extra external use cost counting for
2651 // shuffled matches, which may cause regressions.
2652 if (SkipReordering())
2653 break;
2654 // Skip the second pass if the first pass did not fail.
2655 bool StrategyFailed = false;
2656 // Mark all operand data as free to use.
2657 clearUsed();
2658 // We keep the original operand order for the FirstLane, so reorder the
2659 // rest of the lanes. We are visiting the nodes in a circular fashion,
2660 // using FirstLane as the center point and increasing the radius
2661 // distance.
2662 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2663 for (unsigned I = 0; I < NumOperands; ++I)
2664 MainAltOps[I].push_back(getData(I, FirstLane).V);
2665
2666 SmallBitVector UsedLanes(NumLanes);
2667 UsedLanes.set(FirstLane);
2668 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2669 // Visit the lane on the right and then the lane on the left.
2670 for (int Direction : {+1, -1}) {
2671 int Lane = FirstLane + Direction * Distance;
2672 if (Lane < 0 || Lane >= (int)NumLanes)
2673 continue;
2674 UsedLanes.set(Lane);
2675 int LastLane = Lane - Direction;
2676 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2677 "Out of bounds");
2678 // Look for a good match for each operand.
2679 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2680 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2681 std::optional<unsigned> BestIdx =
2682 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2683 MainAltOps[OpIdx], UsedLanes);
2684 // By not selecting a value, we allow the operands that follow to
2685 // select a better matching value. We will get a non-null value in
2686 // the next run of getBestOperand().
2687 if (BestIdx) {
2688 // Swap the current operand with the one returned by
2689 // getBestOperand().
2690 swap(OpIdx, *BestIdx, Lane);
2691 } else {
2692 // Enable the second pass.
2693 StrategyFailed = true;
2694 }
2695 // Try to get the alternate opcode and follow it during analysis.
2696 if (MainAltOps[OpIdx].size() != 2) {
2697 OperandData &AltOp = getData(OpIdx, Lane);
2698 InstructionsState OpS =
2699 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2700 if (OpS.getOpcode() && OpS.isAltShuffle())
2701 MainAltOps[OpIdx].push_back(AltOp.V);
2702 }
2703 }
2704 }
2705 }
2706 // Skip second pass if the strategy did not fail.
2707 if (!StrategyFailed)
2708 break;
2709 }
2710 }
2711
2712#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2713 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2714 switch (RMode) {
2715 case ReorderingMode::Load:
2716 return "Load";
2717 case ReorderingMode::Opcode:
2718 return "Opcode";
2719 case ReorderingMode::Constant:
2720 return "Constant";
2721 case ReorderingMode::Splat:
2722 return "Splat";
2723 case ReorderingMode::Failed:
2724 return "Failed";
2725 }
2726 llvm_unreachable("Unimplemented Reordering Type");
2727 }
2728
2729 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2730 raw_ostream &OS) {
2731 return OS << getModeStr(RMode);
2732 }
2733
2734 /// Debug print.
2735 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2736 printMode(RMode, dbgs());
2737 }
2738
2739 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2740 return printMode(RMode, OS);
2741 }
2742
2744 const unsigned Indent = 2;
2745 unsigned Cnt = 0;
2746 for (const OperandDataVec &OpDataVec : OpsVec) {
2747 OS << "Operand " << Cnt++ << "\n";
2748 for (const OperandData &OpData : OpDataVec) {
2749 OS.indent(Indent) << "{";
2750 if (Value *V = OpData.V)
2751 OS << *V;
2752 else
2753 OS << "null";
2754 OS << ", APO:" << OpData.APO << "}\n";
2755 }
2756 OS << "\n";
2757 }
2758 return OS;
2759 }
2760
2761 /// Debug print.
2762 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2763#endif
2764 };
2765
2766 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2767 /// of the pair with the highest score, which is deemed to have the best chance
2768 /// to form the root of a profitable tree to vectorize. Returns std::nullopt if
2769 /// no candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
2770 /// limit of the score, considered to be a good enough score.
2771 std::optional<int>
2772 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2773 int Limit = LookAheadHeuristics::ScoreFail) const {
2774 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2776 int BestScore = Limit;
2777 std::optional<int> Index;
2778 for (int I : seq<int>(0, Candidates.size())) {
2779 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2780 Candidates[I].second,
2781 /*U1=*/nullptr, /*U2=*/nullptr,
2782 /*CurrLevel=*/1, {});
2783 if (Score > BestScore) {
2784 BestScore = Score;
2785 Index = I;
2786 }
2787 }
2788 return Index;
2789 }
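  // A minimal usage sketch (Candidates is hypothetical):
  //   std::optional<int> Best = findBestRootPair(Candidates);
  // yields the index of the pair with the highest look-ahead score above the
  // given Limit (ScoreFail by default), or std::nullopt if every pair failed.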
2790
2791 /// Checks if the instruction is marked for deletion.
2792 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2793
2794 /// Removes an instruction from its block and eventually deletes it.
2795 /// It's like Instruction::eraseFromParent() except that the actual deletion
2796 /// is delayed until BoUpSLP is destructed.
2798 DeletedInstructions.insert(I);
2799 }
2800
2801 /// Remove instructions from the parent function and clear the operands of \p
2802 /// DeadVals instructions, marking trivially dead operands for deletion.
2803 template <typename T>
2806 for (T *V : DeadVals) {
2807 auto *I = cast<Instruction>(V);
2808 DeletedInstructions.insert(I);
2809 }
2810 DenseSet<Value *> Processed;
2811 for (T *V : DeadVals) {
2812 if (!V || !Processed.insert(V).second)
2813 continue;
2814 auto *I = cast<Instruction>(V);
2817 if (const TreeEntry *Entry = getTreeEntry(I)) {
2818 Entries.push_back(Entry);
2819 auto It = MultiNodeScalars.find(I);
2820 if (It != MultiNodeScalars.end())
2821 Entries.append(It->second.begin(), It->second.end());
2822 }
2823 for (Use &U : I->operands()) {
2824 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2825 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2827 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2828 return Entry->VectorizedValue == OpI;
2829 })))
2830 DeadInsts.push_back(OpI);
2831 }
2832 I->dropAllReferences();
2833 }
2834 for (T *V : DeadVals) {
2835 auto *I = cast<Instruction>(V);
2836 if (!I->getParent())
2837 continue;
2838 assert((I->use_empty() || all_of(I->uses(),
2839 [&](Use &U) {
2840 return isDeleted(
2841 cast<Instruction>(U.getUser()));
2842 })) &&
2843 "trying to erase instruction with users.");
2844 I->removeFromParent();
2845 SE->forgetValue(I);
2846 }
2847 // Process the dead instruction list until empty.
2848 while (!DeadInsts.empty()) {
2849 Value *V = DeadInsts.pop_back_val();
2850 Instruction *VI = cast_or_null<Instruction>(V);
2851 if (!VI || !VI->getParent())
2852 continue;
2854 "Live instruction found in dead worklist!");
2855 assert(VI->use_empty() && "Instructions with uses are not dead.");
2856
2857 // Don't lose the debug info while deleting the instructions.
2858 salvageDebugInfo(*VI);
2859
2860 // Null out all of the instruction's operands to see if any operand
2861 // becomes dead as we go.
2862 for (Use &OpU : VI->operands()) {
2863 Value *OpV = OpU.get();
2864 if (!OpV)
2865 continue;
2866 OpU.set(nullptr);
2867
2868 if (!OpV->use_empty())
2869 continue;
2870
2871 // If the operand is an instruction that became dead as we nulled out
2872 // the operand, and if it is 'trivially' dead, delete it in a future
2873 // loop iteration.
2874 if (auto *OpI = dyn_cast<Instruction>(OpV))
2875 if (!DeletedInstructions.contains(OpI) &&
2877 DeadInsts.push_back(OpI);
2878 }
2879
2880 VI->removeFromParent();
2881 DeletedInstructions.insert(VI);
2882 SE->forgetValue(VI);
2883 }
2884 }
2885
2886 /// Checks if the instruction was already analyzed for being a possible
2887 /// reduction root.
2889 return AnalyzedReductionsRoots.count(I);
2890 }
2891 /// Register the given instruction as already analyzed for being a possible
2892 /// reduction root.
2894 AnalyzedReductionsRoots.insert(I);
2895 }
2896 /// Checks if the provided list of reduced values was already checked for
2897 /// vectorization.
2899 return AnalyzedReductionVals.contains(hash_value(VL));
2900 }
2901 /// Adds the list of reduced values to the list of values already checked
2902 /// for vectorization.
2904 AnalyzedReductionVals.insert(hash_value(VL));
2905 }
2906 /// Clear the list of the analyzed reduction root instructions.
2908 AnalyzedReductionsRoots.clear();
2909 AnalyzedReductionVals.clear();
2910 AnalyzedMinBWVals.clear();
2911 }
2912 /// Checks if the given value is gathered in one of the nodes.
2913 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2914 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2915 }
2916 /// Checks if the given value is gathered in one of the nodes.
2917 bool isGathered(const Value *V) const {
2918 return MustGather.contains(V);
2919 }
2920 /// Checks if the specified value was not scheduled.
2921 bool isNotScheduled(const Value *V) const {
2922 return NonScheduledFirst.contains(V);
2923 }
2924
2925 /// Check if the value is vectorized in the tree.
2926 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2927
2928 ~BoUpSLP();
2929
2930private:
2931 /// Determine if a node \p E can be demoted to a smaller type with a
2932 /// truncation. We collect the entries that will be demoted in ToDemote.
2933 /// \param E Node for analysis
2934 /// \param ToDemote indices of the nodes to be demoted.
2935 bool collectValuesToDemote(
2936 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2938 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2939 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2940
2941 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2942 /// reordering (i.e., the operands can be reordered because they have only one
2943 /// user and are reorderable).
2944 /// \param ReorderableGathers List of all gather nodes that require reordering
2945 /// (e.g., gathers of extractelements or partially vectorizable loads).
2946 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2947 /// reordering, subset of \p NonVectorized.
2948 bool
2949 canReorderOperands(TreeEntry *UserTE,
2950 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2951 ArrayRef<TreeEntry *> ReorderableGathers,
2952 SmallVectorImpl<TreeEntry *> &GatherOps);
2953
2954 /// Checks if the given \p TE is a gather node with clustered reused scalars
2955 /// and reorders it per given \p Mask.
2956 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2957
2958 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2959 /// if any. If it is not vectorized (gather node), returns nullptr.
2960 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2961 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2962 TreeEntry *TE = nullptr;
2963 const auto *It = find_if(VL, [&](Value *V) {
2964 TE = getTreeEntry(V);
2965 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2966 return true;
2967 auto It = MultiNodeScalars.find(V);
2968 if (It != MultiNodeScalars.end()) {
2969 for (TreeEntry *E : It->second) {
2970 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2971 TE = E;
2972 return true;
2973 }
2974 }
2975 }
2976 return false;
2977 });
2978 if (It != VL.end()) {
2979 assert(TE->isSame(VL) && "Expected same scalars.");
2980 return TE;
2981 }
2982 return nullptr;
2983 }
2984
2985 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2986 /// if any. If it is not vectorized (gather node), returns nullptr.
2987 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2988 unsigned OpIdx) const {
2989 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2990 const_cast<TreeEntry *>(UserTE), OpIdx);
2991 }
2992
2993 /// Checks if all users of \p I are the part of the vectorization tree.
2994 bool areAllUsersVectorized(
2995 Instruction *I,
2996 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2997
2998 /// Return information about the vector formed for the specified index
2999 /// of a vector of (the same) instruction.
3001
3002 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3003 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3004
3005 /// Gets the root instruction for the given node. If the node is a strided
3006 /// load/store node with the reverse order, the root instruction is the last
3007 /// one.
3008 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3009
3010 /// \returns Cast context for the given graph node.
3012 getCastContextHint(const TreeEntry &TE) const;
3013
3014 /// \returns the cost of the vectorizable entry.
3015 InstructionCost getEntryCost(const TreeEntry *E,
3016 ArrayRef<Value *> VectorizedVals,
3017 SmallPtrSetImpl<Value *> &CheckedExtracts);
3018
3019 /// This is the recursive part of buildTree.
3020 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3021 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3022
3023 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3024 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3025 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3026 /// returns false, setting \p CurrentOrder to either an empty vector or a
3027 /// non-identity permutation that allows reusing the extract instructions.
3028 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3029 /// extract order.
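/// For illustration (hypothetical scalars): extracts of lanes 0..3 from one
/// fixed vector give the identity order, while extracts of lanes {2, 3, 0, 1}
/// from the same source vector give a non-identity \p CurrentOrder that still
/// lets the original vector be reused.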
3030 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3031 SmallVectorImpl<unsigned> &CurrentOrder,
3032 bool ResizeAllowed = false) const;
3033
3034 /// Vectorize a single entry in the tree.
3035 /// \param PostponedPHIs true if the emission of phi nodes needs to be
3036 /// postponed to avoid issues with def-use order.
3037 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3038
3039 /// Returns the vectorized operand node that matches the order of the scalars
3040 /// of operand number \p NodeIdx in the entry \p E.
3041 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3042 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3043 unsigned NodeIdx) const {
3044 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3045 }
3046
3047 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3048 /// \p E.
3049 /// \param PostponedPHIs true if the emission of phi nodes needs to be
3050 /// postponed to avoid issues with def-use order.
3051 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3052
3053 /// Create a new vector from a list of scalar values. Produces a sequence
3054 /// which exploits values reused across lanes, and arranges the inserts
3055 /// for ease of later optimization.
3056 template <typename BVTy, typename ResTy, typename... Args>
3057 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3058
3059 /// Create a new vector from a list of scalar values. Produces a sequence
3060 /// which exploits values reused across lanes, and arranges the inserts
3061 /// for ease of later optimization.
3062 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3063 bool PostponedPHIs);
3064
3065 /// Returns the instruction in the bundle, which can be used as a base point
3066 /// for scheduling. Usually it is the last instruction in the bundle, except
3067 /// for the case when all operands are external (in this case, it is the first
3068 /// instruction in the list).
3069 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3070
3071 /// Tries to find extractelement instructions with constant indices from fixed
3072 /// vector type and gather such instructions into a group, which can most
3073 /// likely be matched as a shuffle of 1 or 2 input vectors. If this attempt
3074 /// was successful, the matched scalars are replaced by poison values in \p VL
3075 /// for future analysis.
3076 std::optional<TargetTransformInfo::ShuffleKind>
3077 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3078 SmallVectorImpl<int> &Mask) const;
3079
3080 /// Tries to find extractelement instructions with constant indices from fixed
3081 /// vector type and gather such instructions into a group, which can most
3082 /// likely be matched as a shuffle of 1 or 2 input vectors. If this attempt
3083 /// was successful, the matched scalars are replaced by poison values in \p VL
3084 /// for future analysis.
3085 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3086 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3087 SmallVectorImpl<SmallVector<int>> &Mask,
3088 unsigned NumParts) const;
3089
3090 /// Checks if the gathered \p VL can be represented as a single register
3091 /// shuffle(s) of previous tree entries.
3092 /// \param TE Tree entry checked for permutation.
3093 /// \param VL List of scalars (a subset of the TE scalars), checked for
3094 /// permutations. Must form a single-register vector.
3095 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3096 /// forces the mask to be built from the original vector values, without
3097 /// relying on the potential reordering.
3098 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3099 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3100 std::optional<TargetTransformInfo::ShuffleKind>
3101 isGatherShuffledSingleRegisterEntry(
3102 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3103 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3104 bool ForOrder);
3105
3106 /// Checks if the gathered \p VL can be represented as multi-register
3107 /// shuffle(s) of previous tree entries.
3108 /// \param TE Tree entry checked for permutation.
3109 /// \param VL List of scalars (a subset of the TE scalars), checked for
3110 /// permutations.
3111 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3112 /// forces the mask to be built from the original vector values, without
3113 /// relying on the potential reordering.
3114 /// \returns per-register series of ShuffleKind, if gathered values can be
3115 /// represented as shuffles of previous tree entries. \p Mask is filled with
3116 /// the shuffle mask (also on per-register base).
3117 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3118 isGatherShuffledEntry(
3119 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3120 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3121 unsigned NumParts, bool ForOrder = false);
3122
3123 /// \returns the cost of gathering (inserting) the values in \p VL into a
3124 /// vector.
3125 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3126 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3127 Type *ScalarTy) const;
3128
3129 /// Set the Builder insert point to one after the last instruction in
3130 /// the bundle
3131 void setInsertPointAfterBundle(const TreeEntry *E);
3132
3133 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3134 /// specified, the starting vector value is poison.
3135 Value *
3136 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3137 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3138
3139 /// \returns whether the VectorizableTree is fully vectorizable and will
3140 /// be beneficial even if the tree height is tiny.
3141 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3142
3143 /// Run through the list of all gathered loads in the graph and try to find
3144 /// vector loads/masked gathers instead of regular gathers. Later these loads
3145 /// are reshuffled to build the final gathered nodes.
3146 void tryToVectorizeGatheredLoads(
3147 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3148 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3149 8> &GatheredLoads);
3150
3151 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3152 /// users of \p TE and collects the stores. It returns the map from the store
3153 /// pointers to the collected stores.
3154 DenseMap<Value *, SmallVector<StoreInst *>>
3155 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3156
3157 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3158 /// stores in \p StoresVec can form a vector instruction. If so it returns
3159 /// true and populates \p ReorderIndices with the shuffle indices of the
3160 /// stores when compared to the sorted vector.
3161 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3162 OrdersType &ReorderIndices) const;
3163
3164 /// Iterates through the users of \p TE, looking for scalar stores that can be
3165 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3166 /// their order and builds an order index vector for each store bundle. It
3167 /// returns all these order vectors found.
3168 /// We run this after the tree has formed, otherwise we may come across user
3169 /// instructions that are not yet in the tree.
3170 SmallVector<OrdersType, 1>
3171 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3172
3173 /// Tries to reorder the gathering node for better vectorization
3174 /// opportunities.
3175 void reorderGatherNode(TreeEntry &TE);
3176
3177 struct TreeEntry {
3178 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3179 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3180
3181 /// \returns Common mask for reorder indices and reused scalars.
3182 SmallVector<int> getCommonMask() const {
3183 SmallVector<int> Mask;
3184 inversePermutation(ReorderIndices, Mask);
3185 ::addMask(Mask, ReuseShuffleIndices);
3186 return Mask;
3187 }
3188
3189 /// \returns true if the scalars in VL are equal to this entry.
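/// For illustration (hypothetical values): if Scalars = {A, B} and
/// ReuseShuffleIndices = {0, 1, 0, 1}, then VL = {A, B, A, B} is considered
/// the same, because every element of VL maps onto a scalar of this entry
/// through the reuse mask.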
3190 bool isSame(ArrayRef<Value *> VL) const {
3191 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3192 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3193 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3194 return VL.size() == Mask.size() &&
3195 std::equal(VL.begin(), VL.end(), Mask.begin(),
3196 [Scalars](Value *V, int Idx) {
3197 return (isa<UndefValue>(V) &&
3198 Idx == PoisonMaskElem) ||
3199 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3200 });
3201 };
3202 if (!ReorderIndices.empty()) {
3203 // TODO: implement matching if the nodes are just reordered, still can
3204 // treat the vector as the same if the list of scalars matches VL
3205 // directly, without reordering.
3206 SmallVector<int> Mask;
3207 inversePermutation(ReorderIndices, Mask);
3208 if (VL.size() == Scalars.size())
3209 return IsSame(Scalars, Mask);
3210 if (VL.size() == ReuseShuffleIndices.size()) {
3211 ::addMask(Mask, ReuseShuffleIndices);
3212 return IsSame(Scalars, Mask);
3213 }
3214 return false;
3215 }
3216 return IsSame(Scalars, ReuseShuffleIndices);
3217 }
3218
3219 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3220 return isGather() && !UserTreeIndices.empty() &&
3221 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3222 UserTreeIndices.front().UserTE == UserEI.UserTE;
3223 }
3224
3225 /// \returns true if current entry has same operands as \p TE.
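/// Operands are matched as an unordered collection: for example (hypothetical
/// operand lists), an entry with operands {X, Y} has equal operands to an
/// entry with operands {Y, X}, since each operand of \p TE is paired with a
/// distinct equal operand of this entry.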
3226 bool hasEqualOperands(const TreeEntry &TE) const {
3227 if (TE.getNumOperands() != getNumOperands())
3228 return false;
3229 SmallBitVector Used(getNumOperands());
3230 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3231 unsigned PrevCount = Used.count();
3232 for (unsigned K = 0; K < E; ++K) {
3233 if (Used.test(K))
3234 continue;
3235 if (getOperand(K) == TE.getOperand(I)) {
3236 Used.set(K);
3237 break;
3238 }
3239 }
3240 // Check if we actually found the matching operand.
3241 if (PrevCount == Used.count())
3242 return false;
3243 }
3244 return true;
3245 }
3246
3247 /// \return Final vectorization factor for the node. Defined by the total
3248 /// number of vectorized scalars, including those used several times in the
3249 /// entry and counted in the \a ReuseShuffleIndices, if any.
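/// For example (hypothetical values): with Scalars = {A, B} and
/// ReuseShuffleIndices = {0, 1, 0, 1}, the vector factor is 4, not 2.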
3250 unsigned getVectorFactor() const {
3251 if (!ReuseShuffleIndices.empty())
3252 return ReuseShuffleIndices.size();
3253 return Scalars.size();
3254 };
3255
3256 /// Checks if the current node is a gather node.
3257 bool isGather() const { return State == NeedToGather; }
3258
3259 /// A vector of scalars.
3260 ValueList Scalars;
3261
3262 /// The Scalars are vectorized into this value. It is initialized to Null.
3263 WeakTrackingVH VectorizedValue = nullptr;
3264
3265 /// New vector phi instructions emitted for the vectorized phi nodes.
3266 PHINode *PHI = nullptr;
3267
3268 /// Do we need to gather this sequence or vectorize it
3269 /// (either with vector instruction or with scatter/gather
3270 /// intrinsics for store/load)?
3271 enum EntryState {
3272 Vectorize, ///< The node is regularly vectorized.
3273 ScatterVectorize, ///< Masked scatter/gather node.
3274 StridedVectorize, ///< Strided loads (and stores)
3275 NeedToGather, ///< Gather/buildvector node.
3276 CombinedVectorize, ///< Vectorized node, combined with its user into more
3277 ///< complex node like select/cmp to minmax, mul/add to
3278 ///< fma, etc. Must be used for the following nodes in
3279 ///< the pattern, not the very first one.
3280 };
3281 EntryState State;
3282
3283 /// List of combined opcodes supported by the vectorizer.
3284 enum CombinedOpcode {
3285 NotCombinedOp = -1,
3286 MinMax = Instruction::OtherOpsEnd + 1,
3287 };
3288 CombinedOpcode CombinedOp = NotCombinedOp;
3289
3290 /// Does this sequence require some shuffling?
3291 SmallVector<int, 4> ReuseShuffleIndices;
3292
3293 /// Does this entry require reordering?
3294 SmallVector<unsigned, 4> ReorderIndices;
3295
3296 /// Points back to the VectorizableTree.
3297 ///
3298 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3299 /// to be a pointer and needs to be able to initialize the child iterator.
3300 /// Thus we need a reference back to the container to translate the indices
3301 /// to entries.
3302 VecTreeTy &Container;
3303
3304 /// The TreeEntry index containing the user of this entry. We can actually
3305 /// have multiple users so the data structure is not truly a tree.
3306 SmallVector<EdgeInfo, 1> UserTreeIndices;
3307
3308 /// The index of this treeEntry in VectorizableTree.
3309 unsigned Idx = 0;
3310
3311 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3312 /// other nodes as a series of insertvector instructions.
3313 SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
3314
3315 private:
3316 /// The operands of each instruction in each lane Operands[op_index][lane].
3317 /// Note: This helps avoid the replication of the code that performs the
3318 /// reordering of operands during buildTree_rec() and vectorizeTree().
3319 SmallVector<ValueList, 2> Operands;
3320
3321 /// The main/alternate instruction.
3322 Instruction *MainOp = nullptr;
3323 Instruction *AltOp = nullptr;
3324
3325 /// Interleaving factor for interleaved loads Vectorize nodes.
3326 unsigned InterleaveFactor = 0;
3327
3328 public:
3329 /// Returns interleave factor for interleave nodes.
3330 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3331 /// Sets interleaving factor for the interleaving nodes.
3332 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3333
3334 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3335 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3336 if (Operands.size() < OpIdx + 1)
3337 Operands.resize(OpIdx + 1);
3338 assert(Operands[OpIdx].empty() && "Already resized?");
3339 assert(OpVL.size() <= Scalars.size() &&
3340 "Number of operands is greater than the number of scalars.");
3341 Operands[OpIdx].resize(OpVL.size());
3342 copy(OpVL, Operands[OpIdx].begin());
3343 }
3344
3345 /// Set this bundle's operand from Scalars.
3346 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3347 VLOperands Ops(Scalars, MainOp, R);
3348 if (RequireReorder)
3349 Ops.reorder();
3350 for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
3351 setOperand(I, Ops.getVL(I));
3352 }
3353
3354 /// Reorders operands of the node to the given mask \p Mask.
3355 void reorderOperands(ArrayRef<int> Mask) {
3356 for (ValueList &Operand : Operands)
3357 reorderScalars(Operand, Mask);
3358 }
3359
3360 /// \returns the \p OpIdx operand of this TreeEntry.
3361 ValueList &getOperand(unsigned OpIdx) {
3362 assert(OpIdx < Operands.size() && "Off bounds");
3363 return Operands[OpIdx];
3364 }
3365
3366 /// \returns the \p OpIdx operand of this TreeEntry.
3367 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3368 assert(OpIdx < Operands.size() && "Off bounds");
3369 return Operands[OpIdx];
3370 }
3371
3372 /// \returns the number of operands.
3373 unsigned getNumOperands() const { return Operands.size(); }
3374
3375 /// \return the single \p OpIdx operand.
3376 Value *getSingleOperand(unsigned OpIdx) const {
3377 assert(OpIdx < Operands.size() && "Off bounds");
3378 assert(!Operands[OpIdx].empty() && "No operand available");
3379 return Operands[OpIdx][0];
3380 }
3381
3382 /// Some of the instructions in the list have alternate opcodes.
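/// For example (illustrative): a node whose scalars mix 'add' and 'sub'
/// instructions has different main and alternate opcodes (e.g. MainOp = add,
/// AltOp = sub), so isAltShuffle() returns true for such a node.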
3383 bool isAltShuffle() const { return MainOp != AltOp; }
3384
3385 bool isOpcodeOrAlt(Instruction *I) const {
3386 unsigned CheckedOpcode = I->getOpcode();
3387 return (getOpcode() == CheckedOpcode ||
3388 getAltOpcode() == CheckedOpcode);
3389 }
3390
3391 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3392 /// alternate) opcode as the main operation of this entry, the key is \p Op.
3393 /// Otherwise the key is \a MainOp.
3394 Value *isOneOf(Value *Op) const {
3395 auto *I = dyn_cast<Instruction>(Op);
3396 if (I && isOpcodeOrAlt(I))
3397 return Op;
3398 return MainOp;
3399 }
3400
3401 void setOperations(const InstructionsState &S) {
3402 MainOp = S.getMainOp();
3403 AltOp = S.getAltOp();
3404 }
3405
3406 Instruction *getMainOp() const {
3407 return MainOp;
3408 }
3409
3410 Instruction *getAltOp() const {
3411 return AltOp;
3412 }
3413
3414 /// The main/alternate opcodes for the list of instructions.
3415 unsigned getOpcode() const {
3416 return MainOp ? MainOp->getOpcode() : 0;
3417 }
3418
3419 unsigned getAltOpcode() const {
3420 return AltOp ? AltOp->getOpcode() : 0;
3421 }
3422
3423 /// When ReuseShuffleIndices is empty, it returns the position of \p V within
3424 /// the vector of Scalars. Otherwise, it remaps the lane via the reuse index.
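/// For example (hypothetical values): with Scalars = {A, B},
/// ReuseShuffleIndices = {0, 1, 0, 1} and no reordering, looking up B yields
/// lane 1, the first position where index 1 appears in the reuse mask.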
3425 int findLaneForValue(Value *V) const {
3426 unsigned FoundLane = getVectorFactor();
3427 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3428 std::advance(It, 1)) {
3429 if (*It != V)
3430 continue;
3431 FoundLane = std::distance(Scalars.begin(), It);
3432 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3433 if (!ReorderIndices.empty())
3434 FoundLane = ReorderIndices[FoundLane];
3435 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3436 if (ReuseShuffleIndices.empty())
3437 break;
3438 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3439 RIt != ReuseShuffleIndices.end()) {
3440 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3441 break;
3442 }
3443 }
3444 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3445 return FoundLane;
3446 }
3447
3448 /// Build a shuffle mask for graph entry which represents a merge of main
3449 /// and alternate operations.
3450 void
3451 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3452 SmallVectorImpl<int> &Mask,
3453 SmallVectorImpl<Value *> *OpScalars = nullptr,
3454 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3455
3456 /// Return true if this is a non-power-of-2 node.
3457 bool isNonPowOf2Vec() const {
3458 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3459 return IsNonPowerOf2;
3460 }
3461
3462 /// Return true if the number of scalars in this node neither fills whole
3463 /// vector registers nor is a power of 2.
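/// For example (assuming 128-bit vector registers): 3 x i32 scalars neither
/// fill a whole register nor form a power-of-2 count, so this returns true,
/// whereas 4 x i32 or 8 x i32 scalars would return false.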
3464 bool
3465 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3466 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3467 TTI, getValueType(Scalars.front()), Scalars.size());
3468 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3469 "Reshuffling not supported with non-power-of-2 vectors yet.");
3470 return IsNonPowerOf2;
3471 }
3472
3473 Value *getOrdered(unsigned Idx) const {
3474 assert(isGather() && "Must be used only for buildvectors/gathers.");
3475 if (ReorderIndices.empty())
3476 return Scalars[Idx];
3477 SmallVector<int> Mask;
3478 inversePermutation(ReorderIndices, Mask);
3479 return Scalars[Mask[Idx]];
3480 }
3481
3482#ifndef NDEBUG
3483 /// Debug printer.
3484 LLVM_DUMP_METHOD void dump() const {
3485 dbgs() << Idx << ".\n";
3486 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3487 dbgs() << "Operand " << OpI << ":\n";
3488 for (const Value *V : Operands[OpI])
3489 dbgs().indent(2) << *V << "\n";
3490 }
3491 dbgs() << "Scalars: \n";
3492 for (Value *V : Scalars)
3493 dbgs().indent(2) << *V << "\n";
3494 dbgs() << "State: ";
3495 switch (State) {
3496 case Vectorize:
3497 if (InterleaveFactor > 0) {
3498 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3499 << "\n";
3500 } else {
3501 dbgs() << "Vectorize\n";
3502 }
3503 break;
3504 case ScatterVectorize:
3505 dbgs() << "ScatterVectorize\n";
3506 break;
3507 case StridedVectorize:
3508 dbgs() << "StridedVectorize\n";
3509 break;
3510 case NeedToGather:
3511 dbgs() << "NeedToGather\n";
3512 break;
3513 case CombinedVectorize:
3514 dbgs() << "CombinedVectorize\n";
3515 break;
3516 }
3517 dbgs() << "MainOp: ";
3518 if (MainOp)
3519 dbgs() << *MainOp << "\n";
3520 else
3521 dbgs() << "NULL\n";
3522 dbgs() << "AltOp: ";
3523 if (AltOp)
3524 dbgs() << *AltOp << "\n";
3525 else
3526 dbgs() << "NULL\n";
3527 dbgs() << "VectorizedValue: ";
3528 if (VectorizedValue)
3529 dbgs() << *VectorizedValue << "\n";
3530 else
3531 dbgs() << "NULL\n";
3532 dbgs() << "ReuseShuffleIndices: ";
3533 if (ReuseShuffleIndices.empty())
3534 dbgs() << "Empty";
3535 else
3536 for (int ReuseIdx : ReuseShuffleIndices)
3537 dbgs() << ReuseIdx << ", ";
3538 dbgs() << "\n";
3539 dbgs() << "ReorderIndices: ";
3540 for (unsigned ReorderIdx : ReorderIndices)
3541 dbgs() << ReorderIdx << ", ";
3542 dbgs() << "\n";
3543 dbgs() << "UserTreeIndices: ";
3544 for (const auto &EInfo : UserTreeIndices)
3545 dbgs() << EInfo << ", ";
3546 dbgs() << "\n";
3547 }
3548#endif
3549 };
3550
3551#ifndef NDEBUG
3552 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3553 InstructionCost VecCost, InstructionCost ScalarCost,
3554 StringRef Banner) const {
3555 dbgs() << "SLP: " << Banner << ":\n";
3556 E->dump();
3557 dbgs() << "SLP: Costs:\n";
3558 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3559 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3560 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3561 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3562 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3563 }
3564#endif
3565
3566 /// Create a new VectorizableTree entry.
3567 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3568 std::optional<ScheduleData *> Bundle,
3569 const InstructionsState &S,
3570 const EdgeInfo &UserTreeIdx,
3571 ArrayRef<int> ReuseShuffleIndices = {},
3572 ArrayRef<unsigned> ReorderIndices = {},
3573 unsigned InterleaveFactor = 0) {
3574 TreeEntry::EntryState EntryState =
3575 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3576 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3577 ReuseShuffleIndices, ReorderIndices);
3578 if (E && InterleaveFactor > 0)
3579 E->setInterleave(InterleaveFactor);
3580 return E;
3581 }
3582
3583 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3584 TreeEntry::EntryState EntryState,
3585 std::optional<ScheduleData *> Bundle,
3586 const InstructionsState &S,
3587 const EdgeInfo &UserTreeIdx,
3588 ArrayRef<int> ReuseShuffleIndices = {},
3589 ArrayRef<unsigned> ReorderIndices = {}) {
3590 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3591 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3592 "Need to vectorize gather entry?");
3593 // Gathered loads still gathered? Do not create entry, use the original one.
3594 if (GatheredLoadsEntriesFirst.has_value() &&
3595 EntryState == TreeEntry::NeedToGather &&
3596 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3597 !UserTreeIdx.UserTE)
3598 return nullptr;
3599 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3600 TreeEntry *Last = VectorizableTree.back().get();
3601 Last->Idx = VectorizableTree.size() - 1;
3602 Last->State = EntryState;
3603 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3604 // for non-power-of-two vectors.
3605 assert(
3606 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3607 ReuseShuffleIndices.empty()) &&
3608 "Reshuffling scalars not yet supported for nodes with padding");
3609 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3610 ReuseShuffleIndices.end());
3611 if (ReorderIndices.empty()) {
3612 Last->Scalars.assign(VL.begin(), VL.end());
3613 Last->setOperations(S);
3614 } else {
3615 // Reorder scalars and build final mask.
3616 Last->Scalars.assign(VL.size(), nullptr);
3617 transform(ReorderIndices, Last->Scalars.begin(),
3618 [VL](unsigned Idx) -> Value * {
3619 if (Idx >= VL.size())
3620 return UndefValue::get(VL.front()->getType());
3621 return VL[Idx];
3622 });
3623 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3624 Last->setOperations(S);
3625 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3626 }
3627 if (!Last->isGather()) {
3628 for (Value *V : VL) {
3629 const TreeEntry *TE = getTreeEntry(V);
3630 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3631 "Scalar already in tree!");
3632 if (TE) {
3633 if (TE != Last)
3634 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3635 continue;
3636 }
3637 ScalarToTreeEntry[V] = Last;
3638 }
3639 // Update the scheduler bundle to point to this TreeEntry.
3640 ScheduleData *BundleMember = *Bundle;
3641 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3642 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3643 doesNotNeedToSchedule(VL)) &&
3644 "Bundle and VL out of sync");
3645 if (BundleMember) {
3646 for (Value *V : VL) {
3647 if (doesNotNeedToBeScheduled(V))
3648 continue;
3649 if (!BundleMember)
3650 continue;
3651 BundleMember->TE = Last;
3652 BundleMember = BundleMember->NextInBundle;
3653 }
3654 }
3655 assert(!BundleMember && "Bundle and VL out of sync");
3656 } else {
3657 // Build a map for gathered scalars to the nodes where they are used.
3658 bool AllConstsOrCasts = true;
3659 for (Value *V : VL)
3660 if (!isConstant(V)) {
3661 auto *I = dyn_cast<CastInst>(V);
3662 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3663 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3664 !UserTreeIdx.UserTE->isGather())
3665 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3666 }
3667 if (AllConstsOrCasts)
3668 CastMaxMinBWSizes =
3669 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3670 MustGather.insert(VL.begin(), VL.end());
3671 }
3672
3673 if (UserTreeIdx.UserTE)
3674 Last->UserTreeIndices.push_back(UserTreeIdx);
3675 return Last;
3676 }
3677
3678 /// -- Vectorization State --
3679 /// Holds all of the tree entries.
3680 TreeEntry::VecTreeTy VectorizableTree;
3681
3682#ifndef NDEBUG
3683 /// Debug printer.
3684 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3685 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3686 VectorizableTree[Id]->dump();
3687 dbgs() << "\n";
3688 }
3689 }
3690#endif
3691
3692 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3693
3694 const TreeEntry *getTreeEntry(Value *V) const {
3695 return ScalarToTreeEntry.lookup(V);
3696 }
3697
3698 /// Check that the operand node of the alternate node does not generate a
3699 /// buildvector sequence. If it does, it is likely not worth building an
3700 /// alternate shuffle when the number of buildvector operands plus the
3701 /// alternate instruction exceeds the number of buildvector instructions.
3702 /// \param S the instructions state of the analyzed values.
3703 /// \param VL list of the instructions with alternate opcodes.
3704 bool areAltOperandsProfitable(const InstructionsState &S,
3705 ArrayRef<Value *> VL) const;
3706
3707 /// Checks if the specified list of the instructions/values can be vectorized
3708 /// and fills required data before actual scheduling of the instructions.
3709 TreeEntry::EntryState
3710 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3711 bool IsScatterVectorizeUserTE,
3712 OrdersType &CurrentOrder,
3713 SmallVectorImpl<Value *> &PointerOps);
3714
3715 /// Maps a specific scalar to its tree entry.
3716 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3717
3718 /// List of scalars, used in several vectorize nodes, and the list of the
3719 /// nodes.
3720 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3721
3722 /// Maps a value to the proposed vectorizable size.
3723 SmallDenseMap<Value *, unsigned> InstrElementSize;
3724
3725 /// A list of scalars that we found that we need to keep as scalars.
3726 ValueSet MustGather;
3727
3728 /// A set of first non-schedulable values.
3729 ValueSet NonScheduledFirst;
3730
3731 /// A map between the vectorized entries and the last instructions in the
3732 /// bundles. The bundles are built in use order, not in the def order of the
3733 /// instructions. So, we cannot rely directly on the last instruction in the
3734 /// bundle being the last instruction in program order during the
3735 /// vectorization process, since the basic blocks are modified; we need to
3736 /// pre-gather these instructions beforehand.
3737 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3738
3739 /// List of gather nodes, depending on other gather/vector nodes, which should
3740 /// be emitted after the vector instruction emission process to correctly
3741 /// handle order of the vector instructions and shuffles.
3742 SetVector<const TreeEntry *> PostponedGathers;
3743
3744 using ValueToGatherNodesMap =
3745 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3746 ValueToGatherNodesMap ValueToGatherNodes;
3747
3748 /// A list of the load entries (node indices), which can be vectorized using
3749 /// strided or masked gather approach, but attempted to be represented as
3750 /// contiguous loads.
3751 SetVector<unsigned> LoadEntriesToVectorize;
3752
3753 /// true if graph nodes transforming mode is on.
3754 bool IsGraphTransformMode = false;
3755
3756 /// The index of the first gathered load entry in the VectorizeTree.
3757 std::optional<unsigned> GatheredLoadsEntriesFirst;
3758
3759 /// This POD struct describes one external user in the vectorized tree.
3760 struct ExternalUser {
3761 ExternalUser(Value *S, llvm::User *U, int L)
3762 : Scalar(S), User(U), Lane(L) {}
3763
3764 // Which scalar in our function.
3765 Value *Scalar;
3766
3767 // Which user that uses the scalar.
3768 llvm::User *User;
3769
3770 // Which lane does the scalar belong to.
3771 int Lane;
3772 };
3773 using UserList = SmallVector<ExternalUser, 16>;
3774
3775 /// Checks if two instructions may access the same memory.
3776 ///
3777 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3778 /// is invariant in the calling loop.
3779 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3780 Instruction *Inst2) {
3781 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3782 return true;
3783 // First check if the result is already in the cache.
3784 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3785 auto It = AliasCache.find(Key);
3786 if (It != AliasCache.end())
3787 return It->second;
3788 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3789 // Store the result in the cache.
3790 AliasCache.try_emplace(Key, Aliased);
3791 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3792 return Aliased;
3793 }
3794
3795 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3796
3797 /// Cache for alias results.
3798 /// TODO: consider moving this to the AliasAnalysis itself.
3799 SmallDenseMap<AliasCacheKey, bool> AliasCache;
3800
3801 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3802 // globally through SLP because we don't perform any action which
3803 // invalidates capture results.
3804 BatchAAResults BatchAA;
3805
3806 /// Temporary store for deleted instructions. Instructions will be deleted
3807 /// eventually when the BoUpSLP is destructed. The deferral is required to
3808 /// ensure that there are no incorrect collisions in the AliasCache, which
3809 /// can happen if a new instruction is allocated at the same address as a
3810 /// previously deleted instruction.
3811 DenseSet<Instruction *> DeletedInstructions;
3812
3813 /// Set of the instructions already analyzed for reductions.
3814 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3815
3816 /// Set of hashes for the list of reduction values already being analyzed.
3817 DenseSet<size_t> AnalyzedReductionVals;
3818
3819 /// Values already analyzed for minimal bitwidth and found to be
3820 /// non-profitable.
3821 DenseSet<Value *> AnalyzedMinBWVals;
3822
3823 /// A list of values that need to extracted out of the tree.
3824 /// This list holds pairs of (Internal Scalar : External User). External User
3825 /// can be nullptr, it means that this Internal Scalar will be used later,
3826 /// after vectorization.
3827 UserList ExternalUses;
3828
3829 /// A list of GEPs which can be replaced by scalar GEPs instead of
3830 /// extractelement instructions.
3831 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3832
3833 /// Values used only by @llvm.assume calls.
3835
3836 /// Holds all of the instructions that we gathered, shuffle instructions and
3837 /// extractelements.
3838 SetVector<Instruction *> GatherShuffleExtractSeq;
3839
3840 /// A list of blocks that we are going to CSE.
3841 DenseSet<BasicBlock *> CSEBlocks;
3842
3843 /// List of hashes of vectors of loads, which are known to be non-vectorizable.
3844 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3845
3846 /// Contains all scheduling relevant data for an instruction.
3847 /// A ScheduleData either represents a single instruction or a member of an
3848 /// instruction bundle (= a group of instructions which is combined into a
3849 /// vector instruction).
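/// For illustration: when four consecutive stores are bundled, the head
/// store's ScheduleData is the scheduling entity (its FirstInBundle points to
/// itself) and the remaining members are chained through NextInBundle.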
3850 struct ScheduleData {
3851 // The initial value for the dependency counters. It means that the
3852 // dependencies are not calculated yet.
3853 enum { InvalidDeps = -1 };
3854
3855 ScheduleData() = default;
3856
3857 void init(int BlockSchedulingRegionID, Instruction *I) {
3858 FirstInBundle = this;
3859 NextInBundle = nullptr;
3860 NextLoadStore = nullptr;
3861 IsScheduled = false;
3862 SchedulingRegionID = BlockSchedulingRegionID;
3863 clearDependencies();
3864 Inst = I;
3865 TE = nullptr;
3866 }
3867
3868 /// Verify basic self consistency properties
3869 void verify() {
3870 if (hasValidDependencies()) {
3871 assert(UnscheduledDeps <= Dependencies && "invariant");
3872 } else {
3873 assert(UnscheduledDeps == Dependencies && "invariant");
3874 }
3875
3876 if (IsScheduled) {
3877 assert(isSchedulingEntity() &&
3878 "unexpected scheduled state");
3879 for (const ScheduleData *BundleMember = this; BundleMember;
3880 BundleMember = BundleMember->NextInBundle) {
3881 assert(BundleMember->hasValidDependencies() &&
3882 BundleMember->UnscheduledDeps == 0 &&
3883 "unexpected scheduled state");
3884 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3885 "only bundle is marked scheduled");
3886 }
3887 }
3888
3889 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3890 "all bundle members must be in same basic block");
3891 }
3892
3893 /// Returns true if the dependency information has been calculated.
3894 /// Note that dependency validity can vary between instructions within
3895 /// a single bundle.
3896 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3897
3898 /// Returns true for single instructions and for bundle representatives
3899 /// (= the head of a bundle).
3900 bool isSchedulingEntity() const { return FirstInBundle == this; }
3901
3902 /// Returns true if it represents an instruction bundle and not only a
3903 /// single instruction.
3904 bool isPartOfBundle() const {
3905 return NextInBundle != nullptr || FirstInBundle != this || TE;
3906 }
3907
3908 /// Returns true if it is ready for scheduling, i.e. it has no more
3909 /// unscheduled dependent instructions/bundles.
3910 bool isReady() const {
3911 assert(isSchedulingEntity() &&
3912 "can't consider non-scheduling entity for ready list");
3913 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3914 }
3915
3916 /// Modifies the number of unscheduled dependencies for this instruction,
3917 /// and returns the number of remaining dependencies for the containing
3918 /// bundle.
3919 int incrementUnscheduledDeps(int Incr) {
3920 assert(hasValidDependencies() &&
3921 "increment of unscheduled deps would be meaningless");
3922 UnscheduledDeps += Incr;
3923 return FirstInBundle->unscheduledDepsInBundle();
3924 }
3925
3926 /// Sets the number of unscheduled dependencies to the number of
3927 /// dependencies.
3928 void resetUnscheduledDeps() {
3929 UnscheduledDeps = Dependencies;
3930 }
3931
3932 /// Clears all dependency information.
3933 void clearDependencies() {
3934 Dependencies = InvalidDeps;
3935 resetUnscheduledDeps();
3936 MemoryDependencies.clear();
3937 ControlDependencies.clear();
3938 }
3939
3940 int unscheduledDepsInBundle() const {
3941 assert(isSchedulingEntity() && "only meaningful on the bundle");
3942 int Sum = 0;
3943 for (const ScheduleData *BundleMember = this; BundleMember;
3944 BundleMember = BundleMember->NextInBundle) {
3945 if (BundleMember->UnscheduledDeps == InvalidDeps)
3946 return InvalidDeps;
3947 Sum += BundleMember->UnscheduledDeps;
3948 }
3949 return Sum;
3950 }
3951
3952 void dump(raw_ostream &os) const {
3953 if (!isSchedulingEntity()) {
3954 os << "/ " << *Inst;
3955 } else if (NextInBundle) {
3956 os << '[' << *Inst;
3957 ScheduleData *SD = NextInBundle;
3958 while (SD) {
3959 os << ';' << *SD->Inst;
3960 SD = SD->NextInBundle;
3961 }
3962 os << ']';
3963 } else {
3964 os << *Inst;
3965 }
3966 }
3967
3968 Instruction *Inst = nullptr;
3969
3970 /// The TreeEntry that this instruction corresponds to.
3971 TreeEntry *TE = nullptr;
3972
3973 /// Points to the head in an instruction bundle (and always to this for
3974 /// single instructions).
3975 ScheduleData *FirstInBundle = nullptr;
3976
3977 /// Single linked list of all instructions in a bundle. Null if it is a
3978 /// single instruction.
3979 ScheduleData *NextInBundle = nullptr;
3980
3981 /// Single linked list of all memory instructions (e.g. load, store, call)
3982 /// in the block - until the end of the scheduling region.
3983 ScheduleData *NextLoadStore = nullptr;
3984
3985 /// The dependent memory instructions.
3986 /// This list is derived on demand in calculateDependencies().
3987 SmallVector<ScheduleData *, 4> MemoryDependencies;
3988
3989 /// List of instructions which this instruction could be control dependent
3990 /// on. Allowing such nodes to be scheduled below this one could introduce
3991 /// a runtime fault which didn't exist in the original program.
3992 /// ex: this is a load or udiv following a readonly call which inf loops
3993 SmallVector<ScheduleData *, 4> ControlDependencies;
3994
3995 /// This ScheduleData is in the current scheduling region if this matches
3996 /// the current SchedulingRegionID of BlockScheduling.
3997 int SchedulingRegionID = 0;
3998
3999 /// Used for getting a "good" final ordering of instructions.
4000 int SchedulingPriority = 0;
4001
4002 /// The number of dependencies. It consists of the number of users of the
4003 /// instruction plus the number of dependent memory instructions (if any).
4004 /// This value is calculated on demand.
4005 /// If InvalidDeps, the number of dependencies is not calculated yet.
4006 int Dependencies = InvalidDeps;
4007
4008 /// The number of dependencies minus the number of dependencies of scheduled
4009 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4010 /// for scheduling.
4011 /// Note that this is negative as long as Dependencies is not calculated.
4012 int UnscheduledDeps = InvalidDeps;
4013
4014 /// True if this instruction is scheduled (or considered as scheduled in the
4015 /// dry-run).
4016 bool IsScheduled = false;
4017 };
4018
4019#ifndef NDEBUG
4020 friend inline raw_ostream &operator<<(raw_ostream &os,
4021 const BoUpSLP::ScheduleData &SD) {
4022 SD.dump(os);
4023 return os;
4024 }
4025#endif
4026
4027 friend struct GraphTraits<BoUpSLP *>;
4028 friend struct DOTGraphTraits<BoUpSLP *>;
4029
4030 /// Contains all scheduling data for a basic block.
4031 /// It does not schedule instructions that are not memory read/write
4032 /// instructions and whose operands are either constants, arguments, phis, or
4033 /// instructions from other blocks, or whose users are phis or come from
4034 /// other blocks. The resulting vector instructions can be placed at the
4035 /// beginning of the basic block without scheduling (if the operands do not
4036 /// need to be scheduled) or at the end of the block (if the users are outside
4037 /// of the block). This saves some compile time and memory used by the
4038 /// compiler.
4039 /// ScheduleData is assigned to each instruction in between the boundaries of
4040 /// the tree entry, even to those that are not part of the graph. It is
4041 /// required to correctly follow the dependencies between the instructions and
4042 /// to schedule them correctly. ScheduleData is not allocated for
4043 /// instructions that do not require scheduling, like phis, nodes with
4044 /// extractelements/insertelements only, or nodes whose instructions have
4045 /// uses/operands outside of the block.
4046 struct BlockScheduling {
4047 BlockScheduling(BasicBlock *BB)
4048 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4049
4050 void clear() {
4051 ReadyInsts.clear();
4052 ScheduleStart = nullptr;
4053 ScheduleEnd = nullptr;
4054 FirstLoadStoreInRegion = nullptr;
4055 LastLoadStoreInRegion = nullptr;
4056 RegionHasStackSave = false;
4057
4058 // Reduce the maximum schedule region size by the size of the
4059 // previous scheduling run.
4060 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4061 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4062 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4063 ScheduleRegionSize = 0;
4064
4065 // Make a new scheduling region, i.e. all existing ScheduleData is not
4066 // in the new region yet.
4067 ++SchedulingRegionID;
4068 }
4069
4070 ScheduleData *getScheduleData(Instruction *I) {
4071 if (BB != I->getParent())
4072 // Avoid lookup if can't possibly be in map.
4073 return nullptr;
4074 ScheduleData *SD = ScheduleDataMap.lookup(I);
4075 if (SD && isInSchedulingRegion(SD))
4076 return SD;
4077 return nullptr;
4078 }
4079
4080 ScheduleData *getScheduleData(Value *V) {
4081 if (auto *I = dyn_cast<Instruction>(V))
4082 return getScheduleData(I);
4083 return nullptr;
4084 }
4085
4086 bool isInSchedulingRegion(ScheduleData *SD) const {
4087 return SD->SchedulingRegionID == SchedulingRegionID;
4088 }
4089
4090 /// Marks an instruction as scheduled and puts all dependent ready
4091 /// instructions into the ready-list.
4092 template <typename ReadyListType>
4093 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4094 SD->IsScheduled = true;
4095 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4096
4097 for (ScheduleData *BundleMember = SD; BundleMember;
4098 BundleMember = BundleMember->NextInBundle) {
4099
4100 // Handle the def-use chain dependencies.
4101
4102 // Decrement the unscheduled counter and insert to ready list if ready.
4103 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4104 ScheduleData *OpDef = getScheduleData(I);
4105 if (OpDef && OpDef->hasValidDependencies() &&
4106 OpDef->incrementUnscheduledDeps(-1) == 0) {
4107 // There are no more unscheduled dependencies after
4108 // decrementing, so we can put the dependent instruction
4109 // into the ready list.
4110 ScheduleData *DepBundle = OpDef->FirstInBundle;
4111 assert(!DepBundle->IsScheduled &&
4112 "already scheduled bundle gets ready");
4113 ReadyList.insert(DepBundle);
4115 << "SLP: gets ready (def): " << *DepBundle << "\n");
4116 }
4117 };
4118
4119 // If BundleMember is a vector bundle, its operands may have been
4120 // reordered during buildTree(). We therefore need to get its operands
4121 // through the TreeEntry.
4122 if (TreeEntry *TE = BundleMember->TE) {
4123 // Need to search for the lane since the tree entry can be reordered.
4124 int Lane = std::distance(TE->Scalars.begin(),
4125 find(TE->Scalars, BundleMember->Inst));
4126 assert(Lane >= 0 && "Lane not set");
4127
4128 // Since vectorization tree is being built recursively this assertion
4129 // ensures that the tree entry has all operands set before reaching
4130 // this code. Couple of exceptions known at the moment are extracts
4131 // where their second (immediate) operand is not added. Since
4132 // immediates do not affect scheduler behavior this is considered
4133 // okay.
4134 auto *In = BundleMember->Inst;
4135 assert(
4136 In &&
4137 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4138 In->getNumOperands() == TE->getNumOperands()) &&
4139 "Missed TreeEntry operands?");
4140 (void)In; // fake use to avoid build failure when assertions disabled
4141
4142 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4143 OpIdx != NumOperands; ++OpIdx)
4144 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4145 DecrUnsched(I);
4146 } else {
4147 // If BundleMember is a stand-alone instruction, no operand reordering
4148 // has taken place, so we directly access its operands.
4149 for (Use &U : BundleMember->Inst->operands())
4150 if (auto *I = dyn_cast<Instruction>(U.get()))
4151 DecrUnsched(I);
4152 }
4153 // Handle the memory dependencies.
4154 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4155 if (MemoryDepSD->hasValidDependencies() &&
4156 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4157 // There are no more unscheduled dependencies after decrementing,
4158 // so we can put the dependent instruction into the ready list.
4159 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4160 assert(!DepBundle->IsScheduled &&
4161 "already scheduled bundle gets ready");
4162 ReadyList.insert(DepBundle);
4164 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4165 }
4166 }
4167 // Handle the control dependencies.
4168 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4169 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4170 // There are no more unscheduled dependencies after decrementing,
4171 // so we can put the dependent instruction into the ready list.
4172 ScheduleData *DepBundle = DepSD->FirstInBundle;
4173 assert(!DepBundle->IsScheduled &&
4174 "already scheduled bundle gets ready");
4175 ReadyList.insert(DepBundle);
4177 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4178 }
4179 }
4180 }
4181 }
4182
4183 /// Verify basic self consistency properties of the data structure.
4184 void verify() {
4185 if (!ScheduleStart)
4186 return;
4187
4188 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4189 ScheduleStart->comesBefore(ScheduleEnd) &&
4190 "Not a valid scheduling region?");
4191
4192 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4193 auto *SD = getScheduleData(I);
4194 if (!SD)
4195 continue;
4196 assert(isInSchedulingRegion(SD) &&
4197 "primary schedule data not in window?");
4198 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4199 "entire bundle in window!");
4200 SD->verify();
4201 }
4202
4203 for (auto *SD : ReadyInsts) {
4204 assert(SD->isSchedulingEntity() && SD->isReady() &&
4205 "item in ready list not ready?");
4206 (void)SD;
4207 }
4208 }
4209
4210 /// Put all instructions into the ReadyList which are ready for scheduling.
4211 template <typename ReadyListType>
4212 void initialFillReadyList(ReadyListType &ReadyList) {
4213 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4214 ScheduleData *SD = getScheduleData(I);
4215 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4216 SD->isReady()) {
4217 ReadyList.insert(SD);
4219 << "SLP: initially in ready list: " << *SD << "\n");
4220 }
4221 }
4222 }
4223
4224 /// Build a bundle from the ScheduleData nodes corresponding to the
4225 /// scalar instruction for each lane.
4226 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4227
4228 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4229 /// cyclic dependencies. This is only a dry-run, no instructions are
4230 /// actually moved at this stage.
4231 /// \returns the scheduling bundle. The returned Optional value is not
4232 /// std::nullopt if \p VL is allowed to be scheduled.
4233 std::optional<ScheduleData *>
4234 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4235 const InstructionsState &S);
4236
4237 /// Un-bundles a group of instructions.
4238 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4239
4240 /// Allocates schedule data chunk.
4241 ScheduleData *allocateScheduleDataChunks();
4242
4243 /// Extends the scheduling region so that V is inside the region.
4244 /// \returns true if the region size is within the limit.
4245 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4246
4247 /// Initialize the ScheduleData structures for new instructions in the
4248 /// scheduling region.
4249 void initScheduleData(Instruction *FromI, Instruction *ToI,
4250 ScheduleData *PrevLoadStore,
4251 ScheduleData *NextLoadStore);
4252
4253 /// Updates the dependency information of a bundle and of all instructions/
4254 /// bundles which depend on the original bundle.
4255 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4256 BoUpSLP *SLP);
4257
4258 /// Sets all instructions in the scheduling region to un-scheduled.
4259 void resetSchedule();
4260
4261 BasicBlock *BB;
4262
4263 /// Simple memory allocation for ScheduleData.
4264 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4265
4266 /// The size of a ScheduleData array in ScheduleDataChunks.
4267 int ChunkSize;
4268
4269 /// The allocator position in the current chunk, which is the last entry
4270 /// of ScheduleDataChunks.
4271 int ChunkPos;
4272
4273 /// Attaches ScheduleData to Instruction.
4274 /// Note that the mapping survives during all vectorization iterations, i.e.
4275 /// ScheduleData structures are recycled.
4276 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4277
4278 /// The ready-list for scheduling (only used for the dry-run).
4279 SetVector<ScheduleData *> ReadyInsts;
4280
4281 /// The first instruction of the scheduling region.
4282 Instruction *ScheduleStart = nullptr;
4283
4284 /// The first instruction _after_ the scheduling region.
4285 Instruction *ScheduleEnd = nullptr;
4286
4287 /// The first memory accessing instruction in the scheduling region
4288 /// (can be null).
4289 ScheduleData *FirstLoadStoreInRegion = nullptr;
4290
4291 /// The last memory accessing instruction in the scheduling region
4292 /// (can be null).
4293 ScheduleData *LastLoadStoreInRegion = nullptr;
4294
4295 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4296 /// region? Used to optimize the dependence calculation for the
4297 /// common case where there isn't.
4298 bool RegionHasStackSave = false;
4299
4300 /// The current size of the scheduling region.
4301 int ScheduleRegionSize = 0;
4302
4303 /// The maximum size allowed for the scheduling region.
4304 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4305
4306 /// The ID of the scheduling region. For a new vectorization iteration this
4307 /// is incremented which "removes" all ScheduleData from the region.
4308 /// Make sure that the initial SchedulingRegionID is greater than the
4309 /// initial SchedulingRegionID in ScheduleData (which is 0).
4310 int SchedulingRegionID = 1;
4311 };
4312
4313 /// Attaches the BlockScheduling structures to basic blocks.
4315
4316 /// Performs the "real" scheduling. Done before vectorization is actually
4317 /// performed in a basic block.
4318 void scheduleBlock(BlockScheduling *BS);
4319
4320 /// List of users to ignore during scheduling and that don't need extracting.
4321 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4322
4323 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4324 /// sorted SmallVectors of unsigned.
4325 struct OrdersTypeDenseMapInfo {
4326 static OrdersType getEmptyKey() {
4327 OrdersType V;
4328 V.push_back(~1U);
4329 return V;
4330 }
4331
4332 static OrdersType getTombstoneKey() {
4333 OrdersType V;
4334 V.push_back(~2U);
4335 return V;
4336 }
4337
4338 static unsigned getHashValue(const OrdersType &V) {
4339 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4340 }
4341
4342 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4343 return LHS == RHS;
4344 }
4345 };
4346
4347 // Analysis and block reference.
4348 Function *F;
4349 ScalarEvolution *SE;
4350 TargetTransformInfo *TTI;
4351 TargetLibraryInfo *TLI;
4352 LoopInfo *LI;
4353 DominatorTree *DT;
4354 AssumptionCache *AC;
4355 DemandedBits *DB;
4356 const DataLayout *DL;
4358
4359 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4360 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4361
4362 /// Instruction builder to construct the vectorized tree.
4364
4365 /// A map of scalar integer values to the smallest bit width with which they
4366 /// can legally be represented. The values map to (width, signed) pairs,
4367 /// where "width" indicates the minimum bit width and "signed" is True if the
4368 /// value must be signed-extended, rather than zero-extended, back to its
4369 /// original width.
4371
4372 /// Final size of the reduced vector, if the current graph represents the
4373 /// input for the reduction and it was possible to narrow the size of the
4374 /// reduction.
4375 unsigned ReductionBitWidth = 0;
4376
4377 /// Canonical graph size before the transformations.
4378 unsigned BaseGraphSize = 1;
4379
4380 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4381 /// type sizes, used in the tree.
4382 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4383
4384 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4385 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4386 DenseSet<unsigned> ExtraBitWidthNodes;
4387};
4388
4389} // end namespace slpvectorizer
4390
4391template <> struct GraphTraits<BoUpSLP *> {
4392 using TreeEntry = BoUpSLP::TreeEntry;
4393
4394 /// NodeRef has to be a pointer per the GraphWriter.
4395 using NodeRef = TreeEntry *;
4396
4397 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4398
4399 /// Add the VectorizableTree to the index iterator to be able to return
4400 /// TreeEntry pointers.
4401 struct ChildIteratorType
4402 : public iterator_adaptor_base<
4403 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4404 ContainerTy &VectorizableTree;
4405
4406 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4407 ContainerTy &VT)
4408 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4409
4410 NodeRef operator*() { return I->UserTE; }
4411 };
4412
4413 static NodeRef getEntryNode(BoUpSLP &R) {
4414 return R.VectorizableTree[0].get();
4415 }
4416
4417 static ChildIteratorType child_begin(NodeRef N) {
4418 return {N->UserTreeIndices.begin(), N->Container};
4419 }
4420
4421 static ChildIteratorType child_end(NodeRef N) {
4422 return {N->UserTreeIndices.end(), N->Container};
4423 }
4424
4425 /// For the node iterator we just need to turn the TreeEntry iterator into a
4426 /// TreeEntry* iterator so that it dereferences to NodeRef.
4427 class nodes_iterator {
4428 using ItTy = ContainerTy::iterator;
4429 ItTy It;
4430
4431 public:
4432 nodes_iterator(const ItTy &It2) : It(It2) {}
4433 NodeRef operator*() { return It->get(); }
4434 nodes_iterator operator++() {
4435 ++It;
4436 return *this;
4437 }
4438 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4439 };
4440
4441 static nodes_iterator nodes_begin(BoUpSLP *R) {
4442 return nodes_iterator(R->VectorizableTree.begin());
4443 }
4444
4445 static nodes_iterator nodes_end(BoUpSLP *R) {
4446 return nodes_iterator(R->VectorizableTree.end());
4447 }
4448
4449 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4450};
4451
4452template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4453 using TreeEntry = BoUpSLP::TreeEntry;
4454
4455 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4456
4457 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4458 std::string Str;
4459 raw_string_ostream OS(Str);
4460 OS << Entry->Idx << ".\n";
4461 if (isSplat(Entry->Scalars))
4462 OS << "<splat> ";
4463 for (auto *V : Entry->Scalars) {
4464 OS << *V;
4465 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4466 return EU.Scalar == V;
4467 }))
4468 OS << " <extract>";
4469 OS << "\n";
4470 }
4471 return Str;
4472 }
4473
4474 static std::string getNodeAttributes(const TreeEntry *Entry,
4475 const BoUpSLP *) {
4476 if (Entry->isGather())
4477 return "color=red";
4478 if (Entry->State == TreeEntry::ScatterVectorize ||
4479 Entry->State == TreeEntry::StridedVectorize)
4480 return "color=blue";
4481 return "";
4482 }
4483};
4484
4485} // end namespace llvm
4486
4487 BoUpSLP::~BoUpSLP() {
4488 SmallVector<WeakTrackingVH> DeadInsts;
4489 for (auto *I : DeletedInstructions) {
4490 if (!I->getParent()) {
4491 // Temporarily insert the instruction back so it can be erased from its
4492 // parent and from memory later.
4493 if (isa<PHINode>(I))
4494 // Phi nodes must be the very first instructions in the block.
4495 I->insertBefore(F->getEntryBlock(),
4496 F->getEntryBlock().getFirstNonPHIIt());
4497 else
4498 I->insertBefore(F->getEntryBlock().getTerminator());
4499 continue;
4500 }
4501 for (Use &U : I->operands()) {
4502 auto *Op = dyn_cast<Instruction>(U.get());
4503 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4504 wouldInstructionBeTriviallyDead(Op, TLI))
4505 DeadInsts.emplace_back(Op);
4506 }
4507 I->dropAllReferences();
4508 }
4509 for (auto *I : DeletedInstructions) {
4510 assert(I->use_empty() &&
4511 "trying to erase instruction with users.");
4512 I->eraseFromParent();
4513 }
4514
4515 // Cleanup any dead scalar code feeding the vectorized instructions
4517
4518#ifdef EXPENSIVE_CHECKS
4519 // If we could guarantee that this call is not extremely slow, we could
4520 // remove the ifdef limitation (see PR47712).
4521 assert(!verifyFunction(*F, &dbgs()));
4522#endif
4523}
4524
4525/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4526 /// contains the original mask for the scalars reused in the node. The
4527 /// procedure transforms this mask in accordance with the given \p Mask.
4528 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4529 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4530 "Expected non-empty mask.");
4531 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4532 Prev.swap(Reuses);
4533 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4534 if (Mask[I] != PoisonMaskElem)
4535 Reuses[Mask[I]] = Prev[I];
4536}
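// Usage sketch (illustrative):
//   SmallVector<int> Reuses = {1, 0, 3, 2};
//   reorderReuses(Reuses, /*Mask=*/{2, 3, 0, 1});
//   // Each original value Prev[I] is moved to position Mask[I], so Reuses is
//   // now {3, 2, 1, 0}.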
4537
4538/// Reorders the given \p Order according to the given \p Mask. \p Order - is
4539/// the original order of the scalars. Procedure transforms the provided order
4540/// in accordance with the given \p Mask. If the resulting \p Order is just an
4541/// identity order, \p Order is cleared.
4542 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4543 bool BottomOrder = false) {
4544 assert(!Mask.empty() && "Expected non-empty mask.");
4545 unsigned Sz = Mask.size();
4546 if (BottomOrder) {
4547 SmallVector<unsigned> PrevOrder;
4548 if (Order.empty()) {
4549 PrevOrder.resize(Sz);
4550 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4551 } else {
4552 PrevOrder.swap(Order);
4553 }
4554 Order.assign(Sz, Sz);
4555 for (unsigned I = 0; I < Sz; ++I)
4556 if (Mask[I] != PoisonMaskElem)
4557 Order[I] = PrevOrder[Mask[I]];
4558 if (all_of(enumerate(Order), [&](const auto &Data) {
4559 return Data.value() == Sz || Data.index() == Data.value();
4560 })) {
4561 Order.clear();
4562 return;
4563 }
4564 fixupOrderingIndices(Order);
4565 return;
4566 }
4567 SmallVector<int> MaskOrder;
4568 if (Order.empty()) {
4569 MaskOrder.resize(Sz);
4570 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4571 } else {
4572 inversePermutation(Order, MaskOrder);
4573 }
4574 reorderReuses(MaskOrder, Mask);
4575 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4576 Order.clear();
4577 return;
4578 }
4579 Order.assign(Sz, Sz);
4580 for (unsigned I = 0; I < Sz; ++I)
4581 if (MaskOrder[I] != PoisonMaskElem)
4582 Order[MaskOrder[I]] = I;
4583 fixupOrderingIndices(Order);
4584}
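// Usage sketch (illustrative):
//   SmallVector<unsigned> Order; // empty == identity order
//   reorderOrder(Order, /*Mask=*/{1, 0, 3, 2});
//   // Order becomes {1, 0, 3, 2}; if the result had been an identity order it
//   // would have been cleared instead.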
4585
4586std::optional<BoUpSLP::OrdersType>
4587BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4588 assert(TE.isGather() && "Expected gather node only.");
4589 // Try to find subvector extract/insert patterns and reorder only such
4590 // patterns.
4591 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4592 Type *ScalarTy = GatheredScalars.front()->getType();
4593 int NumScalars = GatheredScalars.size();
4594 if (!isValidElementType(ScalarTy))
4595 return std::nullopt;
4596 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4597 int NumParts = TTI->getNumberOfParts(VecTy);
4598 if (NumParts == 0 || NumParts >= NumScalars ||
4599 VecTy->getNumElements() % NumParts != 0 ||
4600 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4601 VecTy->getNumElements() / NumParts))
4602 NumParts = 1;
4603 SmallVector<int> ExtractMask;
4604 SmallVector<int> Mask;
4605 SmallVector<SmallVector<const TreeEntry *>> Entries;
4606 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4607 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4608 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4609 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4610 /*ForOrder=*/true);
4611 // No shuffled operands - ignore.
4612 if (GatherShuffles.empty() && ExtractShuffles.empty())
4613 return std::nullopt;
4614 OrdersType CurrentOrder(NumScalars, NumScalars);
4615 if (GatherShuffles.size() == 1 &&
4616 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4617 Entries.front().front()->isSame(TE.Scalars)) {
4618 // Perfect match in the graph, will reuse the previously vectorized
4619 // node. Cost is 0.
4620 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4621 return CurrentOrder;
4622 }
4623 auto IsSplatMask = [](ArrayRef<int> Mask) {
4624 int SingleElt = PoisonMaskElem;
4625 return all_of(Mask, [&](int I) {
4626 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4627 SingleElt = I;
4628 return I == PoisonMaskElem || I == SingleElt;
4629 });
4630 };
4631 // Exclusive broadcast mask - ignore.
4632 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4633 (Entries.size() != 1 ||
4634 Entries.front().front()->ReorderIndices.empty())) ||
4635 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4636 return std::nullopt;
4637 SmallBitVector ShuffledSubMasks(NumParts);
4638 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4639 ArrayRef<int> Mask, int PartSz, int NumParts,
4640 function_ref<unsigned(unsigned)> GetVF) {
4641 for (int I : seq<int>(0, NumParts)) {
4642 if (ShuffledSubMasks.test(I))
4643 continue;
4644 const int VF = GetVF(I);
4645 if (VF == 0)
4646 continue;
4647 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4648 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4649 // Shuffle of at least 2 vectors - ignore.
4650 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4651 std::fill(Slice.begin(), Slice.end(), NumScalars);
4652 ShuffledSubMasks.set(I);
4653 continue;
4654 }
4655 // Try to include as many elements from the mask as possible.
4656 int FirstMin = INT_MAX;
4657 bool SecondVecFound = false;
4658 for (int K : seq<int>(Limit)) {
4659 int Idx = Mask[I * PartSz + K];
4660 if (Idx == PoisonMaskElem) {
4661 Value *V = GatheredScalars[I * PartSz + K];
4662 if (isConstant(V) && !isa<PoisonValue>(V)) {
4663 SecondVecFound = true;
4664 break;
4665 }
4666 continue;
4667 }
4668 if (Idx < VF) {
4669 if (FirstMin > Idx)
4670 FirstMin = Idx;
4671 } else {
4672 SecondVecFound = true;
4673 break;
4674 }
4675 }
4676 FirstMin = (FirstMin / PartSz) * PartSz;
4677 // Shuffle of at least 2 vectors - ignore.
4678 if (SecondVecFound) {
4679 std::fill(Slice.begin(), Slice.end(), NumScalars);
4680 ShuffledSubMasks.set(I);
4681 continue;
4682 }
4683 for (int K : seq<int>(Limit)) {
4684 int Idx = Mask[I * PartSz + K];
4685 if (Idx == PoisonMaskElem)
4686 continue;
4687 Idx -= FirstMin;
4688 if (Idx >= PartSz) {
4689 SecondVecFound = true;
4690 break;
4691 }
4692 if (CurrentOrder[I * PartSz + Idx] >
4693 static_cast<unsigned>(I * PartSz + K) &&
4694 CurrentOrder[I * PartSz + Idx] !=
4695 static_cast<unsigned>(I * PartSz + Idx))
4696 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4697 }
4698 // Shuffle of at least 2 vectors - ignore.
4699 if (SecondVecFound) {
4700 std::fill(Slice.begin(), Slice.end(), NumScalars);
4701 ShuffledSubMasks.set(I);
4702 continue;
4703 }
4704 }
4705 };
4706 int PartSz = getPartNumElems(NumScalars, NumParts);
4707 if (!ExtractShuffles.empty())
4708 TransformMaskToOrder(
4709 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4710 if (!ExtractShuffles[I])
4711 return 0U;
4712 unsigned VF = 0;
4713 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4714 for (unsigned Idx : seq<unsigned>(Sz)) {
4715 int K = I * PartSz + Idx;
4716 if (ExtractMask[K] == PoisonMaskElem)
4717 continue;
4718 if (!TE.ReuseShuffleIndices.empty())
4719 K = TE.ReuseShuffleIndices[K];
4720 if (K == PoisonMaskElem)
4721 continue;
4722 if (!TE.ReorderIndices.empty())
4723 K = std::distance(TE.ReorderIndices.begin(),
4724 find(TE.ReorderIndices, K));
4725 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4726 if (!EI)
4727 continue;
4728 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4729 ->getElementCount()
4730 .getKnownMinValue());
4731 }
4732 return VF;
4733 });
4734 // Check special corner case - single shuffle of the same entry.
4735 if (GatherShuffles.size() == 1 && NumParts != 1) {
4736 if (ShuffledSubMasks.any())
4737 return std::nullopt;
4738 PartSz = NumScalars;
4739 NumParts = 1;
4740 }
4741 if (!Entries.empty())
4742 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4743 if (!GatherShuffles[I])
4744 return 0U;
4745 return std::max(Entries[I].front()->getVectorFactor(),
4746 Entries[I].back()->getVectorFactor());
4747 });
4748 int NumUndefs =
4749 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4750 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4751 return std::nullopt;
4752 return std::move(CurrentOrder);
4753}
4754
4755static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4756 const TargetLibraryInfo &TLI,
4757 bool CompareOpcodes = true) {
4760 return false;
4761 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4762 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4763 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4764 (!GEP2 || GEP2->getNumOperands() == 2) &&
4765 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4766 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4767 !CompareOpcodes ||
4768 (GEP1 && GEP2 &&
4769 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4770 .getOpcode()));
4771}
4772
4773 /// Calculates the minimal alignment common to all values in \p VL.
4774 template <typename T>
4775 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4776 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4777 for (Value *V : VL.drop_front())
4778 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4779 return CommonAlignment;
4780}
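// Usage sketch (illustrative): for a bundle of loads with alignments 16, 8 and
// 4, computeCommonAlignment<LoadInst>(VL) returns Align(4), the minimum over
// the bundle.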
4781
4782/// Check if \p Order represents reverse order.
4783 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4784 assert(!Order.empty() &&
4785 "Order is empty. Please check it before using isReverseOrder.");
4786 unsigned Sz = Order.size();
4787 return all_of(enumerate(Order), [&](const auto &Pair) {
4788 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4789 });
4790}
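// Illustrative examples: for Order.size() == 4, {3, 2, 1, 0} is a reverse
// order, and so is {3, 4, 1, 0} because entries equal to the size are treated
// as "any position"; {0, 1, 2, 3} is not.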
4791
4792/// Checks if the provided list of pointers \p Pointers represents the strided
4793/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4794 /// Otherwise, if \p Inst is not specified, just an initialized optional value
4795 /// is returned to show that the pointers represent strided pointers. If \p Inst
4796 /// is specified, the runtime stride is materialized before the given \p Inst.
4797 /// \returns std::nullopt if the pointers are not strided with a runtime stride;
4798 /// otherwise returns nullptr or the actual stride value.
4799static std::optional<Value *>
4800 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4801 const DataLayout &DL, ScalarEvolution &SE,
4802 SmallVectorImpl<unsigned> &SortedIndices,
4803 Instruction *Inst = nullptr) {
4804 SmallVector<const SCEV *> SCEVs;
4805 const SCEV *PtrSCEVLowest = nullptr;
4806 const SCEV *PtrSCEVHighest = nullptr;
4807 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4808 // addresses).
4809 for (Value *Ptr : PointerOps) {
4810 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4811 if (!PtrSCEV)
4812 return std::nullopt;
4813 SCEVs.push_back(PtrSCEV);
4814 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4815 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4816 continue;
4817 }
4818 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4819 if (isa<SCEVCouldNotCompute>(Diff))
4820 return std::nullopt;
4821 if (Diff->isNonConstantNegative()) {
4822 PtrSCEVLowest = PtrSCEV;
4823 continue;
4824 }
4825 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4826 if (isa<SCEVCouldNotCompute>(Diff1))
4827 return std::nullopt;
4828 if (Diff1->isNonConstantNegative()) {
4829 PtrSCEVHighest = PtrSCEV;
4830 continue;
4831 }
4832 }
4833 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4834 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4835 if (isa<SCEVCouldNotCompute>(Dist))
4836 return std::nullopt;
4837 int Size = DL.getTypeStoreSize(ElemTy);
4838 auto TryGetStride = [&](const SCEV *Dist,
4839 const SCEV *Multiplier) -> const SCEV * {
4840 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4841 if (M->getOperand(0) == Multiplier)
4842 return M->getOperand(1);
4843 if (M->getOperand(1) == Multiplier)
4844 return M->getOperand(0);
4845 return nullptr;
4846 }
4847 if (Multiplier == Dist)
4848 return SE.getConstant(Dist->getType(), 1);
4849 return SE.getUDivExactExpr(Dist, Multiplier);
4850 };
4851 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4852 const SCEV *Stride = nullptr;
4853 if (Size != 1 || SCEVs.size() > 2) {
4854 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4855 Stride = TryGetStride(Dist, Sz);
4856 if (!Stride)
4857 return std::nullopt;
4858 }
4859 if (!Stride || isa<SCEVConstant>(Stride))
4860 return std::nullopt;
4861 // Iterate through all pointers and check if all distances are
4862 // unique multiples of Stride.
4863 using DistOrdPair = std::pair<int64_t, int>;
4864 auto Compare = llvm::less_first();
4865 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4866 int Cnt = 0;
4867 bool IsConsecutive = true;
4868 for (const SCEV *PtrSCEV : SCEVs) {
4869 unsigned Dist = 0;
4870 if (PtrSCEV != PtrSCEVLowest) {
4871 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4872 const SCEV *Coeff = TryGetStride(Diff, Stride);
4873 if (!Coeff)
4874 return std::nullopt;
4875 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4876 if (!SC || isa<SCEVCouldNotCompute>(SC))
4877 return std::nullopt;
4878 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4879 SE.getMulExpr(Stride, SC)))
4880 ->isZero())
4881 return std::nullopt;
4882 Dist = SC->getAPInt().getZExtValue();
4883 }
4884 // If the strides are not the same or repeated, we can't vectorize.
4885 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4886 return std::nullopt;
4887 auto Res = Offsets.emplace(Dist, Cnt);
4888 if (!Res.second)
4889 return std::nullopt;
4890 // Consecutive order if the inserted element is the last one.
4891 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4892 ++Cnt;
4893 }
4894 if (Offsets.size() != SCEVs.size())
4895 return std::nullopt;
4896 SortedIndices.clear();
4897 if (!IsConsecutive) {
4898 // Fill SortedIndices array only if it is non-consecutive.
4899 SortedIndices.resize(PointerOps.size());
4900 Cnt = 0;
4901 for (const std::pair<int64_t, int> &Pair : Offsets) {
4902 SortedIndices[Cnt] = Pair.second;
4903 ++Cnt;
4904 }
4905 }
4906 if (!Inst)
4907 return nullptr;
4908 SCEVExpander Expander(SE, DL, "strided-load-vec");
4909 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4910}
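// Illustrative example: for four i32 loads whose pointers are
//   p, p + 4 * %n, p + 8 * %n, p + 12 * %n   (byte offsets, %n a runtime value)
// Dist is 12 * %n and Size * (num_elems - 1) is 12, so the recovered stride is
// %n elements. SortedIndices stays empty because the pointers are already in
// stride order, and with a non-null Inst the stride %n is expanded before it.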
4911
4912static std::pair<InstructionCost, InstructionCost>
4913 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4914 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4915 Type *ScalarTy, VectorType *VecTy);
4916
4917/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4918 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
4919/// subvector pattern.
4920static InstructionCost
4921 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4922 VectorType *Tp, ArrayRef<int> Mask = {},
4923 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4924 int Index = 0, VectorType *SubTp = nullptr,
4925 ArrayRef<const Value *> Args = {}) {
4926 if (Kind != TTI::SK_PermuteTwoSrc)
4927 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4928 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4929 int NumSubElts;
4930 if (ShuffleVectorInst::isInsertSubvectorMask(
4931 Mask, NumSrcElts, NumSubElts, Index)) {
4932 if (Index + NumSubElts > NumSrcElts &&
4933 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4934 return TTI.getShuffleCost(
4935 TTI::SK_InsertSubvector,
4936 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4937 TTI::TCK_RecipThroughput, Index, Tp);
4938 }
4939 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4940}
4941
4942 BoUpSLP::LoadsState
4943 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
4944 SmallVectorImpl<unsigned> &Order,
4945 SmallVectorImpl<Value *> &PointerOps,
4946 unsigned *BestVF, bool TryRecursiveCheck) const {
4947 // Check that a vectorized load would load the same memory as a scalar
4948 // load. For example, we don't want to vectorize loads that are smaller
4949 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4950 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4951 // from such a struct, we would read/write packed bits, disagreeing with the
4952 // unvectorized version.
4953 if (BestVF)
4954 *BestVF = 0;
4956 return LoadsState::Gather;
4957 Type *ScalarTy = VL0->getType();
4958
4959 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4960 return LoadsState::Gather;
4961
4962 // Make sure all loads in the bundle are simple - we can't vectorize
4963 // atomic or volatile loads.
4964 PointerOps.clear();
4965 const unsigned Sz = VL.size();
4966 PointerOps.resize(Sz);
4967 auto *POIter = PointerOps.begin();
4968 for (Value *V : VL) {
4969 auto *L = dyn_cast<LoadInst>(V);
4970 if (!L || !L->isSimple())
4971 return LoadsState::Gather;
4972 *POIter = L->getPointerOperand();
4973 ++POIter;
4974 }
4975
4976 Order.clear();
4977 // Check the order of pointer operands or that all pointers are the same.
4978 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4979
4980 auto *VecTy = getWidenedType(ScalarTy, Sz);
4981 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4982 if (!IsSorted) {
4983 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
4984 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4985 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4986 return LoadsState::StridedVectorize;
4987 }
4988
4989 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
4990 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
4991 return LoadsState::Gather;
4992
4993 if (!all_of(PointerOps, [&](Value *P) {
4994 return arePointersCompatible(P, PointerOps.front(), *TLI);
4995 }))
4996 return LoadsState::Gather;
4997
4998 } else {
4999 Value *Ptr0;
5000 Value *PtrN;
5001 if (Order.empty()) {
5002 Ptr0 = PointerOps.front();
5003 PtrN = PointerOps.back();
5004 } else {
5005 Ptr0 = PointerOps[Order.front()];
5006 PtrN = PointerOps[Order.back()];
5007 }
5008 std::optional<int> Diff =
5009 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5010 // Check that the sorted loads are consecutive.
5011 if (static_cast<unsigned>(*Diff) == Sz - 1)
5012 return LoadsState::Vectorize;
5013 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5014 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5015 return LoadsState::Gather;
5016 // Simple check if not a strided access - clear order.
5017 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5018 // Try to generate strided load node if:
5019 // 1. Target with strided load support is detected.
5020 // 2. The number of loads is greater than MinProfitableStridedLoads,
5021 // or the potential stride <= MaxProfitableLoadStride and the
5022 // potential stride is power-of-2 (to avoid perf regressions for the very
5023 // small number of loads) and max distance > number of loads, or potential
5024 // stride is -1.
5025 // 3. The loads are ordered, or number of unordered loads <=
5026 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5027 // (this check is to avoid extra costs for very expensive shuffles).
5028 // 4. Any pointer operand is an instruction with the users outside of the
5029 // current graph (for masked gathers extra extractelement instructions
5030 // might be required).
5031 auto IsAnyPointerUsedOutGraph =
5032 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5033 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5034 return !getTreeEntry(U) && !MustGather.contains(U);
5035 });
5036 });
5037 const unsigned AbsoluteDiff = std::abs(*Diff);
5038 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5039 ((Sz > MinProfitableStridedLoads ||
5040 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5041 has_single_bit(AbsoluteDiff))) &&
5042 AbsoluteDiff > Sz) ||
5043 *Diff == -(static_cast<int>(Sz) - 1))) {
5044 int Stride = *Diff / static_cast<int>(Sz - 1);
5045 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5046 Align Alignment =
5047 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5048 ->getAlign();
5049 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5050 // Iterate through all pointers and check if all distances are
5051 // unique multiple of Dist.
5052 SmallSet<int, 4> Dists;
5053 for (Value *Ptr : PointerOps) {
5054 int Dist = 0;
5055 if (Ptr == PtrN)
5056 Dist = *Diff;
5057 else if (Ptr != Ptr0)
5058 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5059 // If the strides are not the same or repeated, we can't
5060 // vectorize.
5061 if (((Dist / Stride) * Stride) != Dist ||
5062 !Dists.insert(Dist).second)
5063 break;
5064 }
5065 if (Dists.size() == Sz)
5066 return LoadsState::StridedVectorize;
5067 }
5068 }
5069 }
5070 }
5071 // Compare the cost of loads + shuffles with the cost of strided/masked
5072 // gather loads. Returns true if the vectorized + shuffles
5073 // representation is better than just gather.
5074 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5075 unsigned *BestVF,
5076 bool ProfitableGatherPointers) {
5077 if (BestVF)
5078 *BestVF = 0;
5079 // Compare masked gather cost and loads + insert subvector costs.
5080 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5081 auto [ScalarGEPCost, VectorGEPCost] =
5082 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5083 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5084 // Estimate the cost of masked gather GEP. If not a splat, roughly
5085 // estimate as a buildvector, otherwise estimate as splat.
5086 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5087 VectorType *PtrVecTy =
5088 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5089 VecTy->getNumElements());
5090 if (static_cast<unsigned>(count_if(
5091 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5092 any_of(PointerOps, [&](Value *V) {
5093 return getUnderlyingObject(V) !=
5094 getUnderlyingObject(PointerOps.front());
5095 }))
5096 VectorGEPCost += TTI.getScalarizationOverhead(
5097 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5098 else
5099 VectorGEPCost +=
5100 TTI.getScalarizationOverhead(
5101 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5102 /*Insert=*/true, /*Extract=*/false, CostKind) +
5103 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
5104 // The cost of scalar loads.
5105 InstructionCost ScalarLoadsCost =
5106 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5107 [&](InstructionCost C, Value *V) {
5108 return C + TTI.getInstructionCost(
5109 cast<Instruction>(V), CostKind);
5110 }) +
5111 ScalarGEPCost;
5112 // The cost of masked gather.
5113 InstructionCost MaskedGatherCost =
5114 TTI.getGatherScatterOpCost(
5115 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5116 /*VariableMask=*/false, CommonAlignment, CostKind) +
5117 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5118 InstructionCost GatherCost =
5119 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5120 /*Extract=*/false, CostKind) +
5121 ScalarLoadsCost;
5122 // The list of loads is small, or we already performed the partial check -
5123 // directly compare the masked gather cost and the gather cost.
5124 constexpr unsigned ListLimit = 4;
5125 if (!TryRecursiveCheck || VL.size() < ListLimit)
5126 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5127
5128 // FIXME: The following code has not been updated for non-power-of-2
5129 // vectors. The splitting logic here does not cover the original
5130 // vector if the vector factor is not a power of two. FIXME
5131 if (!has_single_bit(VL.size()))
5132 return false;
5133
5134 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5135 unsigned MinVF = getMinVF(2 * Sz);
5136 DemandedElts.clearAllBits();
5137 // Iterate through possible vectorization factors and check if vectorized +
5138 // shuffles is better than just gather.
5139 for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5140 SmallVector<LoadsState> States;
5141 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5142 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5143 SmallVector<unsigned> Order;
5144 SmallVector<Value *> PointerOps;
5145 LoadsState LS =
5146 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5147 /*TryRecursiveCheck=*/false);
5148 // Check that the sorted loads are consecutive.
5149 if (LS == LoadsState::Gather) {
5150 if (BestVF) {
5151 DemandedElts.setAllBits();
5152 break;
5153 }
5154 DemandedElts.setBits(Cnt, Cnt + VF);
5155 continue;
5156 }
5157 // If the reorder is needed - consider it a high-cost masked gather for now.
5158 if ((LS == LoadsState::Vectorize ||
5159 LS == LoadsState::StridedVectorize) &&
5160 !Order.empty() && !isReverseOrder(Order))
5161 LS = LoadsState::ScatterVectorize;
5162 States.push_back(LS);
5163 }
5164 if (DemandedElts.isAllOnes())
5165 // All loads gathered - try smaller VF.
5166 continue;
5167 // Can be vectorized later as a series of loads/insertelements.
5168 InstructionCost VecLdCost = 0;
5169 if (!DemandedElts.isZero()) {
5170 VecLdCost =
5171 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5172 /*Extract=*/false, CostKind) +
5173 ScalarGEPCost;
5174 for (unsigned Idx : seq<unsigned>(VL.size()))
5175 if (DemandedElts[Idx])
5176 VecLdCost +=
5177 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5178 }
5179 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5180 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5181 for (auto [I, LS] : enumerate(States)) {
5182 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5183 InstructionCost VectorGEPCost =
5184 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5185 ? 0
5186 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5187 LI0->getPointerOperand(),
5188 Instruction::GetElementPtr, CostKind, ScalarTy,
5189 SubVecTy)
5190 .second;
5191 if (LS == LoadsState::ScatterVectorize) {
5192 if (static_cast<unsigned>(
5193 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5194 PointerOps.size() - 1 ||
5195 any_of(PointerOps, [&](Value *V) {
5196 return getUnderlyingObject(V) !=
5197 getUnderlyingObject(PointerOps.front());
5198 }))
5199 VectorGEPCost += TTI.getScalarizationOverhead(
5200 SubVecTy, APInt::getAllOnes(VF),
5201 /*Insert=*/true, /*Extract=*/false, CostKind);
5202 else
5203 VectorGEPCost +=
5205 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5206 /*Insert=*/true, /*Extract=*/false, CostKind) +
5207 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5208 CostKind);
5209 }
5210 switch (LS) {
5211 case LoadsState::Vectorize:
5212 VecLdCost +=
5213 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5214 LI0->getPointerAddressSpace(), CostKind,
5215 TTI::OperandValueInfo()) +
5216 VectorGEPCost;
5217 break;
5218 case LoadsState::StridedVectorize:
5219 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5220 LI0->getPointerOperand(),
5221 /*VariableMask=*/false,
5222 CommonAlignment, CostKind) +
5223 VectorGEPCost;
5224 break;
5225 case LoadsState::ScatterVectorize:
5226 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5227 LI0->getPointerOperand(),
5228 /*VariableMask=*/false,
5229 CommonAlignment, CostKind) +
5230 VectorGEPCost;
5231 break;
5232 case LoadsState::Gather:
5233 // Gathers are already calculated - ignore.
5234 continue;
5235 }
5236 SmallVector<int> ShuffleMask(VL.size());
5237 for (int Idx : seq<int>(0, VL.size()))
5238 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5239 if (I > 0)
5240 VecLdCost +=
5241 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5242 CostKind, I * VF, SubVecTy);
5243 }
5244 // If masked gather cost is higher - better to vectorize, so
5245 // consider it as a gather node. It will be better estimated
5246 // later.
5247 if (MaskedGatherCost >= VecLdCost &&
5248 VecLdCost - GatherCost < -SLPCostThreshold) {
5249 if (BestVF)
5250 *BestVF = VF;
5251 return true;
5252 }
5253 }
5254 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5255 };
5256 // TODO: need to improve analysis of the pointers, if not all of them are
5257 // GEPs or have > 2 operands, we end up with a gather node, which just
5258 // increases the cost.
5259 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5260 bool ProfitableGatherPointers =
5261 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5262 return L->isLoopInvariant(V);
5263 })) <= Sz / 2;
5264 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5265 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5266 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5267 (GEP && GEP->getNumOperands() == 2 &&
5268 isa<Constant, Instruction>(GEP->getOperand(1)));
5269 })) {
5270 // Check if potential masked gather can be represented as series
5271 // of loads + insertsubvectors.
5272 // If masked gather cost is higher - better to vectorize, so
5273 // consider it as a gather node. It will be better estimated
5274 // later.
5275 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5276 ProfitableGatherPointers))
5277 return LoadsState::ScatterVectorize;
5278 }
5279
5280 return LoadsState::Gather;
5281}
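// Illustrative example: a bundle of four simple i32 loads whose pointers differ
// by 0, 1, 2 and 3 elements has a sorted pointer distance of VL.size() - 1 and
// is classified as LoadsState::Vectorize; non-consecutive bundles may instead
// become StridedVectorize, ScatterVectorize or Gather, depending on target
// support and the cost checks above.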
5282
5283 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
5284 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5285 const DataLayout &DL, ScalarEvolution &SE,
5286 SmallVectorImpl<unsigned> &SortedIndices) {
5287 assert(
5288 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5289 "Expected list of pointer operands.");
5290 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
5291 // Ptr into, sort and return the sorted indices with values next to one
5292 // another.
5295 Bases;
5296 Bases
5297 .try_emplace(std::make_pair(
5299 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5300
5301 SortedIndices.clear();
5302 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5303 auto Key = std::make_pair(BBs[Cnt + 1],
5305 bool Found = any_of(Bases.try_emplace(Key).first->second,
5306 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5307 std::optional<int> Diff = getPointersDiff(
5308 ElemTy, std::get<0>(Base.front()), ElemTy,
5309 Ptr, DL, SE,
5310 /*StrictCheck=*/true);
5311 if (!Diff)
5312 return false;
5313
5314 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5315 return true;
5316 });
5317
5318 if (!Found) {
5319 // If we haven't found enough to usefully cluster, return early.
5320 if (Bases.size() > VL.size() / 2 - 1)
5321 return false;
5322
5323 // Not found already - add a new Base
5324 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5325 }
5326 }
5327
5328 if (Bases.size() == VL.size())
5329 return false;
5330
5331 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5332 Bases.front().second.size() == VL.size()))
5333 return false;
5334
5335 // For each of the bases, sort the pointers by offset and check if any of the
5336 // bases become consecutively allocated.
5337 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5338 SmallPtrSet<Value *, 13> FirstPointers;
5339 SmallPtrSet<Value *, 13> SecondPointers;
5340 Value *P1 = Ptr1;
5341 Value *P2 = Ptr2;
5342 if (P1 == P2)
5343 return false;
5344 unsigned Depth = 0;
5345 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) &&
5346 Depth <= RecursionMaxDepth) {
5347 FirstPointers.insert(P1);
5348 SecondPointers.insert(P2);
5349 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5350 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5351 ++Depth;
5352 }
5353 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5354 "Unable to find matching root.");
5355 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5356 };
5357 for (auto &Base : Bases) {
5358 for (auto &Vec : Base.second) {
5359 if (Vec.size() > 1) {
5360 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5361 const std::tuple<Value *, int, unsigned> &Y) {
5362 return std::get<1>(X) < std::get<1>(Y);
5363 });
5364 int InitialOffset = std::get<1>(Vec[0]);
5365 bool AnyConsecutive =
5366 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5367 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5368 });
5369 // Fill the SortedIndices array only if it looks worthwhile to sort
5370 // the pointers.
5371 if (!AnyConsecutive)
5372 return false;
5373 }
5374 }
5375 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5376 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5377 });
5378 }
5379
5380 for (auto &T : Bases)
5381 for (const auto &Vec : T.second)
5382 for (const auto &P : Vec)
5383 SortedIndices.push_back(std::get<2>(P));
5384
5385 assert(SortedIndices.size() == VL.size() &&
5386 "Expected SortedIndices to be the size of VL");
5387 return true;
5388}
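// Illustrative example: for pointers {A+1, A+0, B+0, B+1} into two distinct
// objects A and B, the pointers are clustered per base and sorted by offset
// within each cluster, so SortedIndices becomes a base-by-base, offset-ordered
// permutation such as {1, 0, 2, 3} (the relative order of the A and B clusters
// is decided by ComparePointers).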
5389
5390std::optional<BoUpSLP::OrdersType>
5391BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5392 assert(TE.isGather() && "Expected gather node only.");
5393 Type *ScalarTy = TE.Scalars[0]->getType();
5394
5395 SmallVector<Value *> Ptrs;
5396 Ptrs.reserve(TE.Scalars.size());
5397 SmallVector<BasicBlock *> BBs;
5398 BBs.reserve(TE.Scalars.size());
5399 for (Value *V : TE.Scalars) {
5400 auto *L = dyn_cast<LoadInst>(V);
5401 if (!L || !L->isSimple())
5402 return std::nullopt;
5403 Ptrs.push_back(L->getPointerOperand());
5404 BBs.push_back(L->getParent());
5405 }
5406
5407 BoUpSLP::OrdersType Order;
5408 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5409 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5410 return std::move(Order);
5411 return std::nullopt;
5412}
5413
5414/// Check if two insertelement instructions are from the same buildvector.
5415 static bool areTwoInsertFromSameBuildVector(
5416 InsertElementInst *VU, InsertElementInst *V,
5417 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5418 // Instructions must be from the same basic block.
5419 if (VU->getParent() != V->getParent())
5420 return false;
5421 // Checks if 2 insertelements are from the same buildvector.
5422 if (VU->getType() != V->getType())
5423 return false;
5424 // Multiple used inserts are separate nodes.
5425 if (!VU->hasOneUse() && !V->hasOneUse())
5426 return false;
5427 auto *IE1 = VU;
5428 auto *IE2 = V;
5429 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5430 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5431 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5432 return false;
5433 // Go through the vector operand of insertelement instructions trying to find
5434 // either VU as the original vector for IE2 or V as the original vector for
5435 // IE1.
5436 SmallBitVector ReusedIdx(
5437 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5438 bool IsReusedIdx = false;
5439 do {
5440 if (IE2 == VU && !IE1)
5441 return VU->hasOneUse();
5442 if (IE1 == V && !IE2)
5443 return V->hasOneUse();
5444 if (IE1 && IE1 != V) {
5445 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5446 IsReusedIdx |= ReusedIdx.test(Idx1);
5447 ReusedIdx.set(Idx1);
5448 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5449 IE1 = nullptr;
5450 else
5451 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5452 }
5453 if (IE2 && IE2 != VU) {
5454 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5455 IsReusedIdx |= ReusedIdx.test(Idx2);
5456 ReusedIdx.set(Idx2);
5457 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5458 IE2 = nullptr;
5459 else
5460 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5461 }
5462 } while (!IsReusedIdx && (IE1 || IE2));
5463 return false;
5464}
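// Illustrative example: in the chain
//   %bv0 = insertelement <2 x float> poison, float %a, i32 0
//   %bv1 = insertelement <2 x float> %bv0, float %b, i32 1
// the two insertelements form one buildvector sequence, so the function returns
// true for them (with GetBaseOperand returning the vector operand), while
// inserts that reuse an already-seen lane index are treated as separate
// buildvectors.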
5465
5466std::optional<BoUpSLP::OrdersType>
5467BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5468 // No need to reorder if we need to shuffle reuses - we still need to shuffle
5469 // the node.
5470 if (!TE.ReuseShuffleIndices.empty()) {
5471 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5472 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5473 "Reshuffling scalars not yet supported for nodes with padding");
5474
5475 if (isSplat(TE.Scalars))
5476 return std::nullopt;
5477 // Check if reuse shuffle indices can be improved by reordering.
5478 // For this, check that reuse mask is "clustered", i.e. each scalar values
5479 // is used once in each submask of size <number_of_scalars>.
5480 // Example: 4 scalar values.
5481 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5482 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5483 // element 3 is used twice in the second submask.
5484 unsigned Sz = TE.Scalars.size();
5485 if (TE.isGather()) {
5486 if (std::optional<OrdersType> CurrentOrder =
5487 findReusedOrderedScalars(TE)) {
5488 SmallVector<int> Mask;
5489 fixupOrderingIndices(*CurrentOrder);
5490 inversePermutation(*CurrentOrder, Mask);
5491 ::addMask(Mask, TE.ReuseShuffleIndices);
5492 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5493 unsigned Sz = TE.Scalars.size();
5494 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5495 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5496 if (Idx != PoisonMaskElem)
5497 Res[Idx + K * Sz] = I + K * Sz;
5498 }
5499 return std::move(Res);
5500 }
5501 }
5502 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5503 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5504 2 * TE.getVectorFactor())) == 1)
5505 return std::nullopt;
5506 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5507 Sz)) {
5508 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5509 if (TE.ReorderIndices.empty())
5510 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5511 else
5512 inversePermutation(TE.ReorderIndices, ReorderMask);
5513 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5514 unsigned VF = ReorderMask.size();
5515 OrdersType ResOrder(VF, VF);
5516 unsigned NumParts = divideCeil(VF, Sz);
5517 SmallBitVector UsedVals(NumParts);
5518 for (unsigned I = 0; I < VF; I += Sz) {
5519 int Val = PoisonMaskElem;
5520 unsigned UndefCnt = 0;
5521 unsigned Limit = std::min(Sz, VF - I);
5522 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5523 [&](int Idx) {
5524 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5525 Val = Idx;
5526 if (Idx == PoisonMaskElem)
5527 ++UndefCnt;
5528 return Idx != PoisonMaskElem && Idx != Val;
5529 }) ||
5530 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5531 UndefCnt > Sz / 2)
5532 return std::nullopt;
5533 UsedVals.set(Val);
5534 for (unsigned K = 0; K < NumParts; ++K) {
5535 unsigned Idx = Val + Sz * K;
5536 if (Idx < VF)
5537 ResOrder[Idx] = I + K;
5538 }
5539 }
5540 return std::move(ResOrder);
5541 }
5542 unsigned VF = TE.getVectorFactor();
5543 // Try build correct order for extractelement instructions.
5544 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5545 TE.ReuseShuffleIndices.end());
5546 if (TE.getOpcode() == Instruction::ExtractElement &&
5547 all_of(TE.Scalars, [Sz](Value *V) {
5548 if (isa<PoisonValue>(V))
5549 return true;
5550 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5551 return Idx && *Idx < Sz;
5552 })) {
5553 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5554 "by BinaryOperator and CastInst.");
5555 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5556 if (TE.ReorderIndices.empty())
5557 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5558 else
5559 inversePermutation(TE.ReorderIndices, ReorderMask);
5560 for (unsigned I = 0; I < VF; ++I) {
5561 int &Idx = ReusedMask[I];
5562 if (Idx == PoisonMaskElem)
5563 continue;
5564 Value *V = TE.Scalars[ReorderMask[Idx]];
5565 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5566 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5567 }
5568 }
5569 // Build the order of the VF size; the reuses shuffles need to be reordered,
5570 // and they are always of VF size.
5571 OrdersType ResOrder(VF);
5572 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5573 auto *It = ResOrder.begin();
5574 for (unsigned K = 0; K < VF; K += Sz) {
5575 OrdersType CurrentOrder(TE.ReorderIndices);
5576 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5577 if (SubMask.front() == PoisonMaskElem)
5578 std::iota(SubMask.begin(), SubMask.end(), 0);
5579 reorderOrder(CurrentOrder, SubMask);
5580 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5581 std::advance(It, Sz);
5582 }
5583 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5584 return Data.index() == Data.value();
5585 }))
5586 return std::nullopt; // No need to reorder.
5587 return std::move(ResOrder);
5588 }
5589 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5590 any_of(TE.UserTreeIndices,
5591 [](const EdgeInfo &EI) {
5592 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5593 }) &&
5594 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5595 return std::nullopt;
5596 if ((TE.State == TreeEntry::Vectorize ||
5597 TE.State == TreeEntry::StridedVectorize) &&
5598 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5599 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5600 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5601 "BinaryOperator and CastInst.");
5602 return TE.ReorderIndices;
5603 }
5604 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5605 if (!TE.ReorderIndices.empty())
5606 return TE.ReorderIndices;
5607
5608 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5609 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5610 if (!V->hasNUsesOrMore(1))
5611 continue;
5612 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5613 if (!II)
5614 continue;
5615 Instruction *BVHead = nullptr;
5616 BasicBlock *BB = II->getParent();
5617 while (II && II->hasOneUse() && II->getParent() == BB) {
5618 BVHead = II;
5619 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5620 }
5621 I = BVHead;
5622 }
5623
5624 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5625 assert(BB1 != BB2 && "Expected different basic blocks.");
5626 auto *NodeA = DT->getNode(BB1);
5627 auto *NodeB = DT->getNode(BB2);
5628 assert(NodeA && "Should only process reachable instructions");
5629 assert(NodeB && "Should only process reachable instructions");
5630 assert((NodeA == NodeB) ==
5631 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5632 "Different nodes should have different DFS numbers");
5633 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5634 };
5635 auto PHICompare = [&](unsigned I1, unsigned I2) {
5636 Value *V1 = TE.Scalars[I1];
5637 Value *V2 = TE.Scalars[I2];
5638 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) ||
5639 isa<PoisonValue>(V1) || isa<PoisonValue>(V2))
5640 return false;
5641 if (V1->getNumUses() < V2->getNumUses())
5642 return true;
5643 if (V1->getNumUses() > V2->getNumUses())
5644 return false;
5645 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5646 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5647 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5648 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5649 FirstUserOfPhi2->getParent());
5650 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5651 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5652 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5653 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5654 if (IE1 && !IE2)
5655 return true;
5656 if (!IE1 && IE2)
5657 return false;
5658 if (IE1 && IE2) {
5659 if (UserBVHead[I1] && !UserBVHead[I2])
5660 return true;
5661 if (!UserBVHead[I1])
5662 return false;
5663 if (UserBVHead[I1] == UserBVHead[I2])
5664 return getElementIndex(IE1) < getElementIndex(IE2);
5665 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5666 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5667 UserBVHead[I2]->getParent());
5668 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5669 }
5670 if (EE1 && !EE2)
5671 return true;
5672 if (!EE1 && EE2)
5673 return false;
5674 if (EE1 && EE2) {
5675 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5676 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5677 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5678 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5679 if (!Inst2 && !P2)
5680 return Inst1 || P1;
5681 if (EE1->getOperand(0) == EE2->getOperand(0))
5682 return getElementIndex(EE1) < getElementIndex(EE2);
5683 if (!Inst1 && Inst2)
5684 return false;
5685 if (Inst1 && Inst2) {
5686 if (Inst1->getParent() != Inst2->getParent())
5687 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5688 return Inst1->comesBefore(Inst2);
5689 }
5690 if (!P1 && P2)
5691 return false;
5692 assert(P1 && P2 &&
5693 "Expected either instructions or arguments vector operands.");
5694 return P1->getArgNo() < P2->getArgNo();
5695 }
5696 return false;
5697 };
5698 OrdersType Phis(TE.Scalars.size());
5699 std::iota(Phis.begin(), Phis.end(), 0);
5700 stable_sort(Phis, PHICompare);
5701 if (isIdentityOrder(Phis))
5702 return std::nullopt; // No need to reorder.
5703 return std::move(Phis);
5704 }
5705 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5706 // TODO: add analysis of other gather nodes with extractelement
5707 // instructions and other values/instructions, not only undefs.
5708 if ((TE.getOpcode() == Instruction::ExtractElement ||
5709 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5710 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5711 all_of(TE.Scalars, [](Value *V) {
5712 auto *EE = dyn_cast<ExtractElementInst>(V);
5713 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5714 })) {
5715 // Check that gather of extractelements can be represented as
5716 // just a shuffle of a single vector.
5717 OrdersType CurrentOrder;
5718 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5719 /*ResizeAllowed=*/true);
5720 if (Reuse || !CurrentOrder.empty())
5721 return std::move(CurrentOrder);
5722 }
5723 // If the gather node is <undef, v, .., poison> and
5724 // insertelement poison, v, 0 [+ permute]
5725 // is cheaper than
5726 // insertelement poison, v, n - try to reorder.
5727 // If rotating the whole graph, exclude the permute cost, the whole graph
5728 // might be transformed.
5729 int Sz = TE.Scalars.size();
5730 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5731 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5732 const auto *It =
5733 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5734 if (It == TE.Scalars.begin())
5735 return OrdersType();
5736 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5737 if (It != TE.Scalars.end()) {
5738 OrdersType Order(Sz, Sz);
5739 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5740 Order[Idx] = 0;
5741 fixupOrderingIndices(Order);
5742 SmallVector<int> Mask;
5743 inversePermutation(Order, Mask);
5744 InstructionCost PermuteCost =
5745 TopToBottom
5746 ? 0
5747 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5748 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5749 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5750 PoisonValue::get(Ty), *It);
5751 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5752 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5753 PoisonValue::get(Ty), *It);
5754 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5755 OrdersType Order(Sz, Sz);
5756 Order[Idx] = 0;
5757 return std::move(Order);
5758 }
5759 }
5760 }
5761 if (isSplat(TE.Scalars))
5762 return std::nullopt;
5763 if (TE.Scalars.size() >= 3)
5764 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5765 return Order;
5766 // Check if we can include the order of vectorized loads. For masked gathers,
5767 // do extra analysis later, so include such nodes into a special list.
5768 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5769 SmallVector<Value *> PointerOps;
5770 OrdersType CurrentOrder;
5771 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5772 CurrentOrder, PointerOps);
5773 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
5774 return std::move(CurrentOrder);
5775 }
5776 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5777 // has been audited for correctness with non-power-of-two vectors.
5778 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5779 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5780 return CurrentOrder;
5781 }
5782 return std::nullopt;
5783}
5784
5785/// Checks if the given mask is a "clustered" mask with the same clusters of
5786/// size \p Sz, which are not identity submasks.
5788 unsigned Sz) {
5789 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5790 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5791 return false;
5792 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5793 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5794 if (Cluster != FirstCluster)
5795 return false;
5796 }
5797 return true;
5798}
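// Illustrative example: with Sz == 4, {1, 0, 3, 2, 1, 0, 3, 2} is a repeated
// non-identity clustered mask, while {0, 1, 2, 3, 0, 1, 2, 3} (identity first
// cluster) and {1, 0, 3, 2, 0, 1, 2, 3} (clusters differ) are not.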
5799
5800void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5801 // Reorder reuses mask.
5802 reorderReuses(TE.ReuseShuffleIndices, Mask);
5803 const unsigned Sz = TE.Scalars.size();
5804 // For vectorized and non-clustered reuses no need to do anything else.
5805 if (!TE.isGather() ||
5806 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5807 Sz) ||
5808 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5809 return;
5810 SmallVector<int> NewMask;
5811 inversePermutation(TE.ReorderIndices, NewMask);
5812 addMask(NewMask, TE.ReuseShuffleIndices);
5813 // Clear reorder since it is going to be applied to the new mask.
5814 TE.ReorderIndices.clear();
5815 // Try to improve gathered nodes with clustered reuses, if possible.
5816 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5817 SmallVector<unsigned> NewOrder(Slice);
5818 inversePermutation(NewOrder, NewMask);
5819 reorderScalars(TE.Scalars, NewMask);
5820 // Fill the reuses mask with the identity submasks.
5821 for (auto *It = TE.ReuseShuffleIndices.begin(),
5822 *End = TE.ReuseShuffleIndices.end();
5823 It != End; std::advance(It, Sz))
5824 std::iota(It, std::next(It, Sz), 0);
5825}
5826
5827 static void combineOrders(MutableArrayRef<unsigned> Order,
5828 ArrayRef<unsigned> SecondaryOrder) {
5829 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5830 "Expected same size of orders");
5831 unsigned Sz = Order.size();
5832 SmallBitVector UsedIndices(Sz);
5833 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5834 if (Order[Idx] != Sz)
5835 UsedIndices.set(Order[Idx]);
5836 }
5837 if (SecondaryOrder.empty()) {
5838 for (unsigned Idx : seq<unsigned>(0, Sz))
5839 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5840 Order[Idx] = Idx;
5841 } else {
5842 for (unsigned Idx : seq<unsigned>(0, Sz))
5843 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5844 !UsedIndices.test(SecondaryOrder[Idx]))
5845 Order[Idx] = SecondaryOrder[Idx];
5846 }
5847}
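// Illustrative example: with Order = {4, 1, 2, 4} (4 == Order.size() marks an
// unset slot) and an empty SecondaryOrder, unset slots whose own index is not
// yet used as a value are filled with the identity, giving {0, 1, 2, 3}.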
5848
5849 void BoUpSLP::reorderTopToBottom() {
5850 // Maps VF to the graph nodes.
5851 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5852 // ExtractElement gather nodes which can be vectorized and need to handle
5853 // their ordering.
5854 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5855
5856 // Phi nodes can have preferred ordering based on their result users.
5857 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5858
5859 // AltShuffles can also have a preferred ordering that leads to fewer
5860 // instructions, e.g., the addsub instruction in x86.
5861 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5862
5863 // Maps a TreeEntry to the reorder indices of external users.
5864 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5865 ExternalUserReorderMap;
5866 // Find all reorderable nodes with the given VF.
5867 // Currently these are vectorized stores, loads, extracts + some gathering
5868 // of extracts.
5869 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5870 const std::unique_ptr<TreeEntry> &TE) {
5871 // Look for external users that will probably be vectorized.
5872 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5873 findExternalStoreUsersReorderIndices(TE.get());
5874 if (!ExternalUserReorderIndices.empty()) {
5875 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5876 ExternalUserReorderMap.try_emplace(TE.get(),
5877 std::move(ExternalUserReorderIndices));
5878 }
5879
5880 // Patterns like [fadd,fsub] can be combined into a single instruction in
5881 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5882 // to take into account their order when looking for the most used order.
5883 if (TE->isAltShuffle()) {
5884 VectorType *VecTy =
5885 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5886 unsigned Opcode0 = TE->getOpcode();
5887 unsigned Opcode1 = TE->getAltOpcode();
5888 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5889 // If this pattern is supported by the target then we consider the order.
5890 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5891 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5892 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5893 }
5894 // TODO: Check the reverse order too.
5895 }
5896
5897 if (std::optional<OrdersType> CurrentOrder =
5898 getReorderingData(*TE, /*TopToBottom=*/true)) {
5899 // Do not include ordering for nodes used in the alt opcode vectorization,
5900 // better to reorder them during the bottom-to-top stage. If we follow the
5901 // order here, it causes reordering of the whole graph, though actually it
5902 // is profitable just to reorder the subgraph that starts from the alternate
5903 // opcode vectorization node. Such nodes already end up with a shuffle
5904 // instruction and it is enough to change this shuffle rather than
5905 // rotate the scalars for the whole graph.
5906 unsigned Cnt = 0;
5907 const TreeEntry *UserTE = TE.get();
5908 while (UserTE && Cnt < RecursionMaxDepth) {
5909 if (UserTE->UserTreeIndices.size() != 1)
5910 break;
5911 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5912 return EI.UserTE->State == TreeEntry::Vectorize &&
5913 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5914 }))
5915 return;
5916 UserTE = UserTE->UserTreeIndices.back().UserTE;
5917 ++Cnt;
5918 }
5919 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5920 if (!(TE->State == TreeEntry::Vectorize ||
5921 TE->State == TreeEntry::StridedVectorize) ||
5922 !TE->ReuseShuffleIndices.empty())
5923 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5924 if (TE->State == TreeEntry::Vectorize &&
5925 TE->getOpcode() == Instruction::PHI)
5926 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5927 }
5928 });
5929
5930 // Reorder the graph nodes according to their vectorization factor.
5931 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5932 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5933 auto It = VFToOrderedEntries.find(VF);
5934 if (It == VFToOrderedEntries.end())
5935 continue;
5936 // Try to find the most profitable order. We are just looking for the most
5937 // used order and reorder the scalar elements in the nodes according to this
5938 // most used order.
5939 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5940 // Delete VF entry upon exit.
5941 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5942
5943 // All operands are reordered and used only in this node - propagate the
5944 // most used order to the user node.
5947 OrdersUses;
5949 for (const TreeEntry *OpTE : OrderedEntries) {
5950 // No need to reorder these nodes; we still need to extend and to use a
5951 // shuffle, just need to merge the reordering shuffle and the reuse shuffle.
5952 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5953 continue;
5954 // Count number of orders uses.
5955 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5956 &PhisToOrders]() -> const OrdersType & {
5957 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5958 auto It = GathersToOrders.find(OpTE);
5959 if (It != GathersToOrders.end())
5960 return It->second;
5961 }
5962 if (OpTE->isAltShuffle()) {
5963 auto It = AltShufflesToOrders.find(OpTE);
5964 if (It != AltShufflesToOrders.end())
5965 return It->second;
5966 }
5967 if (OpTE->State == TreeEntry::Vectorize &&
5968 OpTE->getOpcode() == Instruction::PHI) {
5969 auto It = PhisToOrders.find(OpTE);
5970 if (It != PhisToOrders.end())
5971 return It->second;
5972 }
5973 return OpTE->ReorderIndices;
5974 }();
5975 // First consider the order of the external scalar users.
5976 auto It = ExternalUserReorderMap.find(OpTE);
5977 if (It != ExternalUserReorderMap.end()) {
5978 const auto &ExternalUserReorderIndices = It->second;
5979 // If the OpTE vector factor != number of scalars - use natural order,
5980 // it is an attempt to reorder node with reused scalars but with
5981 // external uses.
5982 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5983 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5984 ExternalUserReorderIndices.size();
5985 } else {
5986 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5987 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5988 }
5989 // No other useful reorder data in this entry.
5990 if (Order.empty())
5991 continue;
5992 }
5993 // Stores actually store the mask, not the order, need to invert.
5994 if (OpTE->State == TreeEntry::Vectorize &&
5995 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5996 assert(!OpTE->isAltShuffle() &&
5997 "Alternate instructions are only supported by BinaryOperator "
5998 "and CastInst.");
5999 SmallVector<int> Mask;
6000 inversePermutation(Order, Mask);
6001 unsigned E = Order.size();
6002 OrdersType CurrentOrder(E, E);
6003 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6004 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6005 });
6006 fixupOrderingIndices(CurrentOrder);
6007 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6008 } else {
6009 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6010 }
6011 }
6012 if (OrdersUses.empty())
6013 continue;
6014 // Choose the most used order.
6015 unsigned IdentityCnt = 0;
6016 unsigned FilledIdentityCnt = 0;
6017 OrdersType IdentityOrder(VF, VF);
6018 for (auto &Pair : OrdersUses) {
6019 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6020 if (!Pair.first.empty())
6021 FilledIdentityCnt += Pair.second;
6022 IdentityCnt += Pair.second;
6023 combineOrders(IdentityOrder, Pair.first);
6024 }
6025 }
6026 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6027 unsigned Cnt = IdentityCnt;
6028 for (auto &Pair : OrdersUses) {
6029 // Prefer the identity order. But if a filled identity (non-empty order) was
6030 // found with the same number of uses as the new candidate order, we can
6031 // choose this candidate order.
6032 if (Cnt < Pair.second ||
6033 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6034 Cnt == Pair.second && !BestOrder.empty() &&
6035 isIdentityOrder(BestOrder))) {
6036 combineOrders(Pair.first, BestOrder);
6037 BestOrder = Pair.first;
6038 Cnt = Pair.second;
6039 } else {
6040 combineOrders(BestOrder, Pair.first);
6041 }
6042 }
6043 // Set order of the user node.
6044 if (isIdentityOrder(BestOrder))
6045 continue;
6046 fixupOrderingIndices(BestOrder);
6047 SmallVector<int> Mask;
6048 inversePermutation(BestOrder, Mask);
6049 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6050 unsigned E = BestOrder.size();
6051 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6052 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6053 });
6054 // Do an actual reordering, if profitable.
6055 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6056 // Just do the reordering for the nodes with the given VF.
6057 if (TE->Scalars.size() != VF) {
6058 if (TE->ReuseShuffleIndices.size() == VF) {
6059 // Need to reorder the reuses masks of the operands with smaller VF to
6060 // be able to find the match between the graph nodes and scalar
6061 // operands of the given node during vectorization/cost estimation.
6062 assert(all_of(TE->UserTreeIndices,
6063 [VF, &TE](const EdgeInfo &EI) {
6064 return EI.UserTE->Scalars.size() == VF ||
6065 EI.UserTE->Scalars.size() ==
6066 TE->Scalars.size();
6067 }) &&
6068 "All users must be of VF size.");
6069 if (SLPReVec) {
6070 assert(SLPReVec && "Only supported by REVEC.");
6071 // ShuffleVectorInst does not do reorderOperands (and it should not
6072 // because ShuffleVectorInst supports only a limited set of
6073 // patterns). Only do reorderNodeWithReuses if all of the users are
6074 // not ShuffleVectorInst.
6075 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6076 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6077 }))
6078 continue;
6079 assert(none_of(TE->UserTreeIndices,
6080 [&](const EdgeInfo &EI) {
6081 return isa<ShuffleVectorInst>(
6082 EI.UserTE->getMainOp());
6083 }) &&
6084 "Does not know how to reorder.");
6085 }
6086 // Update ordering of the operands with the smaller VF than the given
6087 // one.
6088 reorderNodeWithReuses(*TE, Mask);
6089 }
6090 continue;
6091 }
6092 if ((TE->State == TreeEntry::Vectorize ||
6093 TE->State == TreeEntry::StridedVectorize) &&
6094 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
6095 InsertElementInst>(TE->getMainOp()) ||
6096 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6097 assert(!TE->isAltShuffle() &&
6098 "Alternate instructions are only supported by BinaryOperator "
6099 "and CastInst.");
6100 // Build correct orders for extract{element,value}, loads and
6101 // stores.
6102 reorderOrder(TE->ReorderIndices, Mask);
6103 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6104 TE->reorderOperands(Mask);
6105 } else {
6106 // Reorder the node and its operands.
6107 TE->reorderOperands(Mask);
6108 assert(TE->ReorderIndices.empty() &&
6109 "Expected empty reorder sequence.");
6110 reorderScalars(TE->Scalars, Mask);
6111 }
6112 if (!TE->ReuseShuffleIndices.empty()) {
6113 // Apply reversed order to keep the original ordering of the reused
6114 // elements to avoid extra reorder indices shuffling.
6115 OrdersType CurrentOrder;
6116 reorderOrder(CurrentOrder, MaskOrder);
6117 SmallVector<int> NewReuses;
6118 inversePermutation(CurrentOrder, NewReuses);
6119 addMask(NewReuses, TE->ReuseShuffleIndices);
6120 TE->ReuseShuffleIndices.swap(NewReuses);
6121 }
6122 }
6123 }
6124}
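// A minimal standalone sketch (not part of the pass) of the order/mask
// convention used by reorderTopToBottom() above, assuming the scatter
// semantics New[Mask[I]] = Old[I] used by the reorder helpers above. The
// helper names below are hypothetical and exist only for this illustration.
static SmallVector<int> sketchInverseOrder(ArrayRef<unsigned> Order) {
  // Mirrors inversePermutation(): Mask[Order[I]] = I, poison (-1) elsewhere.
  SmallVector<int> Mask(Order.size(), -1);
  for (unsigned I = 0, E = Order.size(); I < E; ++I)
    Mask[Order[I]] = I;
  return Mask;
}
static SmallVector<int> sketchScatterByMask(ArrayRef<int> Scalars,
                                            ArrayRef<int> Mask) {
  // Element I of the input moves to position Mask[I] of the result.
  SmallVector<int> Res(Scalars.size(), 0);
  for (unsigned I = 0, E = Scalars.size(); I < E; ++I)
    if (Mask[I] != -1)
      Res[Mask[I]] = Scalars[I];
  return Res;
}
// For Order = {2, 0, 1} and Scalars = {10, 20, 30} the mask is {1, 2, 0} and
// the reordered scalars are {30, 10, 20}, i.e. Res[I] == Scalars[Order[I]].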
6125
6126bool BoUpSLP::canReorderOperands(
6127 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6128 ArrayRef<TreeEntry *> ReorderableGathers,
6129 SmallVectorImpl<TreeEntry *> &GatherOps) {
6130 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6131 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6132 return OpData.first == I &&
6133 (OpData.second->State == TreeEntry::Vectorize ||
6134 OpData.second->State == TreeEntry::StridedVectorize);
6135 }))
6136 continue;
6137 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6138 // Do not reorder if operand node is used by many user nodes.
6139 if (any_of(TE->UserTreeIndices,
6140 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6141 return false;
6142 // Add the node to the list of the ordered nodes with the identity
6143 // order.
6144 Edges.emplace_back(I, TE);
6145 // Add ScatterVectorize nodes to the list of operands, where just
6146 // reordering of the scalars is required. Similar to the gathers, so
6147 // simply add to the list of gathered ops.
6148 // If there are reused scalars, process this node as a regular vectorize
6149 // node, just reorder reuses mask.
6150 if (TE->State != TreeEntry::Vectorize &&
6151 TE->State != TreeEntry::StridedVectorize &&
6152 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6153 GatherOps.push_back(TE);
6154 continue;
6155 }
6156 TreeEntry *Gather = nullptr;
6157 if (count_if(ReorderableGathers,
6158 [&Gather, UserTE, I](TreeEntry *TE) {
6159 assert(TE->State != TreeEntry::Vectorize &&
6160 TE->State != TreeEntry::StridedVectorize &&
6161 "Only non-vectorized nodes are expected.");
6162 if (any_of(TE->UserTreeIndices,
6163 [UserTE, I](const EdgeInfo &EI) {
6164 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6165 })) {
6166 assert(TE->isSame(UserTE->getOperand(I)) &&
6167 "Operand entry does not match operands.");
6168 Gather = TE;
6169 return true;
6170 }
6171 return false;
6172 }) > 1 &&
6173 !allConstant(UserTE->getOperand(I)))
6174 return false;
6175 if (Gather)
6176 GatherOps.push_back(Gather);
6177 }
6178 return true;
6179}
6180
6181void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6182 SetVector<TreeEntry *> OrderedEntries;
6183 DenseSet<const TreeEntry *> GathersToOrders;
6184 // Find all reorderable leaf nodes with the given VF.
6185 // Currently these are vectorized loads and extracts without alternate
6186 // operands, plus some gathering of extracts.
6187 SmallVector<TreeEntry *> NonVectorized;
6188 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6189 if (TE->State != TreeEntry::Vectorize &&
6190 TE->State != TreeEntry::StridedVectorize)
6191 NonVectorized.push_back(TE.get());
6192 if (std::optional<OrdersType> CurrentOrder =
6193 getReorderingData(*TE, /*TopToBottom=*/false)) {
6194 OrderedEntries.insert(TE.get());
6195 if (!(TE->State == TreeEntry::Vectorize ||
6196 TE->State == TreeEntry::StridedVectorize) ||
6197 !TE->ReuseShuffleIndices.empty())
6198 GathersToOrders.insert(TE.get());
6199 }
6200 }
6201
6202 // 1. Propagate order to the graph nodes, which use only reordered nodes.
6203 // I.e., if the node has operands that are reordered, try to keep at least
6204 // one operand order in the natural order and reorder the others, plus
6205 // reorder the user node itself.
6206 SmallPtrSet<const TreeEntry *, 4> Visited;
6207 while (!OrderedEntries.empty()) {
6208 // 1. Filter out only reordered nodes.
6209 // 2. If the entry has multiple uses - skip it and jump to the next node.
6210 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
6211 SmallVector<TreeEntry *> Filtered;
6212 for (TreeEntry *TE : OrderedEntries) {
6213 if (!(TE->State == TreeEntry::Vectorize ||
6214 TE->State == TreeEntry::StridedVectorize ||
6215 (TE->isGather() && GathersToOrders.contains(TE))) ||
6216 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6217 !all_of(drop_begin(TE->UserTreeIndices),
6218 [TE](const EdgeInfo &EI) {
6219 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6220 }) ||
6221 !Visited.insert(TE).second) {
6222 Filtered.push_back(TE);
6223 continue;
6224 }
6225 // Build a map between user nodes and their operands order to speedup
6226 // search. The graph currently does not provide this dependency directly.
6227 for (EdgeInfo &EI : TE->UserTreeIndices)
6228 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6229 }
6230 // Erase filtered entries.
6231 for (TreeEntry *TE : Filtered)
6232 OrderedEntries.remove(TE);
6233 SmallVector<
6234 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6235 UsersVec(Users.begin(), Users.end());
6236 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6237 return Data1.first->Idx > Data2.first->Idx;
6238 });
6239 for (auto &Data : UsersVec) {
6240 // Check that operands are used only in the User node.
6241 SmallVector<TreeEntry *> GatherOps;
6242 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6243 GatherOps)) {
6244 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6245 OrderedEntries.remove(Op.second);
6246 continue;
6247 }
6248 // All operands are reordered and used only in this node - propagate the
6249 // most used order to the user node.
6250 MapVector<OrdersType, unsigned,
6251 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6252 OrdersUses;
6253 // Do the analysis for each tree entry only once, otherwise the order of
6254 // the same node may be considered several times, though it might not be
6255 // profitable.
6256 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6257 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6258 for (const auto &Op : Data.second) {
6259 TreeEntry *OpTE = Op.second;
6260 if (!VisitedOps.insert(OpTE).second)
6261 continue;
6262 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6263 continue;
6264 const auto Order = [&]() -> const OrdersType {
6265 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6266 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6267 .value_or(OrdersType(1));
6268 return OpTE->ReorderIndices;
6269 }();
6270 // The order is partially ordered, skip it in favor of fully non-ordered
6271 // orders.
6272 if (Order.size() == 1)
6273 continue;
6274 unsigned NumOps = count_if(
6275 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6276 return P.second == OpTE;
6277 });
6278 // Stores actually store the mask, not the order, need to invert.
6279 if (OpTE->State == TreeEntry::Vectorize &&
6280 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6281 assert(!OpTE->isAltShuffle() &&
6282 "Alternate instructions are only supported by BinaryOperator "
6283 "and CastInst.");
6284 SmallVector<int> Mask;
6285 inversePermutation(Order, Mask);
6286 unsigned E = Order.size();
6287 OrdersType CurrentOrder(E, E);
6288 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6289 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6290 });
6291 fixupOrderingIndices(CurrentOrder);
6292 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6293 NumOps;
6294 } else {
6295 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6296 }
6297 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6298 const auto AllowsReordering = [&](const TreeEntry *TE) {
6299 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6300 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6301 (IgnoreReorder && TE->Idx == 0))
6302 return true;
6303 if (TE->isGather()) {
6304 if (GathersToOrders.contains(TE))
6305 return !getReorderingData(*TE, /*TopToBottom=*/false)
6306 .value_or(OrdersType(1))
6307 .empty();
6308 return true;
6309 }
6310 return false;
6311 };
6312 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6313 TreeEntry *UserTE = EI.UserTE;
6314 if (!VisitedUsers.insert(UserTE).second)
6315 continue;
6316 // May reorder user node if it requires reordering, has reused
6317 // scalars, is an alternate op vectorize node or its op nodes require
6318 // reordering.
6319 if (AllowsReordering(UserTE))
6320 continue;
6321 // Check if users allow reordering.
6322 // Currently look up just 1 level of operands to avoid an increase in
6323 // compile time.
6324 // It is profitable to reorder if definitely more operands allow
6325 // reordering than keep the natural order.
6326 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
6327 if (static_cast<unsigned>(count_if(
6328 Ops, [UserTE, &AllowsReordering](
6329 const std::pair<unsigned, TreeEntry *> &Op) {
6330 return AllowsReordering(Op.second) &&
6331 all_of(Op.second->UserTreeIndices,
6332 [UserTE](const EdgeInfo &EI) {
6333 return EI.UserTE == UserTE;
6334 });
6335 })) <= Ops.size() / 2)
6336 ++Res.first->second;
6337 }
6338 }
6339 if (OrdersUses.empty()) {
6340 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6341 OrderedEntries.remove(Op.second);
6342 continue;
6343 }
6344 // Choose the most used order.
6345 unsigned IdentityCnt = 0;
6346 unsigned VF = Data.second.front().second->getVectorFactor();
6347 OrdersType IdentityOrder(VF, VF);
6348 for (auto &Pair : OrdersUses) {
6349 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6350 IdentityCnt += Pair.second;
6351 combineOrders(IdentityOrder, Pair.first);
6352 }
6353 }
6354 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6355 unsigned Cnt = IdentityCnt;
6356 for (auto &Pair : OrdersUses) {
6357 // Prefer the identity order. But if a filled identity (non-empty order)
6358 // was found with the same number of uses as the new candidate order, we
6359 // can choose this candidate order.
6360 if (Cnt < Pair.second) {
6361 combineOrders(Pair.first, BestOrder);
6362 BestOrder = Pair.first;
6363 Cnt = Pair.second;
6364 } else {
6365 combineOrders(BestOrder, Pair.first);
6366 }
6367 }
6368 // Set order of the user node.
6369 if (isIdentityOrder(BestOrder)) {
6370 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6371 OrderedEntries.remove(Op.second);
6372 continue;
6373 }
6374 fixupOrderingIndices(BestOrder);
6375 // Erase operands from OrderedEntries list and adjust their orders.
6376 VisitedOps.clear();
6377 SmallVector<int> Mask;
6378 inversePermutation(BestOrder, Mask);
6379 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6380 unsigned E = BestOrder.size();
6381 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6382 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6383 });
6384 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6385 TreeEntry *TE = Op.second;
6386 OrderedEntries.remove(TE);
6387 if (!VisitedOps.insert(TE).second)
6388 continue;
6389 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6390 reorderNodeWithReuses(*TE, Mask);
6391 continue;
6392 }
6393 // Gathers are processed separately.
6394 if (TE->State != TreeEntry::Vectorize &&
6395 TE->State != TreeEntry::StridedVectorize &&
6396 (TE->State != TreeEntry::ScatterVectorize ||
6397 TE->ReorderIndices.empty()))
6398 continue;
6399 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6400 TE->ReorderIndices.empty()) &&
6401 "Non-matching sizes of user/operand entries.");
6402 reorderOrder(TE->ReorderIndices, Mask);
6403 if (IgnoreReorder && TE == VectorizableTree.front().get())
6404 IgnoreReorder = false;
6405 }
6406 // For gathers just need to reorder its scalars.
6407 for (TreeEntry *Gather : GatherOps) {
6408 assert(Gather->ReorderIndices.empty() &&
6409 "Unexpected reordering of gathers.");
6410 if (!Gather->ReuseShuffleIndices.empty()) {
6411 // Just reorder reuses indices.
6412 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6413 continue;
6414 }
6415 reorderScalars(Gather->Scalars, Mask);
6416 OrderedEntries.remove(Gather);
6417 }
6418 // Reorder operands of the user node and set the ordering for the user
6419 // node itself.
6420 if (Data.first->State != TreeEntry::Vectorize ||
6421 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6422 Data.first->getMainOp()) ||
6423 Data.first->isAltShuffle())
6424 Data.first->reorderOperands(Mask);
6425 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6426 Data.first->isAltShuffle() ||
6427 Data.first->State == TreeEntry::StridedVectorize) {
6428 reorderScalars(Data.first->Scalars, Mask);
6429 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6430 /*BottomOrder=*/true);
6431 if (Data.first->ReuseShuffleIndices.empty() &&
6432 !Data.first->ReorderIndices.empty() &&
6433 !Data.first->isAltShuffle()) {
6434 // Insert user node to the list to try to sink reordering deeper in
6435 // the graph.
6436 OrderedEntries.insert(Data.first);
6437 }
6438 } else {
6439 reorderOrder(Data.first->ReorderIndices, Mask);
6440 }
6441 }
6442 }
6443 // If the reordering is unnecessary, just remove the reorder.
6444 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6445 VectorizableTree.front()->ReuseShuffleIndices.empty())
6446 VectorizableTree.front()->ReorderIndices.clear();
6447}
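// A minimal standalone sketch (not part of the pass) of the "most used order"
// vote performed by the two reorder passes above, assuming an empty order
// stands for the identity permutation. The function name is hypothetical.
static SmallVector<unsigned>
sketchPickMostUsedOrder(ArrayRef<SmallVector<unsigned>> RequestedOrders) {
  // Count how many operand nodes request each distinct order.
  SmallVector<std::pair<SmallVector<unsigned>, unsigned>> Uses;
  for (const SmallVector<unsigned> &O : RequestedOrders) {
    auto *It = find_if(Uses, [&](const auto &P) { return P.first == O; });
    if (It == Uses.end())
      Uses.emplace_back(O, 1);
    else
      ++It->second;
  }
  // Prefer the identity: a non-identity candidate must be requested strictly
  // more often than the identity to win.
  SmallVector<unsigned> Best;
  unsigned BestCnt = 0;
  for (const auto &P : Uses)
    if (P.first.empty())
      BestCnt += P.second;
  for (const auto &P : Uses)
    if (!P.first.empty() && P.second > BestCnt) {
      Best = P.first;
      BestCnt = P.second;
    }
  return Best; // An empty result means "keep the natural order".
}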
6448
6449Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6450 if ((Entry.getOpcode() == Instruction::Store ||
6451 Entry.getOpcode() == Instruction::Load) &&
6452 Entry.State == TreeEntry::StridedVectorize &&
6453 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6454 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6455 return dyn_cast<Instruction>(Entry.Scalars.front());
6456}
6457
6458 void BoUpSLP::buildExternalUses(
6459 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6460 DenseMap<Value *, unsigned> ScalarToExtUses;
6461 // Collect the values that we need to extract from the tree.
6462 for (auto &TEPtr : VectorizableTree) {
6463 TreeEntry *Entry = TEPtr.get();
6464
6465 // No need to handle users of gathered values.
6466 if (Entry->isGather())
6467 continue;
6468
6469 // For each lane:
6470 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6471 Value *Scalar = Entry->Scalars[Lane];
6472 if (!isa<Instruction>(Scalar))
6473 continue;
6474 // All uses must be replaced already? No need to do it again.
6475 auto It = ScalarToExtUses.find(Scalar);
6476 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6477 continue;
6478
6479 // Check if the scalar is externally used as an extra arg.
6480 const auto ExtI = ExternallyUsedValues.find(Scalar);
6481 if (ExtI != ExternallyUsedValues.end()) {
6482 int FoundLane = Entry->findLaneForValue(Scalar);
6483 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6484 << FoundLane << " from " << *Scalar << ".\n");
6485 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6486 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6487 continue;
6488 }
6489 for (User *U : Scalar->users()) {
6490 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6491
6492 Instruction *UserInst = dyn_cast<Instruction>(U);
6493 if (!UserInst || isDeleted(UserInst))
6494 continue;
6495
6496 // Ignore users in the user ignore list.
6497 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6498 continue;
6499
6500 // Skip in-tree scalars that become vectors
6501 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6502 // Some in-tree scalars will remain as scalar in vectorized
6503 // instructions. If that is the case, the one in FoundLane will
6504 // be used.
6505 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6506 !doesInTreeUserNeedToExtract(
6507 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6508 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6509 << ".\n");
6510 assert(!UseEntry->isGather() && "Bad state");
6511 continue;
6512 }
6513 U = nullptr;
6514 if (It != ScalarToExtUses.end()) {
6515 ExternalUses[It->second].User = nullptr;
6516 break;
6517 }
6518 }
6519
6520 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6521 U = nullptr;
6522 int FoundLane = Entry->findLaneForValue(Scalar);
6523 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6524 << " from lane " << FoundLane << " from " << *Scalar
6525 << ".\n");
6526 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6527 ExternalUses.emplace_back(Scalar, U, FoundLane);
6528 if (!U)
6529 break;
6530 }
6531 }
6532 }
6533}
6534
6535 SmallVector<SmallVector<StoreInst *>>
6536 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6537 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6538 SmallVector<StoreInst *>, 8>
6539 PtrToStoresMap;
6540 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6541 Value *V = TE->Scalars[Lane];
6542 // Don't iterate over the users of constant data.
6543 if (!isa<Instruction>(V))
6544 continue;
6545 // To save compilation time we don't visit if we have too many users.
6546 if (V->hasNUsesOrMore(UsesLimit))
6547 break;
6548
6549 // Collect stores per pointer object.
6550 for (User *U : V->users()) {
6551 auto *SI = dyn_cast<StoreInst>(U);
6552 // Test whether we can handle the store. V might be a global, which could
6553 // be used in a different function.
6554 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6555 !isValidElementType(SI->getValueOperand()->getType()))
6556 continue;
6557 // Skip the store if it is already part of the tree.
6558 if (getTreeEntry(U))
6559 continue;
6560
6561 Value *Ptr =
6562 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6563 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6564 SI->getValueOperand()->getType(), Ptr}];
6565 // For now just keep one store per pointer object per lane.
6566 // TODO: Extend this to support multiple stores per pointer per lane
6567 if (StoresVec.size() > Lane)
6568 continue;
6569 if (!StoresVec.empty()) {
6570 std::optional<int> Diff = getPointersDiff(
6571 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6572 SI->getValueOperand()->getType(),
6573 StoresVec.front()->getPointerOperand(), *DL, *SE,
6574 /*StrictCheck=*/true);
6575 // We failed to compare the pointers so just abandon this store.
6576 if (!Diff)
6577 continue;
6578 }
6579 StoresVec.push_back(SI);
6580 }
6581 }
6582 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6583 unsigned I = 0;
6584 for (auto &P : PtrToStoresMap) {
6585 Res[I].swap(P.second);
6586 ++I;
6587 }
6588 return Res;
6589}
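// A minimal standalone sketch (not part of the pass) of the grouping done by
// collectUserStores() above: user stores are bucketed by (parent block, stored
// type, underlying base pointer) before any consecutiveness check. Plain
// unsigned ids stand in for the IR entities; all names are hypothetical.
struct SketchUserStore {
  unsigned Block;  // stand-in for the store's parent basic block
  unsigned TypeId; // stand-in for the stored value type
  unsigned Base;   // stand-in for getUnderlyingObject() of the pointer operand
};
static SmallVector<SmallVector<unsigned>>
sketchGroupUserStores(ArrayRef<SketchUserStore> Stores) {
  MapVector<std::tuple<unsigned, unsigned, unsigned>, SmallVector<unsigned>>
      Groups;
  for (auto [Idx, S] : enumerate(Stores))
    Groups[{S.Block, S.TypeId, S.Base}].push_back(Idx);
  SmallVector<SmallVector<unsigned>> Res;
  for (auto &G : Groups)
    Res.push_back(std::move(G.second));
  return Res; // One candidate store vector per distinct key.
}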
6590
6591bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6592 OrdersType &ReorderIndices) const {
6593 // We check whether the stores in StoresVec can form a vector by sorting
6594 // them and checking whether they are consecutive.
6595
6596 // To avoid calling getPointersDiff() while sorting we create a vector of
6597 // pairs {store, offset from first} and sort this instead.
6598 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6599 StoreInst *S0 = StoresVec[0];
6600 StoreOffsetVec.emplace_back(0, 0);
6601 Type *S0Ty = S0->getValueOperand()->getType();
6602 Value *S0Ptr = S0->getPointerOperand();
6603 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6604 StoreInst *SI = StoresVec[Idx];
6605 std::optional<int> Diff =
6606 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6607 SI->getPointerOperand(), *DL, *SE,
6608 /*StrictCheck=*/true);
6609 StoreOffsetVec.emplace_back(*Diff, Idx);
6610 }
6611
6612 // Check if the stores are consecutive by checking if their difference is 1.
6613 if (StoreOffsetVec.size() != StoresVec.size())
6614 return false;
6615 sort(StoreOffsetVec,
6616 [](const std::pair<int, unsigned> &L,
6617 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6618 unsigned Idx = 0;
6619 int PrevDist = 0;
6620 for (const auto &P : StoreOffsetVec) {
6621 if (Idx > 0 && P.first != PrevDist + 1)
6622 return false;
6623 PrevDist = P.first;
6624 ++Idx;
6625 }
6626
6627 // Calculate the shuffle indices according to their offset against the sorted
6628 // StoreOffsetVec.
6629 ReorderIndices.assign(StoresVec.size(), 0);
6630 bool IsIdentity = true;
6631 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6632 ReorderIndices[P.second] = I;
6633 IsIdentity &= P.second == I;
6634 }
6635 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6636 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6637 // same convention here.
6638 if (IsIdentity)
6639 ReorderIndices.clear();
6640
6641 return true;
6642}
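// A minimal standalone sketch (not part of the pass) of how canFormVector()
// turns per-store offsets into ReorderIndices. The offsets are distances from
// the first collected store; the function name is hypothetical.
static bool sketchOffsetsToOrder(ArrayRef<int> Offsets,
                                 SmallVectorImpl<unsigned> &ReorderIndices) {
  // Pair each offset with its original position and sort by offset.
  SmallVector<std::pair<int, unsigned>> OffIdx;
  for (auto [Idx, Off] : enumerate(Offsets))
    OffIdx.emplace_back(Off, Idx);
  sort(OffIdx, [](const std::pair<int, unsigned> &L,
                  const std::pair<int, unsigned> &R) {
    return L.first < R.first;
  });
  // The stores are consecutive only if the sorted offsets differ by exactly 1.
  for (unsigned I = 1, E = OffIdx.size(); I < E; ++I)
    if (OffIdx[I].first != OffIdx[I - 1].first + 1)
      return false;
  // ReorderIndices[original position] = lane in the final vector store.
  ReorderIndices.assign(Offsets.size(), 0);
  for (auto [Lane, P] : enumerate(OffIdx))
    ReorderIndices[P.second] = Lane;
  return true;
}
// E.g. offsets {3, 1, 0, 2} (program order) give ReorderIndices {3, 1, 0, 2},
// while already sorted offsets {0, 1, 2, 3} give the identity order, which
// canFormVector() models as an empty ReorderIndices.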
6643
6644#ifndef NDEBUG
6645 static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6646 for (unsigned Idx : Order)
6647 dbgs() << Idx << ", ";
6648 dbgs() << "\n";
6649}
6650#endif
6651
6652 SmallVector<BoUpSLP::OrdersType, 1>
6653 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6654 unsigned NumLanes = TE->Scalars.size();
6655
6656 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6657
6658 // Holds the reorder indices for each candidate store vector that is a user of
6659 // the current TreeEntry.
6660 SmallVector<OrdersType, 1> ExternalReorderIndices;
6661
6662 // Now inspect the stores collected per pointer and look for vectorization
6663 // candidates. For each candidate calculate the reorder index vector and push
6664 // it into `ExternalReorderIndices`
6665 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6666 // If we have fewer than NumLanes stores, then we can't form a vector.
6667 if (StoresVec.size() != NumLanes)
6668 continue;
6669
6670 // If the stores are not consecutive then abandon this StoresVec.
6671 OrdersType ReorderIndices;
6672 if (!canFormVector(StoresVec, ReorderIndices))
6673 continue;
6674
6675 // We now know that the scalars in StoresVec can form a vector instruction,
6676 // so set the reorder indices.
6677 ExternalReorderIndices.push_back(ReorderIndices);
6678 }
6679 return ExternalReorderIndices;
6680}
6681
6682 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6683 const SmallDenseSet<Value *> &UserIgnoreLst) {
6684 deleteTree();
6685 UserIgnoreList = &UserIgnoreLst;
6686 if (!allSameType(Roots))
6687 return;
6688 buildTree_rec(Roots, 0, EdgeInfo());
6689}
6690
6691 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6692 deleteTree();
6693 if (!allSameType(Roots))
6694 return;
6695 buildTree_rec(Roots, 0, EdgeInfo());
6696}
6697
6698 /// Tries to find a subvector of loads and builds a new vector of only loads
6699 /// if it can be profitable.
6700 static void gatherPossiblyVectorizableLoads(
6701 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6702 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6703 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6704 bool AddNew = true) {
6705 if (VL.empty())
6706 return;
6707 Type *ScalarTy = getValueType(VL.front());
6708 if (!isValidElementType(ScalarTy))
6709 return;
6710 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6711 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6712 for (Value *V : VL) {
6713 auto *LI = dyn_cast<LoadInst>(V);
6714 if (!LI)
6715 continue;
6716 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6717 continue;
6718 bool IsFound = false;
6719 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6720 assert(LI->getParent() == Data.front().first->getParent() &&
6721 LI->getType() == Data.front().first->getType() &&
6722 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6723 getUnderlyingObject(Data.front().first->getPointerOperand(),
6725 "Expected loads with the same type, same parent and same "
6726 "underlying pointer.");
6727 std::optional<int> Dist = getPointersDiff(
6728 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6729 Data.front().first->getPointerOperand(), DL, SE,
6730 /*StrictCheck=*/true);
6731 if (!Dist)
6732 continue;
6733 auto It = Map.find(*Dist);
6734 if (It != Map.end() && It->second != LI)
6735 continue;
6736 if (It == Map.end()) {
6737 Data.emplace_back(LI, *Dist);
6738 Map.try_emplace(*Dist, LI);
6739 }
6740 IsFound = true;
6741 break;
6742 }
6743 if (!IsFound) {
6744 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6745 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6746 }
6747 }
6748 auto FindMatchingLoads =
6749 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6750 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6751 &GatheredLoads,
6752 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6753 int &Offset, unsigned &Start) {
6754 if (Loads.empty())
6755 return GatheredLoads.end();
6757 LoadInst *LI = Loads.front().first;
6758 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6759 if (Idx < Start)
6760 continue;
6761 ToAdd.clear();
6762 if (LI->getParent() != Data.front().first->getParent() ||
6763 LI->getType() != Data.front().first->getType())
6764 continue;
6765 std::optional<int> Dist =
6766 getPointersDiff(LI->getType(), LI->getPointerOperand(),
6767 Data.front().first->getType(),
6768 Data.front().first->getPointerOperand(), DL, SE,
6769 /*StrictCheck=*/true);
6770 if (!Dist)
6771 continue;
6772 SmallSet<int, 4> DataDists;
6773 SmallPtrSet<LoadInst *, 4> DataLoads;
6774 for (std::pair<LoadInst *, int> P : Data) {
6775 DataDists.insert(P.second);
6776 DataLoads.insert(P.first);
6777 }
6778 // Found matching gathered loads - check if all loads are unique or
6779 // can be effectively vectorized.
6780 unsigned NumUniques = 0;
6781 for (auto [Cnt, Pair] : enumerate(Loads)) {
6782 bool Used = DataLoads.contains(Pair.first);
6783 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6784 ++NumUniques;
6785 ToAdd.insert(Cnt);
6786 } else if (Used) {
6787 Repeated.insert(Cnt);
6788 }
6789 }
6790 if (NumUniques > 0 &&
6791 (Loads.size() == NumUniques ||
6792 (Loads.size() - NumUniques >= 2 &&
6793 Loads.size() - NumUniques >= Loads.size() / 2 &&
6794 (has_single_bit(Data.size() + NumUniques) ||
6795 bit_ceil(Data.size()) <
6796 bit_ceil(Data.size() + NumUniques))))) {
6797 Offset = *Dist;
6798 Start = Idx + 1;
6799 return std::next(GatheredLoads.begin(), Idx);
6800 }
6801 }
6802 ToAdd.clear();
6803 return GatheredLoads.end();
6804 };
6805 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6806 unsigned Start = 0;
6807 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6808 int Offset = 0;
6809 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6810 Offset, Start);
6811 while (It != GatheredLoads.end()) {
6812 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6813 for (unsigned Idx : LocalToAdd)
6814 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6815 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6816 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6817 Start);
6818 }
6819 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6820 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6821 })) {
6822 auto AddNewLoads =
6823 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6824 for (unsigned Idx : seq<unsigned>(Data.size())) {
6825 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6826 continue;
6827 Loads.push_back(Data[Idx]);
6828 }
6829 };
6830 if (!AddNew) {
6831 LoadInst *LI = Data.front().first;
6832 It = find_if(
6833 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6834 return PD.front().first->getParent() == LI->getParent() &&
6835 PD.front().first->getType() == LI->getType();
6836 });
6837 while (It != GatheredLoads.end()) {
6838 AddNewLoads(*It);
6839 It = std::find_if(
6840 std::next(It), GatheredLoads.end(),
6841 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6842 return PD.front().first->getParent() == LI->getParent() &&
6843 PD.front().first->getType() == LI->getType();
6844 });
6845 }
6846 }
6847 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6848 AddNewLoads(GatheredLoads.emplace_back());
6849 }
6850 }
6851}
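// A minimal standalone sketch (not part of the pass) of the clustering done by
// gatherPossiblyVectorizableLoads() above. Each load is reduced to a (base,
// offset) pair standing in for getUnderlyingObject() and getPointersDiff();
// all names are hypothetical.
struct SketchLoad {
  unsigned Base; // stand-in for the underlying pointer object
  int Offset;    // stand-in for the constant distance from that base
};
static SmallVector<SmallVector<std::pair<unsigned, int>>>
sketchClusterLoads(ArrayRef<SketchLoad> Loads) {
  // One cluster per base object; within a cluster keep at most one load per
  // distinct distance to the cluster's first load (cf. ClusteredDistToLoad).
  SmallVector<SmallVector<std::pair<unsigned, int>>> Clusters;
  DenseMap<unsigned, unsigned> BaseToCluster;
  SmallVector<DenseMap<int, unsigned>> DistSeen;
  for (auto [Idx, L] : enumerate(Loads)) {
    auto [It, IsNew] = BaseToCluster.try_emplace(L.Base, Clusters.size());
    if (IsNew) {
      Clusters.emplace_back().emplace_back(Idx, 0);
      DistSeen.emplace_back().try_emplace(0, Idx);
      continue;
    }
    unsigned C = It->second;
    int Dist = L.Offset - Loads[Clusters[C].front().first].Offset;
    if (DistSeen[C].try_emplace(Dist, Idx).second)
      Clusters[C].emplace_back(Idx, Dist);
  }
  return Clusters;
}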
6852
6853void BoUpSLP::tryToVectorizeGatheredLoads(
6854 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6855 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6856 8> &GatheredLoads) {
6857 GatheredLoadsEntriesFirst = VectorizableTree.size();
6858
6859 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6860 LoadEntriesToVectorize.size());
6861 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6862 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6863 VectorizableTree[Idx]->Scalars.end());
6864
6865 // Sort loads by distance.
6866 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6867 const std::pair<LoadInst *, int> &L2) {
6868 return L1.second > L2.second;
6869 };
6870
6871 auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
6872 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6873 Loads.size());
6874 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6875 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6876 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6877 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6878 };
6879
6880 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6881 BoUpSLP::ValueSet &VectorizedLoads,
6882 SmallVectorImpl<LoadInst *> &NonVectorized,
6883 bool Final, unsigned MaxVF) {
6884 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
6885 unsigned StartIdx = 0;
6886 SmallVector<int> CandidateVFs;
6887 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6888 CandidateVFs.push_back(MaxVF);
6889 for (int NumElts = getFloorFullVectorNumberOfElements(
6890 *TTI, Loads.front()->getType(), MaxVF);
6891 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6892 *TTI, Loads.front()->getType(), NumElts - 1)) {
6893 CandidateVFs.push_back(NumElts);
6894 if (VectorizeNonPowerOf2 && NumElts > 2)
6895 CandidateVFs.push_back(NumElts - 1);
6896 }
6897
6898 if (Final && CandidateVFs.empty())
6899 return Results;
6900
6901 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6902 for (unsigned NumElts : CandidateVFs) {
6903 if (Final && NumElts > BestVF)
6904 continue;
6905 SmallVector<unsigned> MaskedGatherVectorized;
6906 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6907 ++Cnt) {
6908 ArrayRef<LoadInst *> Slice =
6909 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6910 if (VectorizedLoads.count(Slice.front()) ||
6911 VectorizedLoads.count(Slice.back()) ||
6913 continue;
6914 // Check if it is profitable to try vectorizing gathered loads. It is
6915 // profitable if we have more than 3 consecutive loads or if we have
6916 // fewer but all users are vectorized or deleted.
6917 bool AllowToVectorize = false;
6918 // Check if it is profitable to vectorize 2-elements loads.
6919 if (NumElts == 2) {
6920 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6921 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6922 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6923 for (LoadInst *LI : Slice) {
6924 // If single use/user - allow to vectorize.
6925 if (LI->hasOneUse())
6926 continue;
6927 // 1. Check if number of uses equals number of users.
6928 // 2. All users are deleted.
6929 // 3. The load broadcasts are not allowed or the load is not
6930 // broadcasted.
6931 if (static_cast<unsigned int>(std::distance(
6932 LI->user_begin(), LI->user_end())) != LI->getNumUses())
6933 return false;
6934 if (!IsLegalBroadcastLoad)
6935 continue;
6936 if (LI->hasNUsesOrMore(UsesLimit))
6937 return false;
6938 for (User *U : LI->users()) {
6939 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
6940 continue;
6941 if (const TreeEntry *UTE = getTreeEntry(U)) {
6942 for (int I : seq<int>(UTE->getNumOperands())) {
6943 if (all_of(UTE->getOperand(I),
6944 [LI](Value *V) { return V == LI; }))
6945 // Found legal broadcast - do not vectorize.
6946 return false;
6947 }
6948 }
6949 }
6950 }
6951 return true;
6952 };
6953 AllowToVectorize = CheckIfAllowed(Slice);
6954 } else {
6955 AllowToVectorize =
6956 (NumElts >= 3 ||
6957 any_of(ValueToGatherNodes.at(Slice.front()),
6958 [=](const TreeEntry *TE) {
6959 return TE->Scalars.size() == 2 &&
6960 ((TE->Scalars.front() == Slice.front() &&
6961 TE->Scalars.back() == Slice.back()) ||
6962 (TE->Scalars.front() == Slice.back() &&
6963 TE->Scalars.back() == Slice.front()));
6964 })) &&
6965 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
6966 Slice.size());
6967 }
6968 if (AllowToVectorize) {
6969 SmallVector<Value *> PointerOps;
6970 OrdersType CurrentOrder;
6971 // Try to build vector load.
6972 ArrayRef<Value *> Values(
6973 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
6974 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
6975 PointerOps, &BestVF);
6976 if (LS != LoadsState::Gather ||
6977 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
6978 if (LS == LoadsState::ScatterVectorize) {
6979 if (MaskedGatherVectorized.empty() ||
6980 Cnt >= MaskedGatherVectorized.back() + NumElts)
6981 MaskedGatherVectorized.push_back(Cnt);
6982 continue;
6983 }
6984 if (LS != LoadsState::Gather) {
6985 Results.emplace_back(Values, LS);
6986 VectorizedLoads.insert(Slice.begin(), Slice.end());
6987 // If we vectorized initial block, no need to try to vectorize it
6988 // again.
6989 if (Cnt == StartIdx)
6990 StartIdx += NumElts;
6991 }
6992 // Check if the whole array was vectorized already - exit.
6993 if (StartIdx >= Loads.size())
6994 break;
6995 // Erase last masked gather candidate, if another candidate within
6996 // the range is found to be better.
6997 if (!MaskedGatherVectorized.empty() &&
6998 Cnt < MaskedGatherVectorized.back() + NumElts)
6999 MaskedGatherVectorized.pop_back();
7000 Cnt += NumElts - 1;
7001 continue;
7002 }
7003 }
7004 if (!AllowToVectorize || BestVF == 0)
7006 }
7007 // Mark masked gathers candidates as vectorized, if any.
7008 for (unsigned Cnt : MaskedGatherVectorized) {
7009 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7010 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7011 ArrayRef<Value *> Values(
7012 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7013 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7014 VectorizedLoads.insert(Slice.begin(), Slice.end());
7015 // If we vectorized initial block, no need to try to vectorize it again.
7016 if (Cnt == StartIdx)
7017 StartIdx += NumElts;
7018 }
7019 }
7020 for (LoadInst *LI : Loads) {
7021 if (!VectorizedLoads.contains(LI))
7022 NonVectorized.push_back(LI);
7023 }
7024 return Results;
7025 };
7026 auto ProcessGatheredLoads =
7027 [&, &TTI = *TTI](
7028 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7029 bool Final = false) {
7030 SmallVector<LoadInst *> NonVectorized;
7031 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7032 if (LoadsDists.size() <= 1) {
7033 NonVectorized.push_back(LoadsDists.back().first);
7034 continue;
7035 }
7036 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7037 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7038 transform(
7039 LoadsDists, OriginalLoads.begin(),
7040 [](const std::pair<LoadInst *, int> &L) { return L.first; });
7041 stable_sort(LocalLoadsDists, LoadSorter);
7042 SmallVector<LoadInst *> Loads;
7043 unsigned MaxConsecutiveDistance = 0;
7044 unsigned CurrentConsecutiveDist = 1;
7045 int LastDist = LocalLoadsDists.front().second;
7046 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7047 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7048 if (getTreeEntry(L.first))
7049 continue;
7050 assert(LastDist >= L.second &&
7051 "Expected first distance always not less than second");
7052 if (static_cast<unsigned>(LastDist - L.second) ==
7053 CurrentConsecutiveDist) {
7054 ++CurrentConsecutiveDist;
7055 MaxConsecutiveDistance =
7056 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7057 Loads.push_back(L.first);
7058 continue;
7059 }
7060 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7061 !Loads.empty())
7062 Loads.pop_back();
7063 CurrentConsecutiveDist = 1;
7064 LastDist = L.second;
7065 Loads.push_back(L.first);
7066 }
7067 if (Loads.size() <= 1)
7068 continue;
7069 if (AllowMaskedGather)
7070 MaxConsecutiveDistance = Loads.size();
7071 else if (MaxConsecutiveDistance < 2)
7072 continue;
7073 BoUpSLP::ValueSet VectorizedLoads;
7074 SmallVector<LoadInst *> SortedNonVectorized;
7075 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
7076 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7077 Final, MaxConsecutiveDistance);
7078 if (!Results.empty() && !SortedNonVectorized.empty() &&
7079 OriginalLoads.size() == Loads.size() &&
7080 MaxConsecutiveDistance == Loads.size() &&
7081 any_of(Results,
7082 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7083 return P.second == LoadsState::ScatterVectorize;
7084 })) {
7085 VectorizedLoads.clear();
7086 SmallVector<LoadInst *> UnsortedNonVectorized;
7087 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
7088 UnsortedResults =
7089 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7090 UnsortedNonVectorized, Final,
7091 OriginalLoads.size());
7092 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7093 SortedNonVectorized.swap(UnsortedNonVectorized);
7094 Results.swap(UnsortedResults);
7095 }
7096 }
7097 for (auto [Slice, _] : Results) {
7098 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7099 << Slice.size() << ")\n");
7100 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7101 for (Value *L : Slice)
7102 if (!getTreeEntry(L))
7103 SortedNonVectorized.push_back(cast<LoadInst>(L));
7104 continue;
7105 }
7106
7107 // Select maximum VF as a maximum of user gathered nodes and
7108 // distance between scalar loads in these nodes.
7109 unsigned MaxVF = Slice.size();
7110 unsigned UserMaxVF = 0;
7111 unsigned InterleaveFactor = 0;
7112 if (MaxVF == 2) {
7113 UserMaxVF = MaxVF;
7114 } else {
7115 // Found distance between segments of the interleaved loads.
7116 std::optional<unsigned> InterleavedLoadsDistance = 0;
7117 unsigned Order = 0;
7118 std::optional<unsigned> CommonVF = 0;
7119 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7120 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7121 for (auto [Idx, V] : enumerate(Slice)) {
7122 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7123 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7124 unsigned Pos =
7125 EntryToPosition.try_emplace(E, Idx).first->second;
7126 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7127 if (CommonVF) {
7128 if (*CommonVF == 0) {
7129 CommonVF = E->Scalars.size();
7130 continue;
7131 }
7132 if (*CommonVF != E->Scalars.size())
7133 CommonVF.reset();
7134 }
7135 // Check if the load is the part of the interleaved load.
7136 if (Pos != Idx && InterleavedLoadsDistance) {
7137 if (!DeinterleavedNodes.contains(E) &&
7138 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7139 if (isa<Constant>(V))
7140 return false;
7141 if (getTreeEntry(V))
7142 return true;
7143 const auto &Nodes = ValueToGatherNodes.at(V);
7144 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7145 !is_contained(Slice, V);
7146 })) {
7147 InterleavedLoadsDistance.reset();
7148 continue;
7149 }
7150 DeinterleavedNodes.insert(E);
7151 if (*InterleavedLoadsDistance == 0) {
7152 InterleavedLoadsDistance = Idx - Pos;
7153 continue;
7154 }
7155 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7156 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7157 InterleavedLoadsDistance.reset();
7158 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7159 }
7160 }
7161 }
7162 DeinterleavedNodes.clear();
7163 // Check if the large load represents interleaved load operation.
7164 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7165 CommonVF.value_or(0) != 0) {
7166 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7167 unsigned VF = *CommonVF;
7168 OrdersType Order;
7169 SmallVector<Value *> PointerOps;
7170 // Segmented load detected - vectorize at maximum vector factor.
7171 if (InterleaveFactor <= Slice.size() &&
7172 TTI.isLegalInterleavedAccessType(
7173 getWidenedType(Slice.front()->getType(), VF),
7174 InterleaveFactor,
7175 cast<LoadInst>(Slice.front())->getAlign(),
7176 cast<LoadInst>(Slice.front())
7177 ->getPointerAddressSpace()) &&
7178 canVectorizeLoads(Slice, Slice.front(), Order,
7179 PointerOps) == LoadsState::Vectorize) {
7180 UserMaxVF = InterleaveFactor * VF;
7181 } else {
7182 InterleaveFactor = 0;
7183 }
7184 }
7185 // Cannot represent the loads as consecutive vectorizable nodes -
7186 // just exit.
7187 unsigned ConsecutiveNodesSize = 0;
7188 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7189 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7190 [&, Slice = Slice](const auto &P) {
7191 const auto *It = find_if(Slice, [&](Value *V) {
7192 return std::get<1>(P).contains(V);
7193 });
7194 if (It == Slice.end())
7195 return false;
7196 ArrayRef<Value *> VL =
7197 VectorizableTree[std::get<0>(P)]->Scalars;
7198 ConsecutiveNodesSize += VL.size();
7199 unsigned Start = std::distance(Slice.begin(), It);
7200 unsigned Sz = Slice.size() - Start;
7201 return Sz < VL.size() ||
7202 Slice.slice(std::distance(Slice.begin(), It),
7203 VL.size()) != VL;
7204 }))
7205 continue;
7206 // Try to build long masked gather loads.
7207 UserMaxVF = bit_ceil(UserMaxVF);
7208 if (InterleaveFactor == 0 &&
7209 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7210 [&, Slice = Slice](unsigned Idx) {
7211 OrdersType Order;
7212 SmallVector<Value *> PointerOps;
7213 return canVectorizeLoads(
7214 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7215 Slice[Idx * UserMaxVF], Order,
7216 PointerOps) ==
7217 LoadsState::ScatterVectorize;
7218 }))
7219 UserMaxVF = MaxVF;
7220 if (Slice.size() != ConsecutiveNodesSize)
7221 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7222 }
7223 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7224 bool IsVectorized = true;
7225 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7226 ArrayRef<Value *> SubSlice =
7227 Slice.slice(I, std::min(VF, E - I));
7228 if (getTreeEntry(SubSlice.front()))
7229 continue;
7230 // Check if the subslice is a to-be-vectorized entry which is not
7231 // equal to this entry.
7232 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7233 [&](const auto &P) {
7234 return !SubSlice.equals(
7235 VectorizableTree[std::get<0>(P)]
7236 ->Scalars) &&
7237 set_is_subset(SubSlice, std::get<1>(P));
7238 }))
7239 continue;
7240 unsigned Sz = VectorizableTree.size();
7241 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7242 if (Sz == VectorizableTree.size()) {
7243 IsVectorized = false;
7244 // Try non-interleaved vectorization with smaller vector
7245 // factor.
7246 if (InterleaveFactor > 0) {
7247 VF = 2 * (MaxVF / InterleaveFactor);
7248 InterleaveFactor = 0;
7249 }
7250 continue;
7251 }
7252 }
7253 if (IsVectorized)
7254 break;
7255 }
7256 }
7257 NonVectorized.append(SortedNonVectorized);
7258 }
7259 return NonVectorized;
7260 };
7261 for (const auto &GLs : GatheredLoads) {
7262 const auto &Ref = GLs.second;
7263 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7264 if (!Ref.empty() && !NonVectorized.empty() &&
7265 std::accumulate(
7266 Ref.begin(), Ref.end(), 0u,
7267 [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
7268 return S + LoadsDists.size();
7269 }) != NonVectorized.size() &&
7270 IsMaskedGatherSupported(NonVectorized)) {
7271 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7272 for (LoadInst *LI : NonVectorized) {
7273 // Reinsert non-vectorized loads to other list of loads with the same
7274 // base pointers.
7275 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7276 FinalGatheredLoads,
7277 /*AddNew=*/false);
7278 }
7279 // Final attempt to vectorize non-vectorized loads.
7280 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7281 }
7282 }
7283 // Try to vectorize postponed load entries, previously marked as gathered.
7284 for (unsigned Idx : LoadEntriesToVectorize) {
7285 const TreeEntry &E = *VectorizableTree[Idx];
7286 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7287 // Avoid reordering, if possible.
7288 if (!E.ReorderIndices.empty()) {
7289 // Build a mask out of the reorder indices and reorder scalars per this
7290 // mask.
7291 SmallVector<int> ReorderMask;
7292 inversePermutation(E.ReorderIndices, ReorderMask);
7293 reorderScalars(GatheredScalars, ReorderMask);
7294 }
7295 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7296 }
7297 // If no new entries were created, treat it as if no gathered-load entries
7298 // need to be handled.
7299 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7300 VectorizableTree.size())
7301 GatheredLoadsEntriesFirst.reset();
7302}
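// A minimal standalone sketch (not part of the pass) of the retry loop used at
// the end of ProcessGatheredLoads above: try the maximal vector factor first
// and halve it until every subslice of the gathered loads can be built.
// "CanBuild" stands in for a successful buildTree_rec() call; the function
// name is hypothetical.
static unsigned
sketchPickVectorFactor(unsigned SliceSize,
                       function_ref<bool(unsigned Begin, unsigned VF)> CanBuild) {
  for (unsigned VF = SliceSize; VF >= 2; VF /= 2) {
    bool AllBuilt = true;
    for (unsigned I = 0; I < SliceSize; I += VF)
      AllBuilt &= CanBuild(I, std::min(VF, SliceSize - I));
    if (AllBuilt)
      return VF; // Largest factor for which every subslice was vectorized.
  }
  return 1; // Everything stays gathered/scalar.
}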
7303
7304/// \return true if the specified list of values has only one instruction that
7305/// requires scheduling, false otherwise.
7306#ifndef NDEBUG
7307 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7308 Value *NeedsScheduling = nullptr;
7309 for (Value *V : VL) {
7310 if (doesNotNeedToBeScheduled(V))
7311 continue;
7312 if (!NeedsScheduling) {
7313 NeedsScheduling = V;
7314 continue;
7315 }
7316 return false;
7317 }
7318 return NeedsScheduling;
7319}
7320#endif
7321
7322 /// Generates a key/subkey pair for the given value to provide effective
7323 /// sorting of the values and better detection of vectorizable value
7324 /// sequences. The keys/subkeys can be used for better sorting of the values
7325 /// themselves (keys) and within value subgroups (subkeys).
7326static std::pair<size_t, size_t> generateKeySubkey(
7327 Value *V, const TargetLibraryInfo *TLI,
7328 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7329 bool AllowAlternate) {
7330 hash_code Key = hash_value(V->getValueID() + 2);
7331 hash_code SubKey = hash_value(0);
7332 // Sort the loads by the distance between the pointers.
7333 if (auto *LI = dyn_cast<LoadInst>(V)) {
7334 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7335 if (LI->isSimple())
7336 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7337 else
7338 Key = SubKey = hash_value(LI);
7339 } else if (isVectorLikeInstWithConstOps(V)) {
7340 // Sort extracts by the vector operands.
7341 if (isa<ExtractElementInst, UndefValue>(V))
7342 Key = hash_value(Value::UndefValueVal + 1);
7343 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7344 if (!isUndefVector(EI->getVectorOperand()).all() &&
7345 !isa<UndefValue>(EI->getIndexOperand()))
7346 SubKey = hash_value(EI->getVectorOperand());
7347 }
7348 } else if (auto *I = dyn_cast<Instruction>(V)) {
7349 // Sort other instructions just by the opcodes except for CMPInst.
7350 // For CMP also sort by the predicate kind.
7351 if ((isa<BinaryOperator, CastInst>(I)) &&
7352 isValidForAlternation(I->getOpcode())) {
7353 if (AllowAlternate)
7354 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7355 else
7356 Key = hash_combine(hash_value(I->getOpcode()), Key);
7357 SubKey = hash_combine(
7358 hash_value(I->getOpcode()), hash_value(I->getType()),
7359 hash_value(isa<BinaryOperator>(I)
7360 ? I->getType()
7361 : cast<CastInst>(I)->getOperand(0)->getType()));
7362 // For casts, look through the only operand to improve compile time.
7363 if (isa<CastInst>(I)) {
7364 std::pair<size_t, size_t> OpVals =
7365 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7366 /*AllowAlternate=*/true);
7367 Key = hash_combine(OpVals.first, Key);
7368 SubKey = hash_combine(OpVals.first, SubKey);
7369 }
7370 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7371 CmpInst::Predicate Pred = CI->getPredicate();
7372 if (CI->isCommutative())
7373 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7374 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
7375 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7376 hash_value(SwapPred),
7377 hash_value(CI->getOperand(0)->getType()));
7378 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7379 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
7380 if (isTriviallyVectorizable(ID)) {
7381 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7382 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7383 SubKey = hash_combine(hash_value(I->getOpcode()),
7384 hash_value(Call->getCalledFunction()));
7385 } else {
7386 Key = hash_combine(hash_value(Call), Key);
7387 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7388 }
7389 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7390 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7391 hash_value(Op.Tag), SubKey);
7392 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7393 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7394 SubKey = hash_value(Gep->getPointerOperand());
7395 else
7396 SubKey = hash_value(Gep);
7397 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7398 !isa<ConstantInt>(I->getOperand(1))) {
7399 // Do not try to vectorize instructions with potentially high cost.
7400 SubKey = hash_value(I);
7401 } else {
7402 SubKey = hash_value(I->getOpcode());
7403 }
7404 Key = hash_combine(hash_value(I->getParent()), Key);
7405 }
7406 return std::make_pair(Key, SubKey);
7407}
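// A minimal standalone sketch (not part of the pass) of how the (key, subkey)
// pairs produced by generateKeySubkey() can be used: values are bucketed first
// by key and then by subkey inside each key bucket, so that likely-vectorizable
// sequences end up next to each other. The function name is hypothetical.
static SmallVector<SmallVector<unsigned>>
sketchBucketByKeySubkey(ArrayRef<std::pair<size_t, size_t>> KeySubkeys) {
  MapVector<size_t, MapVector<size_t, SmallVector<unsigned>>> Buckets;
  for (auto [Idx, KS] : enumerate(KeySubkeys))
    Buckets[KS.first][KS.second].push_back(Idx);
  SmallVector<SmallVector<unsigned>> Res;
  for (auto &KeyBucket : Buckets)
    for (auto &SubBucket : KeyBucket.second)
      Res.push_back(std::move(SubBucket.second));
  return Res; // Indices of the original values, grouped by (key, subkey).
}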
7408
7409/// Checks if the specified instruction \p I is an alternate operation for
7410/// the given \p MainOp and \p AltOp instructions.
7411static bool isAlternateInstruction(const Instruction *I,
7412 const Instruction *MainOp,
7413 const Instruction *AltOp,
7414 const TargetLibraryInfo &TLI);
7415
7416bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7417 ArrayRef<Value *> VL) const {
7418 unsigned Opcode0 = S.getOpcode();
7419 unsigned Opcode1 = S.getAltOpcode();
7420 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7421 // If this pattern is supported by the target then consider it profitable.
7422 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7423 Opcode0, Opcode1, OpcodeMask))
7424 return true;
7425 SmallVector<SmallVector<Value *>> Operands;
7426 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7427 Operands.emplace_back();
7428 // Prepare the operand vector.
7429 for (Value *V : VL) {
7430 if (isa<PoisonValue>(V)) {
7431 Operands.back().push_back(
7432 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7433 continue;
7434 }
7435 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7436 }
7437 }
7438 if (Operands.size() == 2) {
7439 // Try to find the best operand candidates.
7440 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7441 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7442 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7443 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7444 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7445 std::optional<int> Res = findBestRootPair(Candidates);
7446 switch (Res.value_or(0)) {
7447 case 0:
7448 break;
7449 case 1:
7450 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7451 break;
7452 case 2:
7453 std::swap(Operands[0][I], Operands[1][I]);
7454 break;
7455 default:
7456 llvm_unreachable("Unexpected index.");
7457 }
7458 }
7459 }
7460 DenseSet<unsigned> UniqueOpcodes;
7461 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7462 unsigned NonInstCnt = 0;
7463 // Estimate number of instructions, required for the vectorized node and for
7464 // the buildvector node.
7465 unsigned UndefCnt = 0;
7466 // Count the number of extra shuffles, required for vector nodes.
7467 unsigned ExtraShuffleInsts = 0;
7468 // Check that operands do not contain same values and create either perfect
7469 // diamond match or shuffled match.
7470 if (Operands.size() == 2) {
7471 // Do not count same operands twice.
7472 if (Operands.front() == Operands.back()) {
7473 Operands.erase(Operands.begin());
7474 } else if (!allConstant(Operands.front()) &&
7475 all_of(Operands.front(), [&](Value *V) {
7476 return is_contained(Operands.back(), V);
7477 })) {
7478 Operands.erase(Operands.begin());
7479 ++ExtraShuffleInsts;
7480 }
7481 }
7482 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7483 // Vectorize the node if:
7484 // 1. At least a single operand is constant or a splat.
7485 // 2. Operands have many loop invariants (the instructions are not loop
7486 // invariants).
7487 // 3. At least a single unique operand is supposed to be vectorized.
7488 return none_of(Operands,
7489 [&](ArrayRef<Value *> Op) {
7490 if (allConstant(Op) ||
7491 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7492 getSameOpcode(Op, *TLI).getMainOp()))
7493 return false;
7494 DenseMap<Value *, unsigned> Uniques;
7495 for (Value *V : Op) {
7496 if (isa<Constant, ExtractElementInst>(V) ||
7497 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7498 if (isa<UndefValue>(V))
7499 ++UndefCnt;
7500 continue;
7501 }
7502 auto Res = Uniques.try_emplace(V, 0);
7503 // Found first duplicate - need to add shuffle.
7504 if (!Res.second && Res.first->second == 1)
7505 ++ExtraShuffleInsts;
7506 ++Res.first->getSecond();
7507 if (auto *I = dyn_cast<Instruction>(V))
7508 UniqueOpcodes.insert(I->getOpcode());
7509 else if (Res.second)
7510 ++NonInstCnt;
7511 }
7512 return none_of(Uniques, [&](const auto &P) {
7513 return P.first->hasNUsesOrMore(P.second + 1) &&
7514 none_of(P.first->users(), [&](User *U) {
7515 return getTreeEntry(U) || Uniques.contains(U);
7516 });
7517 });
7518 }) ||
7519 // Do not vectorize node, if estimated number of vector instructions is
7520 // more than estimated number of buildvector instructions. Number of
7521 // vector operands is number of vector instructions + number of vector
7522 // instructions for operands (buildvectors). Number of buildvector
7523 // instructions is just number_of_operands * number_of_scalars.
7524 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7525 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7526 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7527}
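// A minimal standalone sketch (not part of the pass) of the per-lane opcode
// mask consumed by TTI->isLegalAltInstr() above: one bit per lane, set when
// the lane uses the alternate opcode. Plain unsigned opcodes stand in for the
// instructions; the function name is hypothetical.
static SmallBitVector sketchAltInstrMask(ArrayRef<unsigned> LaneOpcodes,
                                         unsigned Opcode0, unsigned Opcode1) {
  SmallBitVector OpcodeMask(LaneOpcodes.size(), false);
  for (auto [Lane, Opc] : enumerate(LaneOpcodes)) {
    assert((Opc == Opcode0 || Opc == Opcode1) &&
           "Each lane must use the main or the alternate opcode.");
    if (Opc == Opcode1)
      OpcodeMask.set(Lane);
  }
  return OpcodeMask;
}
// E.g. lanes {Add, Sub, Add, Sub} with Opcode0 = Add and Opcode1 = Sub set
// bits 1 and 3, yielding the mask 0b1010.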
7528
7529BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7530 const InstructionsState &S, ArrayRef<Value *> VL,
7531 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7532 SmallVectorImpl<Value *> &PointerOps) {
7533 assert(S.getMainOp() &&
7534 "Expected instructions with same/alternate opcodes only.");
7535
7536 unsigned ShuffleOrOp =
7537 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7538 Instruction *VL0 = S.getMainOp();
7539 switch (ShuffleOrOp) {
7540 case Instruction::PHI: {
7541 // Too many operands - gather, most probably won't be vectorized.
7542 if (VL0->getNumOperands() > MaxPHINumOperands)
7543 return TreeEntry::NeedToGather;
7544 // Check for terminator values (e.g. invoke).
7545 for (Value *V : VL) {
7546 auto *PHI = dyn_cast<PHINode>(V);
7547 if (!PHI)
7548 continue;
7549 for (Value *Incoming : PHI->incoming_values()) {
7550 Instruction *Term = dyn_cast<Instruction>(Incoming);
7551 if (Term && Term->isTerminator()) {
7553 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7554 return TreeEntry::NeedToGather;
7555 }
7556 }
7557 }
7558
7559 return TreeEntry::Vectorize;
7560 }
7561 case Instruction::ExtractValue:
7562 case Instruction::ExtractElement: {
7563 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7564 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7565 if (!has_single_bit(VL.size()))
7566 return TreeEntry::NeedToGather;
7567 if (Reuse || !CurrentOrder.empty())
7568 return TreeEntry::Vectorize;
7569 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7570 return TreeEntry::NeedToGather;
7571 }
7572 case Instruction::InsertElement: {
7573 // Check that we have a buildvector and not a shuffle of 2 or more
7574 // different vectors.
7575 ValueSet SourceVectors;
7576 for (Value *V : VL) {
7577 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7578 assert(getElementIndex(V) != std::nullopt &&
7579 "Non-constant or undef index?");
7580 }
7581
7582 if (count_if(VL, [&SourceVectors](Value *V) {
7583 return !SourceVectors.contains(V);
7584 }) >= 2) {
7585 // Found 2nd source vector - cancel.
7586 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7587 "different source vectors.\n");
7588 return TreeEntry::NeedToGather;
7589 }
7590
7591 if (any_of(VL, [&SourceVectors](Value *V) {
7592 // The last InsertElement can have multiple uses.
7593 return SourceVectors.contains(V) && !V->hasOneUse();
7594 })) {
7595 assert(SLPReVec && "Only supported by REVEC.");
7596 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7597 "multiple uses.\n");
7598 return TreeEntry::NeedToGather;
7599 }
7600
7601 return TreeEntry::Vectorize;
7602 }
7603 case Instruction::Load: {
7604 // Check that a vectorized load would load the same memory as a scalar
7605 // load. For example, we don't want to vectorize loads that are smaller
7606 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7607 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7608 // from such a struct, we read/write packed bits disagreeing with the
7609 // unvectorized version.
7610 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7611     case LoadsState::Vectorize:
7612       return TreeEntry::Vectorize;
7613     case LoadsState::ScatterVectorize:
7614       if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7615 // Delay slow vectorized nodes for better vectorization attempts.
7616 LoadEntriesToVectorize.insert(VectorizableTree.size());
7617 return TreeEntry::NeedToGather;
7618 }
7619 return TreeEntry::ScatterVectorize;
7620     case LoadsState::StridedVectorize:
7621       if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7622 // Delay slow vectorized nodes for better vectorization attempts.
7623 LoadEntriesToVectorize.insert(VectorizableTree.size());
7624 return TreeEntry::NeedToGather;
7625 }
7626 return TreeEntry::StridedVectorize;
7627 case LoadsState::Gather:
7628#ifndef NDEBUG
7629 Type *ScalarTy = VL0->getType();
7630 if (DL->getTypeSizeInBits(ScalarTy) !=
7631 DL->getTypeAllocSizeInBits(ScalarTy))
7632 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7633 else if (any_of(VL, [](Value *V) {
7634 auto *LI = dyn_cast<LoadInst>(V);
7635 return !LI || !LI->isSimple();
7636 }))
7637 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7638 else
7639 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7640#endif // NDEBUG
7641       registerNonVectorizableLoads(VL);
7642       return TreeEntry::NeedToGather;
7643 }
7644 llvm_unreachable("Unexpected state of loads");
7645 }
7646 case Instruction::ZExt:
7647 case Instruction::SExt:
7648 case Instruction::FPToUI:
7649 case Instruction::FPToSI:
7650 case Instruction::FPExt:
7651 case Instruction::PtrToInt:
7652 case Instruction::IntToPtr:
7653 case Instruction::SIToFP:
7654 case Instruction::UIToFP:
7655 case Instruction::Trunc:
7656 case Instruction::FPTrunc:
7657 case Instruction::BitCast: {
7658 Type *SrcTy = VL0->getOperand(0)->getType();
7659 for (Value *V : VL) {
7660 if (isa<PoisonValue>(V))
7661 continue;
7662 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7663 if (Ty != SrcTy || !isValidElementType(Ty)) {
7664 LLVM_DEBUG(
7665 dbgs() << "SLP: Gathering casts with different src types.\n");
7666 return TreeEntry::NeedToGather;
7667 }
7668 }
7669 return TreeEntry::Vectorize;
7670 }
7671 case Instruction::ICmp:
7672 case Instruction::FCmp: {
7673 // Check that all of the compares have the same predicate.
7674 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7675     CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7676     Type *ComparedTy = VL0->getOperand(0)->getType();
7677 for (Value *V : VL) {
7678 if (isa<PoisonValue>(V))
7679 continue;
7680 auto *Cmp = cast<CmpInst>(V);
7681 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7682 Cmp->getOperand(0)->getType() != ComparedTy) {
7683 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7684 return TreeEntry::NeedToGather;
7685 }
7686 }
7687 return TreeEntry::Vectorize;
7688 }
7689 case Instruction::Select:
7690 case Instruction::FNeg:
7691 case Instruction::Add:
7692 case Instruction::FAdd:
7693 case Instruction::Sub:
7694 case Instruction::FSub:
7695 case Instruction::Mul:
7696 case Instruction::FMul:
7697 case Instruction::UDiv:
7698 case Instruction::SDiv:
7699 case Instruction::FDiv:
7700 case Instruction::URem:
7701 case Instruction::SRem:
7702 case Instruction::FRem:
7703 case Instruction::Shl:
7704 case Instruction::LShr:
7705 case Instruction::AShr:
7706 case Instruction::And:
7707 case Instruction::Or:
7708 case Instruction::Xor:
7709 case Instruction::Freeze:
7710 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7711         TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7712           auto *I = dyn_cast<Instruction>(V);
7713 return I && I->isBinaryOp() && !I->isFast();
7714 }))
7715 return TreeEntry::NeedToGather;
7716 return TreeEntry::Vectorize;
7717 case Instruction::GetElementPtr: {
7718 // We don't combine GEPs with complicated (nested) indexing.
7719 for (Value *V : VL) {
7720 auto *I = dyn_cast<GetElementPtrInst>(V);
7721 if (!I)
7722 continue;
7723 if (I->getNumOperands() != 2) {
7724 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7725 return TreeEntry::NeedToGather;
7726 }
7727 }
7728
7729 // We can't combine several GEPs into one vector if they operate on
7730 // different types.
7731 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7732 for (Value *V : VL) {
7733 auto *GEP = dyn_cast<GEPOperator>(V);
7734 if (!GEP)
7735 continue;
7736 Type *CurTy = GEP->getSourceElementType();
7737 if (Ty0 != CurTy) {
7738 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7739 return TreeEntry::NeedToGather;
7740 }
7741 }
7742
7743 // We don't combine GEPs with non-constant indexes.
7744 Type *Ty1 = VL0->getOperand(1)->getType();
7745 for (Value *V : VL) {
7746 auto *I = dyn_cast<GetElementPtrInst>(V);
7747 if (!I)
7748 continue;
7749 auto *Op = I->getOperand(1);
7750 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7751 (Op->getType() != Ty1 &&
7752 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7753 Op->getType()->getScalarSizeInBits() >
7754 DL->getIndexSizeInBits(
7755 V->getType()->getPointerAddressSpace())))) {
7756 LLVM_DEBUG(
7757 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7758 return TreeEntry::NeedToGather;
7759 }
7760 }
7761
7762 return TreeEntry::Vectorize;
7763 }
7764 case Instruction::Store: {
7765 // Check if the stores are consecutive or if we need to swizzle them.
7766 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7767 // Avoid types that are padded when being allocated as scalars, while
7768 // being packed together in a vector (such as i1).
7769 if (DL->getTypeSizeInBits(ScalarTy) !=
7770 DL->getTypeAllocSizeInBits(ScalarTy)) {
7771 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7772 return TreeEntry::NeedToGather;
7773 }
7774 // Make sure all stores in the bundle are simple - we can't vectorize
7775 // atomic or volatile stores.
7776 for (Value *V : VL) {
7777 auto *SI = cast<StoreInst>(V);
7778 if (!SI->isSimple()) {
7779 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7780 return TreeEntry::NeedToGather;
7781 }
7782 PointerOps.push_back(SI->getPointerOperand());
7783 }
7784
7785 // Check the order of pointer operands.
7786 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7787 Value *Ptr0;
7788 Value *PtrN;
7789 if (CurrentOrder.empty()) {
7790 Ptr0 = PointerOps.front();
7791 PtrN = PointerOps.back();
7792 } else {
7793 Ptr0 = PointerOps[CurrentOrder.front()];
7794 PtrN = PointerOps[CurrentOrder.back()];
7795 }
7796 std::optional<int> Dist =
7797 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7798 // Check that the sorted pointer operands are consecutive.
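        // E.g., four stores whose (sorted) pointers are p, p+1, p+2 and p+3,
        // measured in elements of ScalarTy, give Dist == 3 == VL.size() - 1.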
7799 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7800 return TreeEntry::Vectorize;
7801 }
7802
7803 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7804 return TreeEntry::NeedToGather;
7805 }
7806 case Instruction::Call: {
7807 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7808         TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7809           auto *I = dyn_cast<Instruction>(V);
7810 return I && !I->isFast();
7811 }))
7812 return TreeEntry::NeedToGather;
7813 // Check if the calls are all to the same vectorizable intrinsic or
7814 // library function.
7815 CallInst *CI = cast<CallInst>(VL0);
7816     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7817
7818 VFShape Shape = VFShape::get(
7819 CI->getFunctionType(),
7820 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7821 false /*HasGlobalPred*/);
7822 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7823
7824 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7825 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7826 return TreeEntry::NeedToGather;
7827 }
7828 Function *F = CI->getCalledFunction();
7829 unsigned NumArgs = CI->arg_size();
7830 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7831 for (unsigned J = 0; J != NumArgs; ++J)
7832       if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
7833         ScalarArgs[J] = CI->getArgOperand(J);
7834 for (Value *V : VL) {
7835 CallInst *CI2 = dyn_cast<CallInst>(V);
7836 if (!CI2 || CI2->getCalledFunction() != F ||
7837 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7838 (VecFunc &&
7839 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7840           !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7841         LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7842 << "\n");
7843 return TreeEntry::NeedToGather;
7844 }
7845         // Some intrinsics have scalar arguments, and those arguments must be
7846         // the same across the bundle for it to be vectorized.
7847 for (unsigned J = 0; J != NumArgs; ++J) {
7848         if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
7849           Value *A1J = CI2->getArgOperand(J);
7850 if (ScalarArgs[J] != A1J) {
7851             LLVM_DEBUG(dbgs()
7852                        << "SLP: mismatched arguments in call:" << *CI
7853 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7854 return TreeEntry::NeedToGather;
7855 }
7856 }
7857 }
7858 // Verify that the bundle operands are identical between the two calls.
7859 if (CI->hasOperandBundles() &&
7860 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7861 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7862 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7863 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7864 << "!=" << *V << '\n');
7865 return TreeEntry::NeedToGather;
7866 }
7867 }
7868
7869 return TreeEntry::Vectorize;
7870 }
7871 case Instruction::ShuffleVector: {
7872 if (!S.isAltShuffle()) {
7873 // REVEC can support non alternate shuffle.
7874       if (SLPReVec && getShufflevectorNumGroups(VL))
7875         return TreeEntry::Vectorize;
7876       // If this is not an alternate sequence of opcodes (like add-sub),
7877       // then do not vectorize this instruction.
7878 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7879 return TreeEntry::NeedToGather;
7880 }
7881 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7882 LLVM_DEBUG(
7883 dbgs()
7884 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7885 "the whole alt sequence is not profitable.\n");
7886 return TreeEntry::NeedToGather;
7887 }
7888
7889 return TreeEntry::Vectorize;
7890 }
7891 default:
7892 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7893 return TreeEntry::NeedToGather;
7894 }
7895}
7896
7897namespace {
7898/// Allows correct handling of the operands of phi nodes, based on the \p Main
7899/// PHINode's order of incoming basic blocks/values.
7900class PHIHandler {
7901 DominatorTree &DT;
7902 PHINode *Main = nullptr;
7903  SmallVector<Value *> Phis;
7904  SmallVector<SmallVector<Value *>> Operands;
7905
7906public:
7907 PHIHandler() = delete;
7908 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7909 : DT(DT), Main(Main), Phis(Phis),
7910 Operands(Main->getNumIncomingValues(),
7911 SmallVector<Value *>(Phis.size(), nullptr)) {}
7912 void buildOperands() {
7913 constexpr unsigned FastLimit = 4;
7914 if (Main->getNumIncomingValues() <= FastLimit) {
7915 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7916 BasicBlock *InBB = Main->getIncomingBlock(I);
7917 if (!DT.isReachableFromEntry(InBB)) {
7918 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7919 continue;
7920 }
7921 // Prepare the operand vector.
7922 for (auto [Idx, V] : enumerate(Phis)) {
7923 auto *P = dyn_cast<PHINode>(V);
7924 if (!P) {
7925 assert(isa<PoisonValue>(V) &&
7926 "Expected isa instruction or poison value.");
7927 Operands[I][Idx] = V;
7928 continue;
7929 }
7930 if (P->getIncomingBlock(I) == InBB)
7931 Operands[I][Idx] = P->getIncomingValue(I);
7932 else
7933 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
7934 }
7935 }
7936 return;
7937 }
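    // Slow path for phis with many incoming values: group Main's incoming
    // indices by incoming block, fill the operands for one representative index
    // per block in a single pass over each phi, and then copy that operand list
    // to the remaining indices that share the same incoming block.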
7938     SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
7939     for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7940 BasicBlock *InBB = Main->getIncomingBlock(I);
7941 if (!DT.isReachableFromEntry(InBB)) {
7942 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7943 continue;
7944 }
7945 Blocks.try_emplace(InBB).first->second.push_back(I);
7946 }
7947 for (auto [Idx, V] : enumerate(Phis)) {
7948 if (isa<PoisonValue>(V)) {
7949 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
7950 Operands[I][Idx] = V;
7951 continue;
7952 }
7953 auto *P = cast<PHINode>(V);
7954 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
7955 BasicBlock *InBB = P->getIncomingBlock(I);
7956 if (InBB == Main->getIncomingBlock(I)) {
7957 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
7958 continue;
7959 Operands[I][Idx] = P->getIncomingValue(I);
7960 continue;
7961 }
7962 auto It = Blocks.find(InBB);
7963 if (It == Blocks.end())
7964 continue;
7965 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
7966 }
7967 }
7968 for (const auto &P : Blocks) {
7969 if (P.getSecond().size() <= 1)
7970 continue;
7971 unsigned BasicI = P.getSecond().front();
7972 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
7973       assert(all_of(enumerate(Operands[I]),
7974                     [&](const auto &Data) {
7975 return !Data.value() ||
7976 Data.value() == Operands[BasicI][Data.index()];
7977 }) &&
7978 "Expected empty operands list.");
7979 Operands[I] = Operands[BasicI];
7980 }
7981 }
7982 }
7983 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
7984};
7985} // namespace
7986
7987void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7988 const EdgeInfo &UserTreeIdx,
7989 unsigned InterleaveFactor) {
7990 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
7991
7992 SmallVector<int> ReuseShuffleIndices;
7993 SmallVector<Value *> UniqueValues;
7994 SmallVector<Value *> NonUniqueValueVL;
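  // Deduplicates the bundle: collects the unique scalars into UniqueValues,
  // records the lane mapping in ReuseShuffleIndices and, when allowed, pads the
  // bundle with poison up to a full-register number of elements. Returns false
  // (after creating a gather entry) if the reshuffled bundle cannot be
  // vectorized.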
7995 auto TryToFindDuplicates = [&](const InstructionsState &S,
7996 bool DoNotFail = false) {
7997 // Check that every instruction appears once in this bundle.
7998 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
7999 for (Value *V : VL) {
8000 if (isConstant(V)) {
8001 ReuseShuffleIndices.emplace_back(
8002 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8003 UniqueValues.emplace_back(V);
8004 continue;
8005 }
8006 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8007 ReuseShuffleIndices.emplace_back(Res.first->second);
8008 if (Res.second)
8009 UniqueValues.emplace_back(V);
8010 }
8011 size_t NumUniqueScalarValues = UniqueValues.size();
8012 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8013 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8014 if (NumUniqueScalarValues == VL.size() &&
8015 (VectorizeNonPowerOf2 || IsFullVectors)) {
8016 ReuseShuffleIndices.clear();
8017 } else {
8018       // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8019 if ((UserTreeIdx.UserTE &&
8020 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8021 !has_single_bit(VL.size())) {
8022 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8023 "for nodes with padding.\n");
8024 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8025 return false;
8026 }
8027 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8028 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8029 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8030 return isa<UndefValue>(V) || !isConstant(V);
8031 }))) {
8032 if (DoNotFail && UniquePositions.size() > 1 &&
8033 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8034 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8035           // Find the number of elements that forms full vectors.
8036 unsigned PWSz = getFullVectorNumberOfElements(
8037 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8038 if (PWSz == VL.size()) {
8039 ReuseShuffleIndices.clear();
8040 } else {
8041 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8042 NonUniqueValueVL.append(
8043 PWSz - UniqueValues.size(),
8044 PoisonValue::get(UniqueValues.front()->getType()));
8045 VL = NonUniqueValueVL;
8046 }
8047 return true;
8048 }
8049 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8050 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8051 return false;
8052 }
8053 VL = UniqueValues;
8054 }
8055 return true;
8056 };
8057
8058 InstructionsState S = getSameOpcode(VL, *TLI);
8059
8060 // Don't go into catchswitch blocks, which can happen with PHIs.
8061 // Such blocks can only have PHIs and the catchswitch. There is no
8062 // place to insert a shuffle if we need to, so just avoid that issue.
8063 if (S.getMainOp() &&
8064 isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8065 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8066 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8067 return;
8068 }
8069
8070 // Check if this is a duplicate of another entry.
8071 if (S.getOpcode()) {
8072 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8073 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8074 << ".\n");
8075 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8076 auto It = MultiNodeScalars.find(S.getMainOp());
8077 if (It != MultiNodeScalars.end()) {
8078 auto *TEIt = find_if(It->getSecond(),
8079 [&](TreeEntry *ME) { return ME->isSame(VL); });
8080 if (TEIt != It->getSecond().end())
8081 E = *TEIt;
8082 else
8083 E = nullptr;
8084 } else {
8085 E = nullptr;
8086 }
8087 }
8088 if (!E) {
8089 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8090 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8091 if (TryToFindDuplicates(S))
8092 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8093 ReuseShuffleIndices);
8094 return;
8095 }
8096       SmallPtrSet<const TreeEntry *, 4> Nodes;
8097       Nodes.insert(getTreeEntry(S.getMainOp()));
8098 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8099 Nodes.insert(E);
8100 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8101 if (any_of(Nodes, [&](const TreeEntry *E) {
8102 if (all_of(E->Scalars,
8103 [&](Value *V) { return Values.contains(V); }))
8104 return true;
8105 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8106 E->Scalars.end());
8107 return (
8108 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8109 })) {
8110 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8111 if (TryToFindDuplicates(S))
8112 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8113 ReuseShuffleIndices);
8114 return;
8115 }
8116 } else {
8117 // Record the reuse of the tree node. FIXME, currently this is only
8118 // used to properly draw the graph rather than for the actual
8119 // vectorization.
8120 E->UserTreeIndices.push_back(UserTreeIdx);
8121 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8122 << ".\n");
8123 return;
8124 }
8125 }
8126 }
8127
8128   // Gather if we hit the RecursionMaxDepth, unless this is a load (or a
8129   // zext/sext of a load), in which case peek through to include it in the
8130   // tree, without ballooning over-budget.
8131 if (Depth >= RecursionMaxDepth &&
8132 !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
8133 (match(S.getMainOp(), m_Load(m_Value())) ||
8134 all_of(VL, [&S](const Value *I) {
8135 return match(I,
8136                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8137                 cast<Instruction>(I)->getOpcode() ==
8138 S.getMainOp()->getOpcode();
8139 })))) {
8140 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8141 if (TryToFindDuplicates(S))
8142 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8143 ReuseShuffleIndices);
8144 return;
8145 }
8146
8147 // Don't handle scalable vectors
8148 if (S.getOpcode() == Instruction::ExtractElement &&
8149 isa<ScalableVectorType>(
8150 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8151 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8152 if (TryToFindDuplicates(S))
8153 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8154 ReuseShuffleIndices);
8155 return;
8156 }
8157
8158 // Don't handle vectors.
8159 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8160 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8161 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8162 return;
8163 }
8164
8165 // If all of the operands are identical or constant we have a simple solution.
8166 // If we deal with insert/extract instructions, they all must have constant
8167 // indices, otherwise we should gather them, not try to vectorize.
8168   // If this is an alternate-op node with 2 elements whose operands would be
8169   // gathered, do not vectorize.
8170 auto &&NotProfitableForVectorization = [&S, this,
8171                                           Depth](ArrayRef<Value *> VL) {
8172     if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
8173 return false;
8174 if (VectorizableTree.size() < MinTreeSize)
8175 return false;
8176 if (Depth >= RecursionMaxDepth - 1)
8177 return true;
8178 // Check if all operands are extracts, part of vector node or can build a
8179 // regular vectorize node.
8180 SmallVector<unsigned, 8> InstsCount;
8181 for (Value *V : VL) {
8182 auto *I = cast<Instruction>(V);
8183 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8184 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8185 }));
8186 }
8187 bool IsCommutative =
8188 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8189 if ((IsCommutative &&
8190 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8191 (!IsCommutative &&
8192 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8193 return true;
8194 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8195     SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8196     auto *I1 = cast<Instruction>(VL.front());
8197 auto *I2 = cast<Instruction>(VL.back());
8198 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8199 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8200 I2->getOperand(Op));
8201 if (static_cast<unsigned>(count_if(
8202 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8203               return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8204             })) >= S.getMainOp()->getNumOperands() / 2)
8205 return false;
8206 if (S.getMainOp()->getNumOperands() > 2)
8207 return true;
8208 if (IsCommutative) {
8209 // Check permuted operands.
8210 Candidates.clear();
8211 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8212 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8213 I2->getOperand((Op + 1) % E));
8214 if (any_of(
8215 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8216               return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8217             }))
8218 return false;
8219 }
8220 return true;
8221 };
8222 SmallVector<unsigned> SortedIndices;
8223 BasicBlock *BB = nullptr;
8224 bool IsScatterVectorizeUserTE =
8225 UserTreeIdx.UserTE &&
8226 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8227 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
8228 bool AreScatterAllGEPSameBlock =
8229 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8230 VL.size() > 2 &&
8231 all_of(VL,
8232 [&BB](Value *V) {
8233 auto *I = dyn_cast<GetElementPtrInst>(V);
8234 if (!I)
8235 return doesNotNeedToBeScheduled(V);
8236 if (!BB)
8237 BB = I->getParent();
8238 return BB == I->getParent() && I->getNumOperands() == 2;
8239 }) &&
8240 BB &&
8241 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8242 SortedIndices));
8243 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8244 if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
8245 (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8246 S.getMainOp()) &&
8247        !all_of(VL, isVectorLikeInstWithConstOps)) ||
8248       NotProfitableForVectorization(VL)) {
8249 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8250 if (TryToFindDuplicates(S))
8251 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8252 ReuseShuffleIndices);
8253 return;
8254 }
8255
8256 // Don't vectorize ephemeral values.
8257 if (S.getOpcode() && !EphValues.empty()) {
8258 for (Value *V : VL) {
8259 if (EphValues.count(V)) {
8260 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8261 << ") is ephemeral.\n");
8262 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8263 return;
8264 }
8265 }
8266 }
8267
8268 // We now know that this is a vector of instructions of the same type from
8269 // the same block.
8270
8271 // Check that none of the instructions in the bundle are already in the tree.
8272 for (Value *V : VL) {
8273 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8274         doesNotNeedToBeScheduled(V))
8275       continue;
8276 if (getTreeEntry(V)) {
8277 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8278 << ") is already in tree.\n");
8279 if (TryToFindDuplicates(S))
8280 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8281 ReuseShuffleIndices);
8282 return;
8283 }
8284 }
8285
8286 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8287 if (UserIgnoreList && !UserIgnoreList->empty()) {
8288 for (Value *V : VL) {
8289 if (UserIgnoreList->contains(V)) {
8290 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8291 if (TryToFindDuplicates(S))
8292 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8293 ReuseShuffleIndices);
8294 return;
8295 }
8296 }
8297 }
8298
8299   // Special processing for sorted pointers for a ScatterVectorize node with
8300   // constant indices only.
8301 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8302 assert(VL.front()->getType()->isPointerTy() &&
8303 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8304 "Expected pointers only.");
8305 // Reset S to make it GetElementPtr kind of node.
8306 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8307 assert(It != VL.end() && "Expected at least one GEP.");
8308 S = getSameOpcode(*It, *TLI);
8309 }
8310
8311 // Check that all of the users of the scalars that we want to vectorize are
8312 // schedulable.
8313 Instruction *VL0 = S.getMainOp();
8314 BB = VL0->getParent();
8315
8316 if (S.getMainOp() &&
8317 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8318 !DT->isReachableFromEntry(BB))) {
8319 // Don't go into unreachable blocks. They may contain instructions with
8320 // dependency cycles which confuse the final scheduling.
8321 // Do not vectorize EH and non-returning blocks, not profitable in most
8322 // cases.
8323 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8324 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8325 return;
8326 }
8327
8328 // Check that every instruction appears once in this bundle.
8329 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8330 return;
8331
8332 // Perform specific checks for each particular instruction kind.
8333 OrdersType CurrentOrder;
8334 SmallVector<Value *> PointerOps;
8335 TreeEntry::EntryState State = getScalarsVectorizationState(
8336 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8337 if (State == TreeEntry::NeedToGather) {
8338 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8339 ReuseShuffleIndices);
8340 return;
8341 }
8342
8343 auto &BSRef = BlocksSchedules[BB];
8344 if (!BSRef)
8345 BSRef = std::make_unique<BlockScheduling>(BB);
8346
8347 BlockScheduling &BS = *BSRef;
8348
8349 std::optional<ScheduleData *> Bundle =
8350 BS.tryScheduleBundle(UniqueValues, this, S);
8351#ifdef EXPENSIVE_CHECKS
8352 // Make sure we didn't break any internal invariants
8353 BS.verify();
8354#endif
8355 if (!Bundle) {
8356 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8357 assert((!BS.getScheduleData(VL0) ||
8358 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8359 "tryScheduleBundle should cancelScheduling on failure");
8360 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8361 ReuseShuffleIndices);
8362 NonScheduledFirst.insert(VL.front());
8363 if (S.getOpcode() == Instruction::Load &&
8364         BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8365       registerNonVectorizableLoads(VL);
8366 return;
8367 }
8368 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8369
8370 unsigned ShuffleOrOp = S.isAltShuffle() ?
8371 (unsigned) Instruction::ShuffleVector : S.getOpcode();
8372 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8373 // Postpone PHI nodes creation
8374 SmallVector<unsigned> PHIOps;
8375 for (unsigned I : seq<unsigned>(Operands.size())) {
8376       ArrayRef<Value *> Op = Operands[I];
8377       if (Op.empty())
8378 continue;
8379 InstructionsState S = getSameOpcode(Op, *TLI);
8380 if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
8381 buildTree_rec(Op, Depth + 1, {TE, I});
8382 else
8383 PHIOps.push_back(I);
8384 }
8385 for (unsigned I : PHIOps)
8386 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8387 };
8388 switch (ShuffleOrOp) {
8389 case Instruction::PHI: {
8390 auto *PH = cast<PHINode>(VL0);
8391
8392 TreeEntry *TE =
8393 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8394 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
8395
8396 // Keeps the reordered operands to avoid code duplication.
8397 PHIHandler Handler(*DT, PH, VL);
8398 Handler.buildOperands();
8399 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8400 TE->setOperand(I, Handler.getOperands(I));
8401 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8402 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8403 Operands[I] = Handler.getOperands(I);
8404 CreateOperandNodes(TE, Operands);
8405 return;
8406 }
8407 case Instruction::ExtractValue:
8408 case Instruction::ExtractElement: {
8409 if (CurrentOrder.empty()) {
8410 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8411 } else {
8412 LLVM_DEBUG({
8413 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8414 "with order";
8415 for (unsigned Idx : CurrentOrder)
8416 dbgs() << " " << Idx;
8417 dbgs() << "\n";
8418 });
8419 fixupOrderingIndices(CurrentOrder);
8420 }
8421 // Insert new order with initial value 0, if it does not exist,
8422 // otherwise return the iterator to the existing one.
8423 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8424 ReuseShuffleIndices, CurrentOrder);
8425 // This is a special case, as it does not gather, but at the same time
8426 // we are not extending buildTree_rec() towards the operands.
8427 ValueList Op0;
8428 Op0.assign(VL.size(), VL0->getOperand(0));
8429 VectorizableTree.back()->setOperand(0, Op0);
8430 return;
8431 }
8432 case Instruction::InsertElement: {
8433 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8434
8435 auto OrdCompare = [](const std::pair<int, int> &P1,
8436 const std::pair<int, int> &P2) {
8437 return P1.first > P2.first;
8438 };
8439     PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8440                   decltype(OrdCompare)>
8441 Indices(OrdCompare);
8442 for (int I = 0, E = VL.size(); I < E; ++I) {
8443 unsigned Idx = *getElementIndex(VL[I]);
8444 Indices.emplace(Idx, I);
8445 }
8446 OrdersType CurrentOrder(VL.size(), VL.size());
8447 bool IsIdentity = true;
8448 for (int I = 0, E = VL.size(); I < E; ++I) {
8449 CurrentOrder[Indices.top().second] = I;
8450 IsIdentity &= Indices.top().second == I;
8451 Indices.pop();
8452 }
8453 if (IsIdentity)
8454 CurrentOrder.clear();
8455 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8456 {}, CurrentOrder);
8457 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
8458
8459 TE->setOperand(*this);
8460 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8461 return;
8462 }
8463 case Instruction::Load: {
8464 // Check that a vectorized load would load the same memory as a scalar
8465 // load. For example, we don't want to vectorize loads that are smaller
8466 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8467 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8468 // from such a struct, we read/write packed bits disagreeing with the
8469 // unvectorized version.
8470 TreeEntry *TE = nullptr;
8471 fixupOrderingIndices(CurrentOrder);
8472 switch (State) {
8473 case TreeEntry::Vectorize:
8474 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8475 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8476 if (CurrentOrder.empty())
8477 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
8478 else
8479 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
8480 break;
8481 case TreeEntry::StridedVectorize:
8482       // Vectorizing non-consecutive loads as strided loads.
8483 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8484 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8485 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
8486 break;
8487 case TreeEntry::ScatterVectorize:
8488 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8489 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8490 UserTreeIdx, ReuseShuffleIndices);
8491 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
8492 break;
8493 case TreeEntry::CombinedVectorize:
8494 case TreeEntry::NeedToGather:
8495 llvm_unreachable("Unexpected loads state.");
8496 }
8497 TE->setOperand(*this);
8498 if (State == TreeEntry::ScatterVectorize)
8499 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8500 return;
8501 }
8502 case Instruction::ZExt:
8503 case Instruction::SExt:
8504 case Instruction::FPToUI:
8505 case Instruction::FPToSI:
8506 case Instruction::FPExt:
8507 case Instruction::PtrToInt:
8508 case Instruction::IntToPtr:
8509 case Instruction::SIToFP:
8510 case Instruction::UIToFP:
8511 case Instruction::Trunc:
8512 case Instruction::FPTrunc:
8513 case Instruction::BitCast: {
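      // Track the widest destination and narrowest source bit width seen for
      // zext/sext (and the inverse for trunc) in CastMaxMinBWSizes, and record
      // operand nodes with enough known sign bits in ExtraBitWidthNodes so they
      // can be revisited by the minimal bit-width analysis.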
8514 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8515 std::make_pair(std::numeric_limits<unsigned>::min(),
8516 std::numeric_limits<unsigned>::max()));
8517 if (ShuffleOrOp == Instruction::ZExt ||
8518 ShuffleOrOp == Instruction::SExt) {
8519 CastMaxMinBWSizes = std::make_pair(
8520 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8521 PrevMaxBW),
8522 std::min<unsigned>(
8523 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8524 PrevMinBW));
8525 } else if (ShuffleOrOp == Instruction::Trunc) {
8526 CastMaxMinBWSizes = std::make_pair(
8527 std::max<unsigned>(
8528 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8529 PrevMaxBW),
8530 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8531 PrevMinBW));
8532 }
8533 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8534 ReuseShuffleIndices);
8535 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
8536
8537 TE->setOperand(*this);
8538 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8539 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8540 if (ShuffleOrOp == Instruction::Trunc) {
8541 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8542 } else if (ShuffleOrOp == Instruction::SIToFP ||
8543 ShuffleOrOp == Instruction::UIToFP) {
8544 unsigned NumSignBits =
8545 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8546 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8547 APInt Mask = DB->getDemandedBits(OpI);
8548 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8549 }
8550 if (NumSignBits * 2 >=
8551 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8552 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8553 }
8554 return;
8555 }
8556 case Instruction::ICmp:
8557 case Instruction::FCmp: {
8558 // Check that all of the compares have the same predicate.
8559 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8560 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8561 ReuseShuffleIndices);
8562 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
8563
8564       ValueList Left, Right;
8565       VLOperands Ops(VL, VL0, *this);
8566 if (cast<CmpInst>(VL0)->isCommutative()) {
8567 // Commutative predicate - collect + sort operands of the instructions
8568 // so that each side is more likely to have the same opcode.
8569         assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8570                "Commutative Predicate mismatch");
8571 Ops.reorder();
8572 Left = Ops.getVL(0);
8573 Right = Ops.getVL(1);
8574 } else {
8575 // Collect operands - commute if it uses the swapped predicate.
8576 for (Value *V : VL) {
8577 if (isa<PoisonValue>(V)) {
8578 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8579 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8580 continue;
8581 }
8582 auto *Cmp = cast<CmpInst>(V);
8583 Value *LHS = Cmp->getOperand(0);
8584 Value *RHS = Cmp->getOperand(1);
8585 if (Cmp->getPredicate() != P0)
8586 std::swap(LHS, RHS);
8587 Left.push_back(LHS);
8588 Right.push_back(RHS);
8589 }
8590 }
8591 TE->setOperand(0, Left);
8592 TE->setOperand(1, Right);
8593 buildTree_rec(Left, Depth + 1, {TE, 0});
8594 buildTree_rec(Right, Depth + 1, {TE, 1});
8595 if (ShuffleOrOp == Instruction::ICmp) {
8596 unsigned NumSignBits0 =
8597 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8598 if (NumSignBits0 * 2 >=
8599 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8600 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8601 unsigned NumSignBits1 =
8602 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8603 if (NumSignBits1 * 2 >=
8604 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8605 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8606 }
8607 return;
8608 }
8609 case Instruction::Select:
8610 case Instruction::FNeg:
8611 case Instruction::Add:
8612 case Instruction::FAdd:
8613 case Instruction::Sub:
8614 case Instruction::FSub:
8615 case Instruction::Mul:
8616 case Instruction::FMul:
8617 case Instruction::UDiv:
8618 case Instruction::SDiv:
8619 case Instruction::FDiv:
8620 case Instruction::URem:
8621 case Instruction::SRem:
8622 case Instruction::FRem:
8623 case Instruction::Shl:
8624 case Instruction::LShr:
8625 case Instruction::AShr:
8626 case Instruction::And:
8627 case Instruction::Or:
8628 case Instruction::Xor:
8629 case Instruction::Freeze: {
8630 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8631 ReuseShuffleIndices);
8632 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
8633
8634 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8635 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8636 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8637 return;
8638 }
8639 case Instruction::GetElementPtr: {
8640 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8641 ReuseShuffleIndices);
8642 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
8643       SmallVector<ValueList, 2> Operands(2);
8644       // Prepare the operand vector for pointer operands.
8645 for (Value *V : VL) {
8646 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8647 if (!GEP) {
8648 Operands.front().push_back(V);
8649 continue;
8650 }
8651 Operands.front().push_back(GEP->getPointerOperand());
8652 }
8653 TE->setOperand(0, Operands.front());
8654 // Need to cast all indices to the same type before vectorization to
8655       // avoid a crash.
8656 // Required to be able to find correct matches between different gather
8657 // nodes and reuse the vectorized values rather than trying to gather them
8658 // again.
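      // E.g., if the GEPs mix i32 and i64 indices, the common type falls back
      // to the target's pointer index type; constant indices are folded to that
      // type here, while non-constant indices are kept as-is.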
8659 int IndexIdx = 1;
8660 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8661 Type *Ty = all_of(VL,
8662 [VL0Ty, IndexIdx](Value *V) {
8663 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8664 if (!GEP)
8665 return true;
8666 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8667 })
8668 ? VL0Ty
8669 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8670 ->getPointerOperandType()
8671 ->getScalarType());
8672 // Prepare the operand vector.
8673 for (Value *V : VL) {
8674 auto *I = dyn_cast<GetElementPtrInst>(V);
8675 if (!I) {
8676 Operands.back().push_back(
8677 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8678 continue;
8679 }
8680 auto *Op = I->getOperand(IndexIdx);
8681 auto *CI = dyn_cast<ConstantInt>(Op);
8682 if (!CI)
8683 Operands.back().push_back(Op);
8684 else
8685 Operands.back().push_back(ConstantFoldIntegerCast(
8686 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8687 }
8688 TE->setOperand(IndexIdx, Operands.back());
8689
8690 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8691 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8692 return;
8693 }
8694 case Instruction::Store: {
8695 bool Consecutive = CurrentOrder.empty();
8696 if (!Consecutive)
8697 fixupOrderingIndices(CurrentOrder);
8698 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8699 ReuseShuffleIndices, CurrentOrder);
8700 TE->setOperand(*this);
8701 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8702 if (Consecutive)
8703 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
8704 else
8705 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
8706 return;
8707 }
8708 case Instruction::Call: {
8709 // Check if the calls are all to the same vectorizable intrinsic or
8710 // library function.
8711 CallInst *CI = cast<CallInst>(VL0);
8712       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8713
8714 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8715 ReuseShuffleIndices);
8716 TE->setOperand(*this, isCommutative(VL0));
8717 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8718         // For scalar operands there is no need to create an entry, since
8719         // they are not vectorized.
8720         if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
8721           continue;
8722 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8723 }
8724 return;
8725 }
8726 case Instruction::ShuffleVector: {
8727 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8728 ReuseShuffleIndices);
8729 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
8730
8731 // Reorder operands if reordering would enable vectorization.
8732 auto *CI = dyn_cast<CmpInst>(VL0);
8733 if (CI && any_of(VL, [](Value *V) {
8734 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8735 })) {
8736 auto *MainCI = cast<CmpInst>(S.getMainOp());
8737 auto *AltCI = cast<CmpInst>(S.getAltOp());
8738 CmpInst::Predicate MainP = MainCI->getPredicate();
8739 CmpInst::Predicate AltP = AltCI->getPredicate();
8740 assert(MainP != AltP &&
8741 "Expected different main/alternate predicates.");
8742       ValueList Left, Right;
8743       // Collect operands - commute if it uses the swapped predicate or
8744 // alternate operation.
8745 for (Value *V : VL) {
8746 if (isa<PoisonValue>(V)) {
8747 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8748 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8749 continue;
8750 }
8751 auto *Cmp = cast<CmpInst>(V);
8752 Value *LHS = Cmp->getOperand(0);
8753 Value *RHS = Cmp->getOperand(1);
8754
8755 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8756 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8757 std::swap(LHS, RHS);
8758 } else {
8759 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8760 std::swap(LHS, RHS);
8761 }
8762 Left.push_back(LHS);
8763 Right.push_back(RHS);
8764 }
8765 TE->setOperand(0, Left);
8766 TE->setOperand(1, Right);
8767 buildTree_rec(Left, Depth + 1, {TE, 0});
8768 buildTree_rec(Right, Depth + 1, {TE, 1});
8769 return;
8770 }
8771
8772 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8773 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8774 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8775 return;
8776 }
8777 default:
8778 break;
8779 }
8780 llvm_unreachable("Unexpected vectorization of the instructions.");
8781}
8782
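/// Returns the number of scalar elements to which the homogeneous aggregate or
/// vector type \p T can be flattened (e.g., {float, float} -> 2,
/// [2 x <2 x i32>] -> 4), or 0 if \p T cannot be mapped to a vector with a
/// valid element type that fits the vector register size constraints.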
8783unsigned BoUpSLP::canMapToVector(Type *T) const {
8784  unsigned N = 1;
8785 Type *EltTy = T;
8786
8787 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8788 if (EltTy->isEmptyTy())
8789 return 0;
8790 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8791 // Check that struct is homogeneous.
8792 for (const auto *Ty : ST->elements())
8793 if (Ty != *ST->element_begin())
8794 return 0;
8795 N *= ST->getNumElements();
8796 EltTy = *ST->element_begin();
8797 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8798 N *= AT->getNumElements();
8799 EltTy = AT->getElementType();
8800 } else {
8801 auto *VT = cast<FixedVectorType>(EltTy);
8802 N *= VT->getNumElements();
8803 EltTy = VT->getElementType();
8804 }
8805 }
8806
8807 if (!isValidElementType(EltTy))
8808 return 0;
8809 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8810 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8811 VTSize != DL->getTypeStoreSizeInBits(T))
8812 return 0;
8813 return N;
8814}
8815
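/// Example: for extracts of lanes <0, 1, 2, 3> from a single source vector this
/// function clears CurrentOrder and returns true (the extracts can be reused in
/// place); for lanes <1, 0, 3, 2> it returns false but leaves
/// CurrentOrder = {1, 0, 3, 2} so the caller can still vectorize with a
/// reordering.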
8816bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
8817 SmallVectorImpl<unsigned> &CurrentOrder,
8818 bool ResizeAllowed) const {
8819 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8820 assert(It != VL.end() && "Expected at least one extract instruction.");
8821 auto *E0 = cast<Instruction>(*It);
8822 assert(
8823 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8824 "Invalid opcode");
8825 // Check if all of the extracts come from the same vector and from the
8826 // correct offset.
8827 Value *Vec = E0->getOperand(0);
8828
8829 CurrentOrder.clear();
8830
8831 // We have to extract from a vector/aggregate with the same number of elements.
8832 unsigned NElts;
8833 if (E0->getOpcode() == Instruction::ExtractValue) {
8834 NElts = canMapToVector(Vec->getType());
8835 if (!NElts)
8836 return false;
8837 // Check if load can be rewritten as load of vector.
8838 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8839 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8840 return false;
8841 } else {
8842 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8843 }
8844
8845 unsigned E = VL.size();
8846 if (!ResizeAllowed && NElts != E)
8847 return false;
8848 SmallVector<int> Indices(E, PoisonMaskElem);
8849 unsigned MinIdx = NElts, MaxIdx = 0;
8850 for (auto [I, V] : enumerate(VL)) {
8851 auto *Inst = dyn_cast<Instruction>(V);
8852 if (!Inst)
8853 continue;
8854 if (Inst->getOperand(0) != Vec)
8855 return false;
8856 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8857 if (isa<UndefValue>(EE->getIndexOperand()))
8858 continue;
8859 std::optional<unsigned> Idx = getExtractIndex(Inst);
8860 if (!Idx)
8861 return false;
8862 const unsigned ExtIdx = *Idx;
8863 if (ExtIdx >= NElts)
8864 continue;
8865 Indices[I] = ExtIdx;
8866 if (MinIdx > ExtIdx)
8867 MinIdx = ExtIdx;
8868 if (MaxIdx < ExtIdx)
8869 MaxIdx = ExtIdx;
8870 }
8871 if (MaxIdx - MinIdx + 1 > E)
8872 return false;
8873 if (MaxIdx + 1 <= E)
8874 MinIdx = 0;
8875
8876 // Check that all of the indices extract from the correct offset.
8877 bool ShouldKeepOrder = true;
8878   // Assign to all items the initial value E so we can check whether the
8879   // extract instruction index was already used.
8880   // Also, later we can check that all the indices are used and that we have
8881   // a consecutive access in the extract instructions, by checking that no
8882   // element of CurrentOrder still has the value E.
8883 CurrentOrder.assign(E, E);
8884 for (unsigned I = 0; I < E; ++I) {
8885 if (Indices[I] == PoisonMaskElem)
8886 continue;
8887 const unsigned ExtIdx = Indices[I] - MinIdx;
8888 if (CurrentOrder[ExtIdx] != E) {
8889 CurrentOrder.clear();
8890 return false;
8891 }
8892 ShouldKeepOrder &= ExtIdx == I;
8893 CurrentOrder[ExtIdx] = I;
8894 }
8895 if (ShouldKeepOrder)
8896 CurrentOrder.clear();
8897
8898 return ShouldKeepOrder;
8899}
8900
8901bool BoUpSLP::areAllUsersVectorized(
8902 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
8903 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
8904 all_of(I->users(), [this](User *U) {
8905 return ScalarToTreeEntry.contains(U) ||
8906 isVectorLikeInstWithConstOps(U) ||
8907 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8908 });
8909}
8910
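/// Computes, for a call that is a candidate for vectorization, both the cost of
/// lowering it as a vector intrinsic and the cost of calling a matching vector
/// library function from the VFDatabase (if any), so the caller can pick the
/// cheaper lowering.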
8911static std::pair<InstructionCost, InstructionCost>
8912getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
8913                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8914                   ArrayRef<Type *> ArgTys) {
8915  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8916
8917 // Calculate the cost of the scalar and vector calls.
8918 FastMathFlags FMF;
8919 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
8920 FMF = FPCI->getFastMathFlags();
8921  SmallVector<const Value *> Arguments(CI->args());
8922  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
8923 dyn_cast<IntrinsicInst>(CI));
8924 auto IntrinsicCost =
8925      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
8926
8927 auto Shape = VFShape::get(CI->getFunctionType(),
8928                            ElementCount::getFixed(VecTy->getNumElements()),
8929                            false /*HasGlobalPred*/);
8930 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
8931 auto LibCost = IntrinsicCost;
8932 if (!CI->isNoBuiltin() && VecFunc) {
8933 // Calculate the cost of the vector library call.
8934 // If the corresponding vector call is cheaper, return its cost.
8935 LibCost =
8936 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
8937 }
8938 return {IntrinsicCost, LibCost};
8939}
8940
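/// Example (assuming empty ReorderIndices and ReuseShuffleIndices): for
/// Scalars = {add0, sub1, add2, sub3} (Sz == 4) with IsAltOp matching the
/// subtracts, the resulting Mask is {0, 5, 2, 7}: even lanes are taken from the
/// main-opcode vector and odd lanes from the alternate vector.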
8941void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8942 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
8943 SmallVectorImpl<Value *> *OpScalars,
8944 SmallVectorImpl<Value *> *AltScalars) const {
8945 unsigned Sz = Scalars.size();
8946 Mask.assign(Sz, PoisonMaskElem);
8947 SmallVector<int> OrderMask;
8948 if (!ReorderIndices.empty())
8949 inversePermutation(ReorderIndices, OrderMask);
8950 for (unsigned I = 0; I < Sz; ++I) {
8951 unsigned Idx = I;
8952 if (!ReorderIndices.empty())
8953 Idx = OrderMask[I];
8954 if (isa<PoisonValue>(Scalars[Idx]))
8955 continue;
8956 auto *OpInst = cast<Instruction>(Scalars[Idx]);
8957 if (IsAltOp(OpInst)) {
8958 Mask[I] = Sz + Idx;
8959 if (AltScalars)
8960 AltScalars->push_back(OpInst);
8961 } else {
8962 Mask[I] = Idx;
8963 if (OpScalars)
8964 OpScalars->push_back(OpInst);
8965 }
8966 }
8967 if (!ReuseShuffleIndices.empty()) {
8968 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
8969 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
8970 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
8971 });
8972 Mask.swap(NewMask);
8973 }
8974}
8975
8976static bool isAlternateInstruction(const Instruction *I,
8977                                   const Instruction *MainOp,
8978 const Instruction *AltOp,
8979 const TargetLibraryInfo &TLI) {
8980 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
8981 auto *AltCI = cast<CmpInst>(AltOp);
8982 CmpInst::Predicate MainP = MainCI->getPredicate();
8983 CmpInst::Predicate AltP = AltCI->getPredicate();
8984 assert(MainP != AltP && "Expected different main/alternate predicates.");
8985 auto *CI = cast<CmpInst>(I);
8986 if (isCmpSameOrSwapped(MainCI, CI, TLI))
8987 return false;
8988 if (isCmpSameOrSwapped(AltCI, CI, TLI))
8989 return true;
8990 CmpInst::Predicate P = CI->getPredicate();
8991  CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
8992
8993 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
8994 "CmpInst expected to match either main or alternate predicate or "
8995 "their swap.");
8996 (void)AltP;
8997 return MainP != P && MainP != SwappedP;
8998 }
8999 return I->getOpcode() == AltOp->getOpcode();
9000}
9001
9002TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9003 assert(!Ops.empty());
9004 const auto *Op0 = Ops.front();
9005
9006 const bool IsConstant = all_of(Ops, [](Value *V) {
9007 // TODO: We should allow undef elements here
9008 return isConstant(V) && !isa<UndefValue>(V);
9009 });
9010 const bool IsUniform = all_of(Ops, [=](Value *V) {
9011 // TODO: We should allow undef elements here
9012 return V == Op0;
9013 });
9014 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9015 // TODO: We should allow undef elements here
9016 if (auto *CI = dyn_cast<ConstantInt>(V))
9017 return CI->getValue().isPowerOf2();
9018 return false;
9019 });
9020 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9021 // TODO: We should allow undef elements here
9022 if (auto *CI = dyn_cast<ConstantInt>(V))
9023 return CI->getValue().isNegatedPowerOf2();
9024 return false;
9025 });
9026
9027  TTI::OperandValueKind VK = TTI::OK_AnyValue;
9028  if (IsConstant && IsUniform)
9029    VK = TTI::OK_UniformConstantValue;
9030  else if (IsConstant)
9031    VK = TTI::OK_NonUniformConstantValue;
9032  else if (IsUniform)
9033    VK = TTI::OK_UniformValue;
9034
9035  TTI::OperandValueProperties VP = TTI::OP_None;
9036 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9037 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9038
9039 return {VK, VP};
9040}
9041
9042namespace {
9043/// The base class for shuffle instruction emission and shuffle cost estimation.
9044class BaseShuffleAnalysis {
9045protected:
9046 Type *ScalarTy = nullptr;
9047
9048 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9049
9050 /// V is expected to be a vectorized value.
9051 /// When REVEC is disabled, there is no difference between VF and
9052 /// VNumElements.
9053 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9054 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9055 /// of 8.
9056 unsigned getVF(Value *V) const {
9057 assert(V && "V cannot be nullptr");
9058 assert(isa<FixedVectorType>(V->getType()) &&
9059 "V does not have FixedVectorType");
9060 assert(ScalarTy && "ScalarTy cannot be nullptr");
9061 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9062 unsigned VNumElements =
9063 cast<FixedVectorType>(V->getType())->getNumElements();
9064 assert(VNumElements > ScalarTyNumElements &&
9065 "the number of elements of V is not large enough");
9066 assert(VNumElements % ScalarTyNumElements == 0 &&
9067 "the number of elements of V is not a vectorized value");
9068 return VNumElements / ScalarTyNumElements;
9069 }
9070
9071 /// Checks if the mask is an identity mask.
9072   /// \param IsStrict if it is true, the function returns false if the mask
9073   /// size does not match the vector size.
9074 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9075 bool IsStrict) {
9076 int Limit = Mask.size();
9077 int VF = VecTy->getNumElements();
9078 int Index = -1;
9079 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9080 return true;
9081 if (!IsStrict) {
9082 // Consider extract subvector starting from index 0.
9083       if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9084           Index == 0)
9085 return true;
9086 // All VF-size submasks are identity (e.g.
9087 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9088 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9089 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9090 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9091                    ShuffleVectorInst::isIdentityMask(Slice, VF);
9092           }))
9093 return true;
9094 }
9095 return false;
9096 }
9097
9098   /// Tries to combine 2 different masks into a single one.
9099 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9100 /// change the size of the vector, \p LocalVF is the original size of the
9101 /// shuffled vector.
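  /// For example, composing Mask = <1, 0, 3, 2> with ExtMask = <2, 3, 0, 1>
  /// (LocalVF == 4) yields the combined mask <3, 2, 1, 0>.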
9102 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9103 ArrayRef<int> ExtMask) {
9104 unsigned VF = Mask.size();
9105 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9106 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9107 if (ExtMask[I] == PoisonMaskElem)
9108 continue;
9109 int MaskedIdx = Mask[ExtMask[I] % VF];
9110 NewMask[I] =
9111 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9112 }
9113 Mask.swap(NewMask);
9114 }
9115
9116   /// Looks through shuffles, trying to reduce the final number of shuffles in
9117   /// the code. The function looks through the previously emitted shuffle
9118   /// instructions and properly marks indices in the mask as undef.
9119 /// For example, given the code
9120 /// \code
9121 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9122 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9123 /// \endcode
9124   /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9125 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9126 /// <0, 1, 2, 3> for the shuffle.
9127 /// If 2 operands are of different size, the smallest one will be resized and
9128 /// the mask recalculated properly.
9129 /// For example, given the code
9130 /// \code
9131 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9132 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9133 /// \endcode
9134   /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9135 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9136 /// <0, 1, 2, 3> for the shuffle.
9137   /// So, it tries to transform permutations into a simple vector merge, if
9138 /// possible.
9139 /// \param V The input vector which must be shuffled using the given \p Mask.
9140 /// If the better candidate is found, \p V is set to this best candidate
9141 /// vector.
9142 /// \param Mask The input mask for the shuffle. If the best candidate is found
9143 /// during looking-through-shuffles attempt, it is updated accordingly.
9144 /// \param SinglePermute true if the shuffle operation is originally a
9145 /// single-value-permutation. In this case the look-through-shuffles procedure
9146 /// may look for resizing shuffles as the best candidates.
9147 /// \return true if the shuffle results in the non-resizing identity shuffle
9148 /// (and thus can be ignored), false - otherwise.
9149 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9150 bool SinglePermute) {
9151 Value *Op = V;
9152 ShuffleVectorInst *IdentityOp = nullptr;
9153 SmallVector<int> IdentityMask;
9154 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9155 // Exit if not a fixed vector type or changing size shuffle.
9156 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9157 if (!SVTy)
9158 break;
9159 // Remember the identity or broadcast mask, if it is not a resizing
9160 // shuffle. If no better candidates are found, this Op and Mask will be
9161 // used in the final shuffle.
9162 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9163 if (!IdentityOp || !SinglePermute ||
9164 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9165 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9166 IdentityMask.size()))) {
9167 IdentityOp = SV;
9168 // Store the current mask in IdentityMask so that we do not lose this
9169 // info later if IdentityOp is selected as the best candidate for the
9170 // permutation.
9171 IdentityMask.assign(Mask);
9172 }
9173 }
9174 // Remember the broadcast mask. If no better candidates are found, this Op
9175 // and Mask will be used in the final shuffle.
9176 // Zero splat can be used as identity too, since it might be used with
9177 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9178 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9179 // expensive, and the analysis finds out that the source vector is just a
9180 // broadcast, the original mask can be transformed to the identity mask <0,
9181 // 1, 2, 3>.
9182 // \code
9183 // %0 = shuffle %v, poison, zeroinitializer
9184 // %res = shuffle %0, poison, <3, 1, 2, 0>
9185 // \endcode
9186 // may be transformed to
9187 // \code
9188 // %0 = shuffle %v, poison, zeroinitializer
9189 // %res = shuffle %0, poison, <0, 1, 2, 3>
9190 // \endcode
9191 if (SV->isZeroEltSplat()) {
9192 IdentityOp = SV;
9193 IdentityMask.assign(Mask);
9194 }
9195 int LocalVF = Mask.size();
9196 if (auto *SVOpTy =
9197 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9198 LocalVF = SVOpTy->getNumElements();
9199 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9200 for (auto [Idx, I] : enumerate(Mask)) {
9201 if (I == PoisonMaskElem ||
9202 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9203 continue;
9204 ExtMask[Idx] = SV->getMaskValue(I);
9205 }
9206 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9207 SV->getOperand(0),
9208 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9209 .all();
9210 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9211 SV->getOperand(1),
9212 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9213 .all();
9214 if (!IsOp1Undef && !IsOp2Undef) {
9215 // Update mask and mark undef elems.
9216 for (int &I : Mask) {
9217 if (I == PoisonMaskElem)
9218 continue;
9219 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9220 PoisonMaskElem)
9221 I = PoisonMaskElem;
9222 }
9223 break;
9224 }
9225 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9226 combineMasks(LocalVF, ShuffleMask, Mask);
9227 Mask.swap(ShuffleMask);
9228 if (IsOp2Undef)
9229 Op = SV->getOperand(0);
9230 else
9231 Op = SV->getOperand(1);
9232 }
9233 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9234 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9235 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9236 if (IdentityOp) {
9237 V = IdentityOp;
9238 assert(Mask.size() == IdentityMask.size() &&
9239 "Expected masks of same sizes.");
9240 // Clear known poison elements.
9241 for (auto [I, Idx] : enumerate(Mask))
9242 if (Idx == PoisonMaskElem)
9243 IdentityMask[I] = PoisonMaskElem;
9244 Mask.swap(IdentityMask);
9245 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9246 return SinglePermute &&
9247 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9248 /*IsStrict=*/true) ||
9249 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9250 Shuffle->isZeroEltSplat() &&
9251 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9252 }
9253 V = Op;
9254 return false;
9255 }
9256 V = Op;
9257 return true;
9258 }
9259
9260 /// Smart shuffle instruction emission, walks through shuffles trees and
9261 /// tries to find the best matching vector for the actual shuffle
9262 /// instruction.
9263 template <typename T, typename ShuffleBuilderTy>
9264 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9265 ShuffleBuilderTy &Builder) {
9266 assert(V1 && "Expected at least one vector value.");
9267 if (V2)
9268 Builder.resizeToMatch(V1, V2);
9269 int VF = Mask.size();
9270 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9271 VF = FTy->getNumElements();
9272 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9273 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9274 .all()) {
9275 // Peek through shuffles.
9276 Value *Op1 = V1;
9277 Value *Op2 = V2;
9278 int VF =
9279 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9280 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9281 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9282 for (int I = 0, E = Mask.size(); I < E; ++I) {
9283 if (Mask[I] < VF)
9284 CombinedMask1[I] = Mask[I];
9285 else
9286 CombinedMask2[I] = Mask[I] - VF;
9287 }
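// Editorial example: with two 4-element inputs (VF = 4) and Mask = <1, 0, 5, 4>,
// the loop above produces CombinedMask1 = <1, 0, poison, poison> (lanes taken
// from the first input) and CombinedMask2 = <poison, poison, 1, 0> (lanes taken
// from the second input, rebased by VF), so each operand can be peeked through
// independently below.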
9288 Value *PrevOp1;
9289 Value *PrevOp2;
9290 do {
9291 PrevOp1 = Op1;
9292 PrevOp2 = Op2;
9293 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9294 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9295 // Check if we have 2 resizing shuffles - need to peek through operands
9296 // again.
9297 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9298 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9299 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9300 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9301 if (I == PoisonMaskElem)
9302 continue;
9303 ExtMask1[Idx] = SV1->getMaskValue(I);
9304 }
9305 SmallBitVector UseMask1 = buildUseMask(
9306 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9307 ->getNumElements(),
9308 ExtMask1, UseMask::SecondArg);
9309 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9310 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9311 if (I == PoisonMaskElem)
9312 continue;
9313 ExtMask2[Idx] = SV2->getMaskValue(I);
9314 }
9315 SmallBitVector UseMask2 = buildUseMask(
9316 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9317 ->getNumElements(),
9318 ExtMask2, UseMask::SecondArg);
9319 if (SV1->getOperand(0)->getType() ==
9320 SV2->getOperand(0)->getType() &&
9321 SV1->getOperand(0)->getType() != SV1->getType() &&
9322 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9323 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9324 Op1 = SV1->getOperand(0);
9325 Op2 = SV2->getOperand(0);
9326 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9327 int LocalVF = ShuffleMask1.size();
9328 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9329 LocalVF = FTy->getNumElements();
9330 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9331 CombinedMask1.swap(ShuffleMask1);
9332 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9333 LocalVF = ShuffleMask2.size();
9334 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9335 LocalVF = FTy->getNumElements();
9336 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9337 CombinedMask2.swap(ShuffleMask2);
9338 }
9339 }
9340 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9341 Builder.resizeToMatch(Op1, Op2);
9342 VF = std::max(cast<VectorType>(Op1->getType())
9343 ->getElementCount()
9344 .getKnownMinValue(),
9345 cast<VectorType>(Op2->getType())
9346 ->getElementCount()
9347 .getKnownMinValue());
9348 for (int I = 0, E = Mask.size(); I < E; ++I) {
9349 if (CombinedMask2[I] != PoisonMaskElem) {
9350 assert(CombinedMask1[I] == PoisonMaskElem &&
9351 "Expected undefined mask element");
9352 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9353 }
9354 }
9355 if (Op1 == Op2 &&
9356 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9357 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9358 isa<ShuffleVectorInst>(Op1) &&
9359 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9360 ArrayRef(CombinedMask1))))
9361 return Builder.createIdentity(Op1);
9362 return Builder.createShuffleVector(
9363 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9364 CombinedMask1);
9365 }
9366 if (isa<PoisonValue>(V1))
9367 return Builder.createPoison(
9368 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9369 SmallVector<int> NewMask(Mask);
9370 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9371 assert(V1 && "Expected non-null value after looking through shuffles.");
9372
9373 if (!IsIdentity)
9374 return Builder.createShuffleVector(V1, NewMask);
9375 return Builder.createIdentity(V1);
9376 }
9377};
9378} // namespace
9379
9380/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9381static std::pair<InstructionCost, InstructionCost>
9382 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9383 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9384 Type *ScalarTy, VectorType *VecTy) {
9385 InstructionCost ScalarCost = 0;
9386 InstructionCost VecCost = 0;
9387 // Here we differentiate two cases: (1) when Ptrs represent a regular
9388 // vectorization tree node (as they are pointer arguments of scattered
9389 // loads) or (2) when Ptrs are the arguments of loads or stores being
9390 // vectorized as a plain wide unit-stride load/store since all the
9391 // loads/stores are known to be from/to adjacent locations.
9392 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9393 // Case 2: estimate costs for pointer related costs when vectorizing to
9394 // a wide load/store.
9395 // Scalar cost is estimated as a set of pointers with known relationship
9396 // between them.
9397 // For vector code we will use BasePtr as argument for the wide load/store
9398 // but we also need to account all the instructions which are going to
9399 // stay in vectorized code due to uses outside of these scalar
9400 // loads/stores.
9401 ScalarCost = TTI.getPointersChainCost(
9402 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9403 CostKind);
9404
9405 SmallVector<const Value *> PtrsRetainedInVecCode;
9406 for (Value *V : Ptrs) {
9407 if (V == BasePtr) {
9408 PtrsRetainedInVecCode.push_back(V);
9409 continue;
9410 }
9411 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9412 // For simplicity, assume Ptr stays in vectorized code if it's not a
9413 // GEP instruction. We don't care since its cost is considered free.
9414 // TODO: We should check for any uses outside of vectorizable tree
9415 // rather than just single use.
9416 if (!Ptr || !Ptr->hasOneUse())
9417 PtrsRetainedInVecCode.push_back(V);
9418 }
9419
9420 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9421 // If all pointers stay in vectorized code then we don't have
9422 // any savings on that.
9423 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9424 }
9425 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9426 TTI::PointersChainInfo::getKnownStride(),
9427 VecTy, CostKind);
9428 } else {
9429 // Case 1: Ptrs are the arguments of loads that we are going to transform
9430 // into masked gather load intrinsic.
9431 // All the scalar GEPs will be removed as a result of vectorization.
9432 // For any external uses of some lanes extract element instructions will
9433 // be generated (which cost is estimated separately).
9434 TTI::PointersChainInfo PtrsInfo =
9435 all_of(Ptrs,
9436 [](const Value *V) {
9437 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9438 return Ptr && !Ptr->hasAllConstantIndices();
9439 })
9440 ? TTI::PointersChainInfo::getUnknownStride()
9441 : TTI::PointersChainInfo::getKnownStride();
9442
9443 ScalarCost =
9444 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9445 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9446 if (!BaseGEP) {
9447 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9448 if (It != Ptrs.end())
9449 BaseGEP = cast<GEPOperator>(*It);
9450 }
9451 if (BaseGEP) {
9452 SmallVector<const Value *> Indices(BaseGEP->indices());
9453 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9454 BaseGEP->getPointerOperand(), Indices, VecTy,
9455 CostKind);
9456 }
9457 }
9458
9459 return std::make_pair(ScalarCost, VecCost);
9460}
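// Editorial note with a hypothetical example: for four loads from %base,
// %base+1, %base+2, %base+3 that become one wide load (case 2 above), the
// scalar cost covers the whole pointer chain feeding the scalar loads, while
// the vector cost only accounts for pointers that must stay around (here just
// %base), so the pair usually reports a saving. For a masked-gather candidate
// (case 1), all scalar GEPs are expected to disappear and the vector side is
// charged a single GEP computing the gather's base pointer.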
9461
9462void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9463 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9464 "Expected gather node without reordering.");
9465 SmallDenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
9466 SmallSet<size_t, 2> LoadKeyUsed;
9467
9468 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
9469 // or all instructions already have the same opcode.
9470 if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
9471 all_of(TE.Scalars, isConstant))
9472 return;
9473
9474 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9475 return VectorizableTree[Idx]->isSame(TE.Scalars);
9476 }))
9477 return;
9478
9479 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9480 Key = hash_combine(hash_value(LI->getParent()), Key);
9481 Value *Ptr =
9482 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9483 if (LoadKeyUsed.contains(Key)) {
9484 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9485 if (LIt != LoadsMap.end()) {
9486 for (LoadInst *RLI : LIt->second) {
9487 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9488 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9489 /*StrictCheck=*/true))
9490 return hash_value(RLI->getPointerOperand());
9491 }
9492 for (LoadInst *RLI : LIt->second) {
9493 if (arePointersCompatible(RLI->getPointerOperand(),
9494 LI->getPointerOperand(), *TLI)) {
9495 hash_code SubKey = hash_value(RLI->getPointerOperand());
9496 return SubKey;
9497 }
9498 }
9499 if (LIt->second.size() > 2) {
9500 hash_code SubKey =
9501 hash_value(LIt->second.back()->getPointerOperand());
9502 return SubKey;
9503 }
9504 }
9505 }
9506 LoadKeyUsed.insert(Key);
9507 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9508 return hash_value(LI->getPointerOperand());
9509 };
9510 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9511 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
9512 bool IsOrdered = true;
9513 unsigned NumInstructions = 0;
9514 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9515 // nodes.
9516 for (auto [I, V] : enumerate(TE.Scalars)) {
9517 size_t Key = 1, Idx = 1;
9518 if (auto *Inst = dyn_cast<Instruction>(V);
9519 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9520 !isDeleted(Inst) && !isVectorized(V)) {
9521 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9522 /*AllowAlternate=*/false);
9523 ++NumInstructions;
9524 }
9525 auto &Container = SortedValues[Key];
9526 if (IsOrdered && !KeyToIndex.contains(V) &&
9527 !(isa<Constant, ExtractElementInst>(V) ||
9528 isVectorLikeInstWithConstOps(V)) &&
9529 ((Container.contains(Idx) &&
9530 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9531 (!Container.empty() && !Container.contains(Idx) &&
9532 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9533 IsOrdered = false;
9534 auto &KTI = KeyToIndex[V];
9535 if (KTI.empty())
9536 Container[Idx].push_back(V);
9537 KTI.push_back(I);
9538 }
9539 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9540 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9541 if (!IsOrdered && NumInstructions > 1) {
9542 unsigned Cnt = 0;
9543 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9544 for (const auto &D : SortedValues) {
9545 for (const auto &P : D.second) {
9546 unsigned Sz = 0;
9547 for (Value *V : P.second) {
9548 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9549 for (auto [K, Idx] : enumerate(Indices)) {
9550 TE.ReorderIndices[Cnt + K] = Idx;
9551 TE.Scalars[Cnt + K] = V;
9552 }
9553 Sz += Indices.size();
9554 Cnt += Indices.size();
9555 }
9556 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9557 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9558 *TTI, TE.Scalars.front()->getType(), Sz);
9559 SubVectors.emplace_back(Cnt - Sz, SubVF);
9560 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9561 DemandedElts.clearBit(I);
9562 } else if (!P.second.empty() && isConstant(P.second.front())) {
9563 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9564 DemandedElts.clearBit(I);
9565 }
9566 }
9567 }
9568 }
9569 // Reuses always require shuffles, so consider it as profitable.
9570 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9571 return;
9572 // Do simple cost estimation.
9573 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9574 InstructionCost Cost = 0;
9575 auto *ScalarTy = TE.Scalars.front()->getType();
9576 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9577 for (auto [Idx, Sz] : SubVectors) {
9578 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9579 Idx, getWidenedType(ScalarTy, Sz));
9580 }
9581 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9582 assert(SLPReVec && "Only supported by REVEC.");
9583 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9584 // of CreateInsertElement.
9585 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9586 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9587 if (DemandedElts[I])
9588 Cost +=
9589 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9590 CostKind, I * ScalarTyNumElements, FTy);
9591 } else {
9592 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9593 /*Extract=*/false, CostKind);
9594 }
9595 int Sz = TE.Scalars.size();
9596 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9597 TE.ReorderIndices.end());
9598 for (unsigned I : seq<unsigned>(Sz)) {
9599 Value *V = TE.getOrdered(I);
9600 if (isa<PoisonValue>(V)) {
9601 ReorderMask[I] = PoisonMaskElem;
9602 } else if (isConstant(V) || DemandedElts[I]) {
9603 ReorderMask[I] = I + TE.ReorderIndices.size();
9604 }
9605 }
9606 InstructionCost BVCost = ::getShuffleCost(*TTI,
9607 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9608 ? TTI::SK_PermuteTwoSrc
9609 : TTI::SK_PermuteSingleSrc,
9610 VecTy, ReorderMask);
9611 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9612 ReorderMask.assign(Sz, PoisonMaskElem);
9613 for (unsigned I : seq<unsigned>(Sz)) {
9614 Value *V = TE.getOrdered(I);
9615 if (isConstant(V)) {
9616 DemandedElts.clearBit(I);
9617 if (!isa<PoisonValue>(V))
9618 ReorderMask[I] = I;
9619 } else {
9620 ReorderMask[I] = I + Sz;
9621 }
9622 }
9623 BVCost += TTI->getScalarizationOverhead(
9624 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9625 if (!DemandedElts.isAllOnes())
9626 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9627 if (Cost >= BVCost) {
9628 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9629 reorderScalars(TE.Scalars, Mask);
9630 TE.ReorderIndices.clear();
9631 }
9632}
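// Editorial example (hypothetical values): given a gather node with scalars
// <%a.0, %c, %a.1, %b> where %a.0 and %a.1 hash to the same (Key, Idx)
// cluster, the scalars may be reordered so that the %a.* pair becomes adjacent
// and can later be built as a 2-wide subvector; the reordering is kept only if
// its estimated cost does not exceed the plain build-vector cost computed
// above.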
9633
9634 void BoUpSLP::transformNodes() {
9635 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9636 BaseGraphSize = VectorizableTree.size();
9637 // Turn graph transforming mode on and off, when done.
9638 class GraphTransformModeRAAI {
9639 bool &SavedIsGraphTransformMode;
9640
9641 public:
9642 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9643 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9644 IsGraphTransformMode = true;
9645 }
9646 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9647 } TransformContext(IsGraphTransformMode);
9648 // Operands are profitable if they are:
9649 // 1. At least one constant
9650 // or
9651 // 2. Splats
9652 // or
9653 // 3. Results in good vectorization opportunity, i.e. may generate vector
9654 // nodes and reduce cost of the graph.
9655 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9656 const InstructionsState &S) {
9657 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9658 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9659 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9660 I2->getOperand(Op));
9661 return all_of(
9662 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9663 return all_of(Cand,
9664 [](const std::pair<Value *, Value *> &P) {
9665 return isa<Constant>(P.first) ||
9666 isa<Constant>(P.second) || P.first == P.second;
9667 }) ||
9668 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9669 });
9670 };
9671
9672 // Try to reorder gather nodes for better vectorization opportunities.
9673 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9674 TreeEntry &E = *VectorizableTree[Idx];
9675 if (E.isGather())
9676 reorderGatherNode(E);
9677 }
9678
9679 // The tree may grow here, so iterate over nodes, built before.
9680 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9681 TreeEntry &E = *VectorizableTree[Idx];
9682 if (E.isGather()) {
9683 ArrayRef<Value *> VL = E.Scalars;
9684 const unsigned Sz = getVectorElementSize(VL.front());
9685 unsigned MinVF = getMinVF(2 * Sz);
9686 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
9687 // with the same opcode and same parent block, or all-constant nodes.
9688 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9689 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9690 E.isAltShuffle() || !allSameBlock(VL)) ||
9691 allConstant(VL) || isSplat(VL))
9692 continue;
9693 // Try to find vectorizable sequences and transform them into a series of
9694 // insertvector instructions.
9695 unsigned StartIdx = 0;
9696 unsigned End = VL.size();
9697 for (unsigned VF = getFloorFullVectorNumberOfElements(
9698 *TTI, VL.front()->getType(), VL.size() - 1);
9699 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9700 *TTI, VL.front()->getType(), VF - 1)) {
9701 if (StartIdx + VF > End)
9702 continue;
9703 SmallVector<std::pair<unsigned, unsigned>> Slices;
9704 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9705 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9706 // If any instruction is vectorized already - do not try again.
9707 // Reuse the existing node, if it fully matches the slice.
9708 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9709 SE || getTreeEntry(Slice.back())) {
9710 if (!SE)
9711 continue;
9712 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9713 continue;
9714 }
9715 // Constant already handled effectively - skip.
9716 if (allConstant(Slice))
9717 continue;
9718 // Do not try to vectorize small splats (smaller than a vector register and
9719 // with only a single non-undef element).
9720 bool IsSplat = isSplat(Slice);
9721 if (Slices.empty() || !IsSplat ||
9722 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9723 Slice.front()->getType(), VF)),
9724 1U, VF - 1) !=
9725 std::clamp(TTI->getNumberOfParts(getWidenedType(
9726 Slice.front()->getType(), 2 * VF)),
9727 1U, 2 * VF)) ||
9728 count(Slice, Slice.front()) ==
9729 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9730 : 1)) {
9731 if (IsSplat)
9732 continue;
9733 InstructionsState S = getSameOpcode(Slice, *TLI);
9734 if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
9735 (S.getOpcode() == Instruction::Load &&
9736 areKnownNonVectorizableLoads(Slice)) ||
9737 (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9738 continue;
9739 if (VF == 2) {
9740 // Try to vectorize reduced values or if all users are vectorized.
9741 // For expensive instructions extra extracts might be profitable.
9742 if ((!UserIgnoreList || E.Idx != 0) &&
9743 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9744 TTI::TCC_Expensive &&
9745 !all_of(Slice, [&](Value *V) {
9746 if (isa<PoisonValue>(V))
9747 return true;
9748 return areAllUsersVectorized(cast<Instruction>(V),
9749 UserIgnoreList);
9750 }))
9751 continue;
9752 if (S.getOpcode() == Instruction::Load) {
9753 OrdersType Order;
9754 SmallVector<Value *> PointerOps;
9755 LoadsState Res =
9756 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9757 // Do not vectorize gathers.
9758 if (Res == LoadsState::ScatterVectorize ||
9759 Res == LoadsState::Gather) {
9760 if (Res == LoadsState::Gather) {
9761 registerNonVectorizableLoads(Slice);
9762 // If reductions and the scalars from the root node are
9763 // analyzed - mark as non-vectorizable reduction.
9764 if (UserIgnoreList && E.Idx == 0)
9765 analyzedReductionVals(Slice);
9766 }
9767 continue;
9768 }
9769 } else if (S.getOpcode() == Instruction::ExtractElement ||
9770 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9771 TTI::TCC_Expensive &&
9772 !CheckOperandsProfitability(
9773 S.getMainOp(),
9774 cast<Instruction>(*find_if(reverse(Slice),
9775 IsaPred<Instruction>)),
9776 S))) {
9777 // Do not vectorize extractelements (handled effectively
9778 // already). Do not vectorize non-profitable instructions (with
9779 // low cost and non-vectorizable operands).
9780 continue;
9781 }
9782 }
9783 }
9784 Slices.emplace_back(Cnt, Slice.size());
9785 }
9786 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9787 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9788 if (StartIdx == Cnt)
9789 StartIdx = Cnt + Sz;
9790 if (End == Cnt + Sz)
9791 End = Cnt;
9792 };
9793 for (auto [Cnt, Sz] : Slices) {
9794 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9795 // If any instruction is vectorized already - do not try again.
9796 if (TreeEntry *SE = getTreeEntry(Slice.front());
9797 SE || getTreeEntry(Slice.back())) {
9798 if (!SE)
9799 continue;
9800 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9801 continue;
9802 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9803 AddCombinedNode(SE->Idx, Cnt, Sz);
9804 continue;
9805 }
9806 unsigned PrevSize = VectorizableTree.size();
9807 [[maybe_unused]] unsigned PrevEntriesSize =
9808 LoadEntriesToVectorize.size();
9809 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9810 if (PrevSize + 1 == VectorizableTree.size() &&
9811 VectorizableTree[PrevSize]->isGather() &&
9812 VectorizableTree[PrevSize]->getOpcode() !=
9813 Instruction::ExtractElement &&
9814 !isSplat(Slice)) {
9815 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9816 analyzedReductionVals(Slice);
9817 VectorizableTree.pop_back();
9818 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9819 "LoadEntriesToVectorize expected to remain the same");
9820 continue;
9821 }
9822 AddCombinedNode(PrevSize, Cnt, Sz);
9823 }
9824 }
9825 // Restore ordering, if no extra vectorization happened.
9826 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9827 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9828 reorderScalars(E.Scalars, Mask);
9829 E.ReorderIndices.clear();
9830 }
9831 }
9832 switch (E.getOpcode()) {
9833 case Instruction::Load: {
9834 // No need to reorder masked gather loads, just reorder the scalar
9835 // operands.
9836 if (E.State != TreeEntry::Vectorize)
9837 break;
9838 Type *ScalarTy = E.getMainOp()->getType();
9839 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9840 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9841 // Check if profitable to represent consecutive load + reverse as strided
9842 // load with stride -1.
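// Editorial example: four consecutive loads whose lanes are used in reverse
// order would otherwise be emitted as one wide load plus an SK_Reverse
// shuffle; if TTI reports a cheaper cost for the equivalent strided access
// with stride -1, the node is switched to TreeEntry::StridedVectorize below.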
9843 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9844 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9845 SmallVector<int> Mask;
9846 inversePermutation(E.ReorderIndices, Mask);
9847 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9848 InstructionCost OriginalVecCost =
9849 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9850 BaseLI->getPointerAddressSpace(), CostKind,
9851 TTI::OperandValueInfo()) +
9852 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9853 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9854 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9855 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9856 if (StridedCost < OriginalVecCost)
9857 // Strided load is more profitable than consecutive load + reverse -
9858 // transform the node to strided load.
9859 E.State = TreeEntry::StridedVectorize;
9860 }
9861 break;
9862 }
9863 case Instruction::Store: {
9864 Type *ScalarTy =
9865 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9866 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9867 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9868 // Check if profitable to represent consecutive stores + reverse as a
9869 // strided store with stride -1.
9870 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9871 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9872 SmallVector<int> Mask;
9873 inversePermutation(E.ReorderIndices, Mask);
9874 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9875 InstructionCost OriginalVecCost =
9876 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9877 BaseSI->getPointerAddressSpace(), CostKind,
9878 TTI::OperandValueInfo()) +
9879 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9880 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9881 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9882 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
9883 if (StridedCost < OriginalVecCost)
9884 // Strided store is more profitable than reverse + consecutive store -
9885 // transform the node to strided store.
9886 E.State = TreeEntry::StridedVectorize;
9887 } else if (!E.ReorderIndices.empty()) {
9888 // Check for interleaved stores.
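// Editorial example: for 8 stores whose reorder mask is
// <0, 4, 1, 5, 2, 6, 3, 7>, the lambda below reports an interleave factor of
// 2 (two groups of 4 interleaved lanes), provided the target declares such an
// interleaved access legal for this vector type and alignment.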
9889 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
9890 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9891 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
9892 if (Mask.size() < 4)
9893 return 0u;
9894 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9895 if (ShuffleVectorInst::isInterleaveMask(
9896 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
9897 TTI.isLegalInterleavedAccessType(
9898 VecTy, Factor, BaseSI->getAlign(),
9899 BaseSI->getPointerAddressSpace()))
9900 return Factor;
9901 }
9902
9903 return 0u;
9904 };
9905 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9906 unsigned InterleaveFactor = IsInterleaveMask(Mask);
9907 if (InterleaveFactor != 0)
9908 E.setInterleave(InterleaveFactor);
9909 }
9910 break;
9911 }
9912 case Instruction::Select: {
9913 if (E.State != TreeEntry::Vectorize)
9914 break;
9915 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
9916 if (MinMaxID == Intrinsic::not_intrinsic)
9917 break;
9918 // This node is a minmax node.
9919 E.CombinedOp = TreeEntry::MinMax;
9920 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
9921 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9922 CondEntry->State == TreeEntry::Vectorize) {
9923 // The condition node is part of the combined minmax node.
9924 CondEntry->State = TreeEntry::CombinedVectorize;
9925 }
9926 break;
9927 }
9928 default:
9929 break;
9930 }
9931 }
9932
9933 if (LoadEntriesToVectorize.empty()) {
9934 // Single load node - exit.
9935 if (VectorizableTree.size() <= 1 &&
9936 VectorizableTree.front()->getOpcode() == Instruction::Load)
9937 return;
9938 // Small graph with small VF - exit.
9939 constexpr unsigned SmallTree = 3;
9940 constexpr unsigned SmallVF = 2;
9941 if ((VectorizableTree.size() <= SmallTree &&
9942 VectorizableTree.front()->Scalars.size() == SmallVF) ||
9943 (VectorizableTree.size() <= 2 && UserIgnoreList))
9944 return;
9945
9946 if (VectorizableTree.front()->isNonPowOf2Vec() &&
9947 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
9948 getCanonicalGraphSize() <= SmallTree &&
9949 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
9950 [](const std::unique_ptr<TreeEntry> &TE) {
9951 return TE->isGather() &&
9952 TE->getOpcode() == Instruction::Load &&
9953 !allSameBlock(TE->Scalars);
9954 }) == 1)
9955 return;
9956 }
9957
9958 // A list of loads to be gathered during the vectorization process. We can
9959 // try to vectorize them at the end, if profitable.
9960 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
9961 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
9962 GatheredLoads;
9963
9964 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9965 TreeEntry &E = *TE;
9966 if (E.isGather() &&
9967 (E.getOpcode() == Instruction::Load ||
9968 (!E.getOpcode() && any_of(E.Scalars,
9969 [&](Value *V) {
9970 return isa<LoadInst>(V) &&
9971 !isVectorized(V) &&
9972 !isDeleted(cast<Instruction>(V));
9973 }))) &&
9974 !isSplat(E.Scalars)) {
9975 for (Value *V : E.Scalars) {
9976 auto *LI = dyn_cast<LoadInst>(V);
9977 if (!LI)
9978 continue;
9979 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
9980 continue;
9981 gatherPossiblyVectorizableLoads(
9982 *this, V, *DL, *SE, *TTI,
9983 GatheredLoads[std::make_tuple(
9984 LI->getParent(),
9985 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
9986 LI->getType())]);
9987 }
9988 }
9989 }
9990 // Try to vectorize gathered loads if this is not just a gather of loads.
9991 if (!GatheredLoads.empty())
9992 tryToVectorizeGatheredLoads(GatheredLoads);
9993}
9994
9995/// Merges shuffle masks and emits final shuffle instruction, if required. It
9996 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
9997 /// the actual shuffle instruction is generated only if it is actually
9998/// required. Otherwise, the shuffle instruction emission is delayed till the
9999/// end of the process, to reduce the number of emitted instructions and further
10000/// analysis/transformations.
10001class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10002 bool IsFinalized = false;
10003 SmallVector<int> CommonMask;
10004 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10005 const TargetTransformInfo &TTI;
10006 InstructionCost Cost = 0;
10007 SmallDenseSet<Value *> VectorizedVals;
10008 BoUpSLP &R;
10009 SmallPtrSetImpl<Value *> &CheckedExtracts;
10010 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10011 /// While set, we are still trying to estimate the cost for the same nodes and
10012 /// can delay the actual cost estimation (virtual shuffle instruction emission).
10013 /// This may help to better estimate the cost if the same nodes must be
10014 /// permuted, and it allows moving most of the long shuffle cost estimation to TTI.
10015 bool SameNodesEstimated = true;
10016
10017 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10018 if (Ty->getScalarType()->isPointerTy()) {
10019 Constant *Res = ConstantExpr::getIntToPtr(
10020 Constant::getAllOnesValue(
10021 IntegerType::get(Ty->getContext(),
10022 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10023 Ty->getScalarType());
10024 if (auto *VTy = dyn_cast<VectorType>(Ty))
10025 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10026 return Res;
10027 }
10028 return Constant::getAllOnesValue(Ty);
10029 }
10030
10031 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10032 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10033 return TTI::TCC_Free;
10034 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10035 InstructionCost GatherCost = 0;
10036 SmallVector<Value *> Gathers(VL);
10037 if (!Root && isSplat(VL)) {
10038 // Found the broadcasting of the single scalar, calculate the cost as
10039 // the broadcast.
10040 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10041 assert(It != VL.end() && "Expected at least one non-undef value.");
10042 // Add broadcast for non-identity shuffle only.
10043 bool NeedShuffle =
10044 count(VL, *It) > 1 &&
10045 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10046 if (!NeedShuffle) {
10047 if (isa<FixedVectorType>(ScalarTy)) {
10048 assert(SLPReVec && "FixedVectorType is not expected.");
10049 return TTI.getShuffleCost(
10050 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10051 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10052 cast<FixedVectorType>(ScalarTy));
10053 }
10054 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10055 CostKind, std::distance(VL.begin(), It),
10056 PoisonValue::get(VecTy), *It);
10057 }
10058
10059 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10060 transform(VL, ShuffleMask.begin(), [](Value *V) {
10061 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10062 });
10063 InstructionCost InsertCost =
10064 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10065 PoisonValue::get(VecTy), *It);
10066 return InsertCost + ::getShuffleCost(TTI,
10067 TTI::SK_Broadcast,
10068 VecTy, ShuffleMask, CostKind,
10069 /*Index=*/0, /*SubTp=*/nullptr,
10070 /*Args=*/*It);
10071 }
10072 return GatherCost +
10073 (all_of(Gathers, IsaPred<UndefValue>)
10074 ? TTI::TCC_Free
10075 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10076 ScalarTy));
10077 };
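// Editorial example: for VL = {%x, %x, %x, %x} (a splat of a single scalar),
// the helper above charges one insertelement plus a broadcast shuffle; for
// VL = {%x, poison, poison, poison} no shuffle is needed and only the single
// insertelement is costed.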
10078
10079 /// Compute the cost of creating a vector containing the extracted values from
10080 /// \p VL.
10081 InstructionCost
10082 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10083 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10084 unsigned NumParts) {
10085 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10086 unsigned NumElts =
10087 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10088 auto *EE = dyn_cast<ExtractElementInst>(V);
10089 if (!EE)
10090 return Sz;
10091 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10092 if (!VecTy)
10093 return Sz;
10094 return std::max(Sz, VecTy->getNumElements());
10095 });
10096 // FIXME: this must be moved to TTI for better estimation.
10097 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10098 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10099 SmallVectorImpl<unsigned> &Indices)
10100 -> std::optional<TTI::ShuffleKind> {
10101 if (NumElts <= EltsPerVector)
10102 return std::nullopt;
10103 int OffsetReg0 =
10104 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10105 [](int S, int I) {
10106 if (I == PoisonMaskElem)
10107 return S;
10108 return std::min(S, I);
10109 }),
10110 EltsPerVector);
10111 int OffsetReg1 = OffsetReg0;
10112 DenseSet<int> RegIndices;
10113 // Check if we are trying to permute the same single/2 input vectors.
10114 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10115 int FirstRegId = -1;
10116 Indices.assign(1, OffsetReg0);
10117 for (auto [Pos, I] : enumerate(Mask)) {
10118 if (I == PoisonMaskElem)
10119 continue;
10120 int Idx = I - OffsetReg0;
10121 int RegId =
10122 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10123 if (FirstRegId < 0)
10124 FirstRegId = RegId;
10125 RegIndices.insert(RegId);
10126 if (RegIndices.size() > 2)
10127 return std::nullopt;
10128 if (RegIndices.size() == 2) {
10129 ShuffleKind = TTI::SK_PermuteTwoSrc;
10130 if (Indices.size() == 1) {
10131 OffsetReg1 = alignDown(
10132 std::accumulate(
10133 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10134 [&](int S, int I) {
10135 if (I == PoisonMaskElem)
10136 return S;
10137 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10138 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10139 if (RegId == FirstRegId)
10140 return S;
10141 return std::min(S, I);
10142 }),
10143 EltsPerVector);
10144 Indices.push_back(OffsetReg1 % NumElts);
10145 }
10146 Idx = I - OffsetReg1;
10147 }
10148 I = (Idx % NumElts) % EltsPerVector +
10149 (RegId == FirstRegId ? 0 : EltsPerVector);
10150 }
10151 return ShuffleKind;
10152 };
10153 InstructionCost Cost = 0;
10154
10155 // Process extracts in blocks of EltsPerVector to check if the source vector
10156 // operand can be re-used directly. If not, add the cost of creating a
10157 // shuffle to extract the values into a vector register.
10158 for (unsigned Part : seq<unsigned>(NumParts)) {
10159 if (!ShuffleKinds[Part])
10160 continue;
10161 ArrayRef<int> MaskSlice = Mask.slice(
10162 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10163 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10164 copy(MaskSlice, SubMask.begin());
10165 SmallVector<unsigned, 2> Indices;
10166 std::optional<TTI::ShuffleKind> RegShuffleKind =
10167 CheckPerRegistersShuffle(SubMask, Indices);
10168 if (!RegShuffleKind) {
10169 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10170 !ShuffleVectorInst::isIdentityMask(
10171 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10172 Cost +=
10173 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10174 getWidenedType(ScalarTy, NumElts), MaskSlice);
10175 continue;
10176 }
10177 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10178 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10179 Cost +=
10180 ::getShuffleCost(TTI, *RegShuffleKind,
10181 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10182 }
10183 const unsigned BaseVF = getFullVectorNumberOfElements(
10184 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10185 for (unsigned Idx : Indices) {
10186 assert((Idx + EltsPerVector) <= BaseVF &&
10187 "SK_ExtractSubvector index out of range");
10188 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10189 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10190 Idx, getWidenedType(ScalarTy, EltsPerVector));
10191 }
10192 // Second attempt to check if just a permute is estimated as cheaper than a
10193 // subvector extract.
10194 SubMask.assign(NumElts, PoisonMaskElem);
10195 copy(MaskSlice, SubMask.begin());
10196 InstructionCost OriginalCost = ::getShuffleCost(
10197 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10198 if (OriginalCost < Cost)
10199 Cost = OriginalCost;
10200 }
10201 return Cost;
10202 }
10203 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10204 /// shuffle emission.
10205 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10206 ArrayRef<int> Mask) {
10207 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10208 if (Mask[Idx] != PoisonMaskElem)
10209 CommonMask[Idx] = Idx;
10210 }
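// Editorial example: after a shuffle has been (virtually) emitted for
// CommonMask = <2, poison, 0, poison>, calling
// transformMaskAfterShuffle(CommonMask, CommonMask) rewrites it to
// <0, poison, 2, poison>: each used lane now refers to its own position in
// the just-created vector, while unused lanes stay poison.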
10211 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10212 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10213 /// elements.
10214 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10215 ArrayRef<int> Mask, unsigned Part,
10216 unsigned SliceSize) {
10217 if (SameNodesEstimated) {
10218 // Delay the cost estimation if the same nodes are reshuffling.
10219 // If we already requested the cost of reshuffling of E1 and E2 before, no
10220 // need to estimate another cost with the sub-Mask, instead include this
10221 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10222 // estimation.
10223 if ((InVectors.size() == 2 &&
10224 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10225 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10226 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10227 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10228 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10229 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10230 "Expected all poisoned elements.");
10231 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10232 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10233 return;
10234 }
10235 // Found non-matching nodes - need to estimate the cost for the matched
10236 // and transform mask.
10237 Cost += createShuffle(InVectors.front(),
10238 InVectors.size() == 1 ? nullptr : InVectors.back(),
10239 CommonMask);
10240 transformMaskAfterShuffle(CommonMask, CommonMask);
10241 } else if (InVectors.size() == 2) {
10242 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10243 transformMaskAfterShuffle(CommonMask, CommonMask);
10244 }
10245 SameNodesEstimated = false;
10246 if (!E2 && InVectors.size() == 1) {
10247 unsigned VF = E1.getVectorFactor();
10248 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10249 VF = std::max(VF,
10250 cast<FixedVectorType>(V1->getType())->getNumElements());
10251 } else {
10252 const auto *E = cast<const TreeEntry *>(InVectors.front());
10253 VF = std::max(VF, E->getVectorFactor());
10254 }
10255 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10256 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10257 CommonMask[Idx] = Mask[Idx] + VF;
10258 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10259 transformMaskAfterShuffle(CommonMask, CommonMask);
10260 } else {
10261 auto P = InVectors.front();
10262 Cost += createShuffle(&E1, E2, Mask);
10263 unsigned VF = Mask.size();
10264 if (Value *V1 = P.dyn_cast<Value *>()) {
10265 VF = std::max(VF,
10266 getNumElements(V1->getType()));
10267 } else {
10268 const auto *E = cast<const TreeEntry *>(P);
10269 VF = std::max(VF, E->getVectorFactor());
10270 }
10271 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10272 if (Mask[Idx] != PoisonMaskElem)
10273 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10274 Cost += createShuffle(P, InVectors.front(), CommonMask);
10275 transformMaskAfterShuffle(CommonMask, CommonMask);
10276 }
10277 }
10278
10279 class ShuffleCostBuilder {
10280 const TargetTransformInfo &TTI;
10281
10282 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10283 int Index = -1;
10284 return Mask.empty() ||
10285 (VF == Mask.size() &&
10286 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10287 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10288 Index == 0);
10289 }
10290
10291 public:
10292 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10293 ~ShuffleCostBuilder() = default;
10294 InstructionCost createShuffleVector(Value *V1, Value *,
10295 ArrayRef<int> Mask) const {
10296 // Empty mask or identity mask are free.
10297 unsigned VF =
10298 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10299 if (isEmptyOrIdentity(Mask, VF))
10300 return TTI::TCC_Free;
10301 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10302 cast<VectorType>(V1->getType()), Mask);
10303 }
10304 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10305 // Empty mask or identity mask are free.
10306 unsigned VF =
10307 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10308 if (isEmptyOrIdentity(Mask, VF))
10309 return TTI::TCC_Free;
10310 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10311 cast<VectorType>(V1->getType()), Mask);
10312 }
10313 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10314 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10315 return TTI::TCC_Free;
10316 }
10317 void resizeToMatch(Value *&, Value *&) const {}
10318 };
10319
10320 /// Smart shuffle instruction emission, walks through shuffles trees and
10321 /// tries to find the best matching vector for the actual shuffle
10322 /// instruction.
10323 InstructionCost
10324 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10325 const PointerUnion<Value *, const TreeEntry *> &P2,
10326 ArrayRef<int> Mask) {
10327 ShuffleCostBuilder Builder(TTI);
10328 SmallVector<int> CommonMask(Mask);
10329 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10330 unsigned CommonVF = Mask.size();
10331 InstructionCost ExtraCost = 0;
10332 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10333 unsigned VF) -> InstructionCost {
10334 if (E.isGather() && allConstant(E.Scalars))
10335 return TTI::TCC_Free;
10336 Type *EScalarTy = E.Scalars.front()->getType();
10337 bool IsSigned = true;
10338 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10339 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10340 IsSigned = It->second.second;
10341 }
10342 if (EScalarTy != ScalarTy) {
10343 unsigned CastOpcode = Instruction::Trunc;
10344 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10345 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10346 if (DstSz > SrcSz)
10347 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10348 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10349 getWidenedType(EScalarTy, VF),
10350 TTI::CastContextHint::None, CostKind);
10351 }
10352 return TTI::TCC_Free;
10353 };
10354 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10355 if (isa<Constant>(V))
10356 return TTI::TCC_Free;
10357 auto *VecTy = cast<VectorType>(V->getType());
10358 Type *EScalarTy = VecTy->getElementType();
10359 if (EScalarTy != ScalarTy) {
10360 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10361 unsigned CastOpcode = Instruction::Trunc;
10362 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10363 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10364 if (DstSz > SrcSz)
10365 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10366 return TTI.getCastInstrCost(
10367 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10368 VecTy, TTI::CastContextHint::None, CostKind);
10369 }
10370 return TTI::TCC_Free;
10371 };
10372 if (!V1 && !V2 && !P2.isNull()) {
10373 // Shuffle 2 entry nodes.
10374 const TreeEntry *E = cast<const TreeEntry *>(P1);
10375 unsigned VF = E->getVectorFactor();
10376 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10377 CommonVF = std::max(VF, E2->getVectorFactor());
10378 assert(all_of(Mask,
10379 [=](int Idx) {
10380 return Idx < 2 * static_cast<int>(CommonVF);
10381 }) &&
10382 "All elements in mask must be less than 2 * CommonVF.");
10383 if (E->Scalars.size() == E2->Scalars.size()) {
10384 SmallVector<int> EMask = E->getCommonMask();
10385 SmallVector<int> E2Mask = E2->getCommonMask();
10386 if (!EMask.empty() || !E2Mask.empty()) {
10387 for (int &Idx : CommonMask) {
10388 if (Idx == PoisonMaskElem)
10389 continue;
10390 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10391 Idx = EMask[Idx];
10392 else if (Idx >= static_cast<int>(CommonVF))
10393 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10394 E->Scalars.size();
10395 }
10396 }
10397 CommonVF = E->Scalars.size();
10398 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10399 GetNodeMinBWAffectedCost(*E2, CommonVF);
10400 } else {
10401 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10402 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10403 }
10404 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10405 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10406 } else if (!V1 && P2.isNull()) {
10407 // Shuffle single entry node.
10408 const TreeEntry *E = cast<const TreeEntry *>(P1);
10409 unsigned VF = E->getVectorFactor();
10410 CommonVF = VF;
10411 assert(
10412 all_of(Mask,
10413 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10414 "All elements in mask must be less than CommonVF.");
10415 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10416 SmallVector<int> EMask = E->getCommonMask();
10417 assert(!EMask.empty() && "Expected non-empty common mask.");
10418 for (int &Idx : CommonMask) {
10419 if (Idx != PoisonMaskElem)
10420 Idx = EMask[Idx];
10421 }
10422 CommonVF = E->Scalars.size();
10423 } else if (unsigned Factor = E->getInterleaveFactor();
10424 Factor > 0 && E->Scalars.size() != Mask.size() &&
10425 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10426 Factor)) {
10427 // Deinterleaved nodes are free.
10428 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10429 }
10430 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10431 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10432 // Not identity/broadcast? Try to see if the original vector is better.
10433 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10434 CommonVF == CommonMask.size() &&
10435 any_of(enumerate(CommonMask),
10436 [](const auto &&P) {
10437 return P.value() != PoisonMaskElem &&
10438 static_cast<unsigned>(P.value()) != P.index();
10439 }) &&
10440 any_of(CommonMask,
10441 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10442 SmallVector<int> ReorderMask;
10443 inversePermutation(E->ReorderIndices, ReorderMask);
10444 ::addMask(CommonMask, ReorderMask);
10445 }
10446 } else if (V1 && P2.isNull()) {
10447 // Shuffle single vector.
10448 ExtraCost += GetValueMinBWAffectedCost(V1);
10449 CommonVF = getVF(V1);
10450 assert(
10451 all_of(Mask,
10452 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10453 "All elements in mask must be less than CommonVF.");
10454 } else if (V1 && !V2) {
10455 // Shuffle vector and tree node.
10456 unsigned VF = getVF(V1);
10457 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10458 CommonVF = std::max(VF, E2->getVectorFactor());
10459 assert(all_of(Mask,
10460 [=](int Idx) {
10461 return Idx < 2 * static_cast<int>(CommonVF);
10462 }) &&
10463 "All elements in mask must be less than 2 * CommonVF.");
10464 if (E2->Scalars.size() == VF && VF != CommonVF) {
10465 SmallVector<int> E2Mask = E2->getCommonMask();
10466 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10467 for (int &Idx : CommonMask) {
10468 if (Idx == PoisonMaskElem)
10469 continue;
10470 if (Idx >= static_cast<int>(CommonVF))
10471 Idx = E2Mask[Idx - CommonVF] + VF;
10472 }
10473 CommonVF = VF;
10474 }
10475 ExtraCost += GetValueMinBWAffectedCost(V1);
10476 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10477 ExtraCost += GetNodeMinBWAffectedCost(
10478 *E2, std::min(CommonVF, E2->getVectorFactor()));
10479 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10480 } else if (!V1 && V2) {
10481 // Shuffle vector and tree node.
10482 unsigned VF = getVF(V2);
10483 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10484 CommonVF = std::max(VF, E1->getVectorFactor());
10485 assert(all_of(Mask,
10486 [=](int Idx) {
10487 return Idx < 2 * static_cast<int>(CommonVF);
10488 }) &&
10489 "All elements in mask must be less than 2 * CommonVF.");
10490 if (E1->Scalars.size() == VF && VF != CommonVF) {
10491 SmallVector<int> E1Mask = E1->getCommonMask();
10492 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10493 for (int &Idx : CommonMask) {
10494 if (Idx == PoisonMaskElem)
10495 continue;
10496 if (Idx >= static_cast<int>(CommonVF))
10497 Idx = E1Mask[Idx - CommonVF] + VF;
10498 else
10499 Idx = E1Mask[Idx];
10500 }
10501 CommonVF = VF;
10502 }
10503 ExtraCost += GetNodeMinBWAffectedCost(
10504 *E1, std::min(CommonVF, E1->getVectorFactor()));
10505 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10506 ExtraCost += GetValueMinBWAffectedCost(V2);
10507 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10508 } else {
10509 assert(V1 && V2 && "Expected both vectors.");
10510 unsigned VF = getVF(V1);
10511 CommonVF = std::max(VF, getVF(V2));
10512 assert(all_of(Mask,
10513 [=](int Idx) {
10514 return Idx < 2 * static_cast<int>(CommonVF);
10515 }) &&
10516 "All elements in mask must be less than 2 * CommonVF.");
10517 ExtraCost +=
10518 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10519 if (V1->getType() != V2->getType()) {
10520 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10521 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10522 } else {
10523 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10524 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10525 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10526 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10527 }
10528 }
10529 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10530 assert(SLPReVec && "FixedVectorType is not expected.");
10531 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
10532 CommonMask);
10533 }
10534 InVectors.front() =
10535 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10536 if (InVectors.size() == 2)
10537 InVectors.pop_back();
10538 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10539 V1, V2, CommonMask, Builder);
10540 }
10541
10542public:
10543 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10544 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10545 SmallPtrSetImpl<Value *> &CheckedExtracts)
10546 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10547 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10548 CheckedExtracts(CheckedExtracts) {}
10549 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10550 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10551 unsigned NumParts, bool &UseVecBaseAsInput) {
10552 UseVecBaseAsInput = false;
10553 if (Mask.empty())
10554 return nullptr;
10555 Value *VecBase = nullptr;
10556 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10557 if (!E->ReorderIndices.empty()) {
10558 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10559 E->ReorderIndices.end());
10560 reorderScalars(VL, ReorderMask);
10561 }
10562 // Check if it can be considered reused if same extractelements were
10563 // vectorized already.
10564 bool PrevNodeFound = any_of(
10565 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10566 [&](const std::unique_ptr<TreeEntry> &TE) {
10567 return ((!TE->isAltShuffle() &&
10568 TE->getOpcode() == Instruction::ExtractElement) ||
10569 TE->isGather()) &&
10570 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10571 return VL.size() > Data.index() &&
10572 (Mask[Data.index()] == PoisonMaskElem ||
10573 isa<UndefValue>(VL[Data.index()]) ||
10574 Data.value() == VL[Data.index()]);
10575 });
10576 });
10577 SmallPtrSet<Value *, 4> UniqueBases;
10578 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10579 for (unsigned Part : seq<unsigned>(NumParts)) {
10580 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10581 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10582 for (auto [I, V] :
10583 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10584 // Ignore non-extractelement scalars.
10585 if (isa<UndefValue>(V) ||
10586 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10587 continue;
10588 // If all users of instruction are going to be vectorized and this
10589 // instruction itself is not going to be vectorized, consider this
10590 // instruction as dead and remove its cost from the final cost of the
10591 // vectorized tree.
10592 // Also, avoid adjusting the cost for extractelements with multiple uses
10593 // in different graph entries.
10594 auto *EE = cast<ExtractElementInst>(V);
10595 VecBase = EE->getVectorOperand();
10596 UniqueBases.insert(VecBase);
10597 const TreeEntry *VE = R.getTreeEntry(V);
10598 if (!CheckedExtracts.insert(V).second ||
10599 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10600 any_of(EE->users(),
10601 [&](User *U) {
10602 return isa<GetElementPtrInst>(U) &&
10603 !R.areAllUsersVectorized(cast<Instruction>(U),
10604 &VectorizedVals);
10605 }) ||
10606 (VE && VE != E))
10607 continue;
10608 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10609 if (!EEIdx)
10610 continue;
10611 unsigned Idx = *EEIdx;
10612 // Take credit for instruction that will become dead.
10613 if (EE->hasOneUse() || !PrevNodeFound) {
10614 Instruction *Ext = EE->user_back();
10615 if (isa<SExtInst, ZExtInst>(Ext) &&
10616 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10617 // Use getExtractWithExtendCost() to calculate the cost of
10618 // extractelement/ext pair.
10619 Cost -=
10620 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10621 EE->getVectorOperandType(), Idx);
10622 // Add back the cost of s|zext which is subtracted separately.
10623 Cost += TTI.getCastInstrCost(
10624 Ext->getOpcode(), Ext->getType(), EE->getType(),
10625 TTI::getCastContextHint(Ext), CostKind, Ext);
10626 continue;
10627 }
10628 }
10629 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10630 CostKind, Idx);
10631 }
10632 }
10633 // Check that the gather of extractelements can be represented as just a
10634 // shuffle of one or two vectors that the scalars are extracted from.
10635 // We found the bunch of extractelement instructions that must be gathered
10636 // into a vector and can be represented as a permutation of elements in a
10637 // single input vector or of 2 input vectors.
10638 // Skipped if the same extractelements were already vectorized (reused node).
10639 if (!PrevNodeFound)
10640 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10641 InVectors.assign(1, E);
10642 CommonMask.assign(Mask.begin(), Mask.end());
10643 transformMaskAfterShuffle(CommonMask, CommonMask);
10644 SameNodesEstimated = false;
10645 if (NumParts != 1 && UniqueBases.size() != 1) {
10646 UseVecBaseAsInput = true;
10647 VecBase =
10648 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10649 }
10650 return VecBase;
10651 }
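// Illustration (annotation, assumed example, not from the upstream source):
// adjustExtracts() gives the tree "credit" for extractelement instructions
// that will become dead once all of their users are vectorized. For a
// hypothetical 2-wide bundle built from "extractelement %v, 0/1", the scalar
// extract costs are subtracted from Cost here, and the remaining gather is
// later modeled as a shuffle of %v itself (VecBase) rather than as a fresh
// build-vector.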
10652 /// Checks if the specified entry \p E needs to be delayed because of its
10653 /// dependency nodes.
10654 std::optional<InstructionCost>
10655 needToDelay(const TreeEntry *,
10656 ArrayRef<SmallVector<const TreeEntry *>>) const {
10657 // No need to delay the cost estimation during analysis.
10658 return std::nullopt;
10659 }
10660 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10661 if (&E1 == &E2) {
10662 assert(all_of(Mask,
10663 [&](int Idx) {
10664 return Idx < static_cast<int>(E1.getVectorFactor());
10665 }) &&
10666 "Expected single vector shuffle mask.");
10667 add(E1, Mask);
10668 return;
10669 }
10670 if (InVectors.empty()) {
10671 CommonMask.assign(Mask.begin(), Mask.end());
10672 InVectors.assign({&E1, &E2});
10673 return;
10674 }
10675 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10676 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10677 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10678 if (NumParts == 0 || NumParts >= Mask.size() ||
10679 MaskVecTy->getNumElements() % NumParts != 0 ||
10680 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10681 MaskVecTy->getNumElements() / NumParts))
10682 NumParts = 1;
10683 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10684 const auto *It =
10685 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10686 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10687 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10688 }
10689 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10690 if (InVectors.empty()) {
10691 CommonMask.assign(Mask.begin(), Mask.end());
10692 InVectors.assign(1, &E1);
10693 return;
10694 }
10695 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10696 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10697 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10698 if (NumParts == 0 || NumParts >= Mask.size() ||
10699 MaskVecTy->getNumElements() % NumParts != 0 ||
10700 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10701 MaskVecTy->getNumElements() / NumParts))
10702 NumParts = 1;
10703 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10704 const auto *It =
10705 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10706 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10707 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10708 if (!SameNodesEstimated && InVectors.size() == 1)
10709 InVectors.emplace_back(&E1);
10710 }
10711 /// Adds 2 input vectors and the mask for their shuffling.
10712 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10713 // May come only for shuffling of 2 vectors with extractelements, already
10714 // handled in adjustExtracts.
10715 assert(InVectors.size() == 1 &&
10716 all_of(enumerate(CommonMask),
10717 [&](auto P) {
10718 if (P.value() == PoisonMaskElem)
10719 return Mask[P.index()] == PoisonMaskElem;
10720 auto *EI = cast<ExtractElementInst>(
10721 cast<const TreeEntry *>(InVectors.front())
10722 ->getOrdered(P.index()));
10723 return EI->getVectorOperand() == V1 ||
10724 EI->getVectorOperand() == V2;
10725 }) &&
10726 "Expected extractelement vectors.");
10727 }
10728 /// Adds another one input vector and the mask for the shuffling.
10729 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10730 if (InVectors.empty()) {
10731 assert(CommonMask.empty() && !ForExtracts &&
10732 "Expected empty input mask/vectors.");
10733 CommonMask.assign(Mask.begin(), Mask.end());
10734 InVectors.assign(1, V1);
10735 return;
10736 }
10737 if (ForExtracts) {
10738 // No need to add vectors here, already handled them in adjustExtracts.
10739 assert(
10740 InVectors.size() == 1 && isa<const TreeEntry *>(InVectors.front()) &&
10741 !CommonMask.empty() &&
10742 all_of(enumerate(CommonMask),
10743 [&](auto P) {
10744 Value *Scalar =
10745 InVectors.front().get<const TreeEntry *>()->getOrdered(
10746 P.index());
10747 if (P.value() == PoisonMaskElem)
10748 return P.value() == Mask[P.index()] ||
10749 isa<UndefValue>(Scalar);
10750 if (isa<Constant>(V1))
10751 return true;
10752 auto *EI = cast<ExtractElementInst>(Scalar);
10753 return EI->getVectorOperand() == V1;
10754 }) &&
10755 "Expected only tree entry for extractelement vectors.");
10756 return;
10757 }
10758 assert(!InVectors.empty() && !CommonMask.empty() &&
10759 "Expected only tree entries from extracts/reused buildvectors.");
10760 unsigned VF = getVF(V1);
10761 if (InVectors.size() == 2) {
10762 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10763 transformMaskAfterShuffle(CommonMask, CommonMask);
10764 VF = std::max<unsigned>(VF, CommonMask.size());
10765 } else if (const auto *InTE =
10766 InVectors.front().dyn_cast<const TreeEntry *>()) {
10767 VF = std::max(VF, InTE->getVectorFactor());
10768 } else {
10769 VF = std::max(
10770 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10771 ->getNumElements());
10772 }
10773 InVectors.push_back(V1);
10774 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10775 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10776 CommonMask[Idx] = Mask[Idx] + VF;
10777 }
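// Illustration (annotation, not from the upstream source): when a second
// source vector is appended, its mask indices are rebased by the current
// vector factor VF so that CommonMask addresses both inputs. For example,
// with VF == 4 a mask entry 1 that refers to the newly added vector becomes
// 1 + 4 == 5, matching the usual two-source shufflevector mask convention.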
10778 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10779 Value *Root = nullptr) {
10780 Cost += getBuildVectorCost(VL, Root);
10781 if (!Root) {
10782 // FIXME: Need to find a way to avoid use of getNullValue here.
10783 SmallVector<Constant *> Vals;
10784 unsigned VF = VL.size();
10785 if (MaskVF != 0)
10786 VF = std::min(VF, MaskVF);
10787 for (Value *V : VL.take_front(VF)) {
10788 if (isa<UndefValue>(V)) {
10789 Vals.push_back(cast<Constant>(V));
10790 continue;
10791 }
10792 Vals.push_back(Constant::getNullValue(V->getType()));
10793 }
10794 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10795 assert(SLPReVec && "FixedVectorType is not expected.");
10796 // When REVEC is enabled, we need to expand vector types into scalar
10797 // types.
10798 unsigned VecTyNumElements = VecTy->getNumElements();
10799 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10800 for (auto [I, V] : enumerate(Vals)) {
10801 Type *ScalarTy = V->getType()->getScalarType();
10802 Constant *NewVal;
10803 if (isa<PoisonValue>(V))
10804 NewVal = PoisonValue::get(ScalarTy);
10805 else if (isa<UndefValue>(V))
10806 NewVal = UndefValue::get(ScalarTy);
10807 else
10808 NewVal = Constant::getNullValue(ScalarTy);
10809 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10810 NewVal);
10811 }
10812 Vals.swap(NewVals);
10813 }
10814 return ConstantVector::get(Vals);
10815 }
10816 return ConstantVector::getSplat(
10817 ElementCount::getFixed(
10818 cast<FixedVectorType>(Root->getType())->getNumElements()),
10819 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10820 }
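// Illustration (annotation, not from the upstream source): for cost modeling
// the gathered values themselves do not matter, only their positions and
// types, so gather() materializes a placeholder constant vector (nulls, with
// undef/poison kept in place) and, under REVEC, expands each small vector
// element into its scalar lanes before pricing the build-vector.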
10822 /// Finalize emission of the shuffles.
10823 InstructionCost
10824 finalize(ArrayRef<int> ExtMask,
10825 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10826 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10827 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10828 IsFinalized = true;
10829 if (Action) {
10830 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10831 if (InVectors.size() == 2)
10832 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10833 else
10834 Cost += createShuffle(Vec, nullptr, CommonMask);
10835 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10836 if (CommonMask[Idx] != PoisonMaskElem)
10837 CommonMask[Idx] = Idx;
10838 assert(VF > 0 &&
10839 "Expected vector length for the final value before action.");
10840 Value *V = cast<Value *>(Vec);
10841 Action(V, CommonMask);
10842 InVectors.front() = V;
10843 }
10844 if (!SubVectors.empty()) {
10845 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10846 if (InVectors.size() == 2)
10847 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10848 else
10849 Cost += createShuffle(Vec, nullptr, CommonMask);
10850 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10851 if (CommonMask[Idx] != PoisonMaskElem)
10852 CommonMask[Idx] = Idx;
10853 // Add subvectors permutation cost.
10854 if (!SubVectorsMask.empty()) {
10855 assert(SubVectorsMask.size() <= CommonMask.size() &&
10856 "Expected same size of masks for subvectors and common mask.");
10857 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10858 copy(SubVectorsMask, SVMask.begin());
10859 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10860 if (I2 != PoisonMaskElem) {
10861 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10862 I1 = I2 + CommonMask.size();
10863 }
10864 }
10865 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10866 getWidenedType(ScalarTy, CommonMask.size()),
10867 SVMask, CostKind);
10867 SVMask, CostKind);
10868 }
10869 for (auto [E, Idx] : SubVectors) {
10870 Type *EScalarTy = E->Scalars.front()->getType();
10871 bool IsSigned = true;
10872 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10873 EScalarTy =
10874 IntegerType::get(EScalarTy->getContext(), It->second.first);
10875 IsSigned = It->second.second;
10876 }
10877 if (ScalarTy != EScalarTy) {
10878 unsigned CastOpcode = Instruction::Trunc;
10879 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10880 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10881 if (DstSz > SrcSz)
10882 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10883 Cost += TTI.getCastInstrCost(
10884 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10885 getWidenedType(EScalarTy, E->getVectorFactor()),
10886 TTI::CastContextHint::None, CostKind);
10887 }
10888 Cost += ::getShuffleCost(
10889 TTI, TTI::SK_InsertSubvector,
10890 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10891 getWidenedType(ScalarTy, E->getVectorFactor()));
10892 if (!CommonMask.empty()) {
10893 std::iota(std::next(CommonMask.begin(), Idx),
10894 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10895 Idx);
10896 }
10897 }
10898 }
10899
10900 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
10901 if (CommonMask.empty()) {
10902 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
10903 return Cost;
10904 }
10905 return Cost +
10906 createShuffle(InVectors.front(),
10907 InVectors.size() == 2 ? InVectors.back() : nullptr,
10908 CommonMask);
10909 }
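// Illustration (annotation, not from the upstream source): finalize() folds
// three pieces into the final estimate: (1) any pending shuffle of the
// accumulated inputs, (2) insert-subvector and cast costs for SubVectors
// spliced in at fixed offsets, and (3) one last permute after ExtMask is
// merged into CommonMask; if the merged mask ends up empty, no extra shuffle
// is priced.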
10910
10911 ~ShuffleCostEstimator() {
10912 assert((IsFinalized || CommonMask.empty()) &&
10913 "Shuffle construction must be finalized.");
10914 }
10915};
10916
10917const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
10918 unsigned Idx) const {
10919 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
10920 return VE;
10921 const auto *It =
10922 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10923 return TE->isGather() &&
10924 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
10925 return EI.EdgeIdx == Idx && EI.UserTE == E;
10926 }) != TE->UserTreeIndices.end();
10927 });
10928 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
10929 return It->get();
10930}
10931
10932TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
10933 if (TE.State == TreeEntry::ScatterVectorize ||
10934 TE.State == TreeEntry::StridedVectorize)
10935 return TTI::CastContextHint::GatherScatter;
10936 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10937 !TE.isAltShuffle()) {
10938 if (TE.ReorderIndices.empty())
10939 return TTI::CastContextHint::Normal;
10940 SmallVector<int> Mask;
10941 inversePermutation(TE.ReorderIndices, Mask);
10942 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
10943 return TTI::CastContextHint::Reversed;
10944 }
10945 return TTI::CastContextHint::None;
10946}
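// Illustration (annotation, assumed example, not from the upstream source):
// a vectorized load node with ReorderIndices = {3, 2, 1, 0} inverts to the
// mask {3, 2, 1, 0}, which isReverseMask() accepts, so casts fed by that
// load are costed with TTI::CastContextHint::Reversed; scatter/strided nodes
// report GatherScatter instead, and everything else falls back to None.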
10947
10948/// Builds the arguments types vector for the given call instruction with the
10949/// given \p ID for the specified vector factor.
10950 static SmallVector<Type *>
10951 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10952 const unsigned VF, unsigned MinBW,
10953 const TargetTransformInfo *TTI) {
10954 SmallVector<Type *> ArgTys;
10955 for (auto [Idx, Arg] : enumerate(CI->args())) {
10956 if (ID != Intrinsic::not_intrinsic) {
10957 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10958 ArgTys.push_back(Arg->getType());
10959 continue;
10960 }
10961 if (MinBW > 0) {
10962 ArgTys.push_back(
10963 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10964 continue;
10965 }
10966 }
10967 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10968 }
10969 return ArgTys;
10970}
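// Illustration (annotation, assumed example, not from the upstream source):
// for a call bundle widened to VF == 4 with a computed MinBW of 16, each
// vector operand's type becomes a widened <4 x i16>, operands the intrinsic
// requires to stay scalar keep their original scalar type, and with no MinBW
// the original element type is simply widened by VF.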
10971
10972 InstructionCost
10973 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
10974 SmallPtrSetImpl<Value *> &CheckedExtracts) {
10975 ArrayRef<Value *> VL = E->Scalars;
10976
10977 Type *ScalarTy = getValueType(VL[0]);
10978 if (!isValidElementType(ScalarTy))
10979 return InstructionCost::getInvalid();
10980
10981
10982 // If we have computed a smaller type for the expression, update VecTy so
10983 // that the costs will be accurate.
10984 auto It = MinBWs.find(E);
10985 Type *OrigScalarTy = ScalarTy;
10986 if (It != MinBWs.end()) {
10987 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
10988 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
10989 if (VecTy)
10990 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
10991 }
10992 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10993 unsigned EntryVF = E->getVectorFactor();
10994 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
10995
10996 if (E->isGather()) {
10997 if (allConstant(VL))
10998 return 0;
10999 if (isa<InsertElementInst>(VL[0]))
11000 return InstructionCost::getInvalid();
11001 if (isa<CmpInst>(VL.front()))
11002 ScalarTy = VL.front()->getType();
11003 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11004 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11005 }
11006 InstructionCost CommonCost = 0;
11007 SmallVector<int> Mask;
11008 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11009 !isReverseOrder(E->ReorderIndices))) {
11010 SmallVector<int> NewMask;
11011 if (E->getOpcode() == Instruction::Store) {
11012 // For stores the order is actually a mask.
11013 NewMask.resize(E->ReorderIndices.size());
11014 copy(E->ReorderIndices, NewMask.begin());
11015 } else {
11016 inversePermutation(E->ReorderIndices, NewMask);
11017 }
11018 ::addMask(Mask, NewMask);
11019 }
11020 if (!E->ReuseShuffleIndices.empty())
11021 ::addMask(Mask, E->ReuseShuffleIndices);
11022 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11023 CommonCost =
11024 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11025 assert((E->State == TreeEntry::Vectorize ||
11026 E->State == TreeEntry::ScatterVectorize ||
11027 E->State == TreeEntry::StridedVectorize) &&
11028 "Unhandled state");
11029 assert(E->getOpcode() &&
11030 ((allSameType(VL) && allSameBlock(VL)) ||
11031 (E->getOpcode() == Instruction::GetElementPtr &&
11032 E->getMainOp()->getType()->isPointerTy())) &&
11033 "Invalid VL");
11034 Instruction *VL0 = E->getMainOp();
11035 unsigned ShuffleOrOp =
11036 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11037 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11038 ShuffleOrOp = E->CombinedOp;
11039 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11040 const unsigned Sz = UniqueValues.size();
11041 SmallBitVector UsedScalars(Sz, false);
11042 for (unsigned I = 0; I < Sz; ++I) {
11043 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11044 continue;
11045 UsedScalars.set(I);
11046 }
11047 auto GetCastContextHint = [&](Value *V) {
11048 if (const TreeEntry *OpTE = getTreeEntry(V))
11049 return getCastContextHint(*OpTE);
11050 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11051 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
11052 return TTI::CastContextHint::GatherScatter;
11053 return TTI::CastContextHint::None;
11054 };
11055 auto GetCostDiff =
11056 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11057 function_ref<InstructionCost(InstructionCost)> VectorCost) {
11058 // Calculate the cost of this instruction.
11059 InstructionCost ScalarCost = 0;
11060 if (isa<CastInst, CallInst>(VL0)) {
11061 // For some of the instructions no need to calculate cost for each
11062 // particular instruction, we can use the cost of the single
11063 // instruction x total number of scalar instructions.
11064 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11065 } else {
11066 for (unsigned I = 0; I < Sz; ++I) {
11067 if (UsedScalars.test(I))
11068 continue;
11069 ScalarCost += ScalarEltCost(I);
11070 }
11071 }
11072
11073 InstructionCost VecCost = VectorCost(CommonCost);
11074 // Check if the current node must be resized, if the parent node is not
11075 // resized.
11076 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11077 E->Idx != 0 &&
11078 (E->getOpcode() != Instruction::Load ||
11079 !E->UserTreeIndices.empty())) {
11080 const EdgeInfo &EI =
11081 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11082 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11083 });
11084 if (EI.UserTE->getOpcode() != Instruction::Select ||
11085 EI.EdgeIdx != 0) {
11086 auto UserBWIt = MinBWs.find(EI.UserTE);
11087 Type *UserScalarTy =
11088 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11089 if (UserBWIt != MinBWs.end())
11090 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11091 UserBWIt->second.first);
11092 if (ScalarTy != UserScalarTy) {
11093 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11094 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11095 unsigned VecOpcode;
11096 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11097 if (BWSz > SrcBWSz)
11098 VecOpcode = Instruction::Trunc;
11099 else
11100 VecOpcode =
11101 It->second.second ? Instruction::SExt : Instruction::ZExt;
11102 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11103 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11104 CostKind);
11105 }
11106 }
11107 }
11108 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11109 ScalarCost, "Calculated costs for Tree"));
11110 return VecCost - ScalarCost;
11111 };
11112 // Calculate cost difference from vectorizing set of GEPs.
11113 // Negative value means vectorizing is profitable.
11114 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11115 assert((E->State == TreeEntry::Vectorize ||
11116 E->State == TreeEntry::StridedVectorize) &&
11117 "Entry state expected to be Vectorize or StridedVectorize here.");
11118 InstructionCost ScalarCost = 0;
11119 InstructionCost VecCost = 0;
11120 std::tie(ScalarCost, VecCost) = getGEPCosts(
11121 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11122 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11123 "Calculated GEPs cost for Tree"));
11124
11125 return VecCost - ScalarCost;
11126 };
11127
11128 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11129 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11130 if (MinMaxID == Intrinsic::not_intrinsic)
11131 return InstructionCost::getInvalid();
11132 Type *CanonicalType = Ty;
11133 if (CanonicalType->isPtrOrPtrVectorTy())
11134 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11135 CanonicalType->getContext(),
11136 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11137
11138 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11139 {CanonicalType, CanonicalType});
11140 InstructionCost IntrinsicCost =
11141 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11142 // If the selects are the only uses of the compares, they will be
11143 // dead and we can adjust the cost by removing their cost.
11144 if (VI && SelectOnly) {
11145 assert((!Ty->isVectorTy() || SLPReVec) &&
11146 "Expected only for scalar type.");
11147 auto *CI = cast<CmpInst>(VI->getOperand(0));
11148 IntrinsicCost -= TTI->getCmpSelInstrCost(
11149 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11150 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11151 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11152 }
11153 return IntrinsicCost;
11154 };
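// Illustration (annotation, assumed example, not from the upstream source):
// GetMinMaxCost models a compare+select pair as a single min/max intrinsic
// when canConvertToMinOrMaxIntrinsic() recognizes the pattern, e.g.
// "%c = icmp slt i32 %a, %b; %s = select i1 %c, i32 %a, i32 %b" maps to
// llvm.smin.i32. When the select is the compare's only user, the compare's
// cost is deducted because it would become dead after the rewrite.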
11155 switch (ShuffleOrOp) {
11156 case Instruction::PHI: {
11157 // Count reused scalars.
11158 InstructionCost ScalarCost = 0;
11159 SmallPtrSet<const TreeEntry *, 4> CountedOps;
11160 for (Value *V : UniqueValues) {
11161 auto *PHI = dyn_cast<PHINode>(V);
11162 if (!PHI)
11163 continue;
11164
11165 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11166 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11167 Value *Op = PHI->getIncomingValue(I);
11168 Operands[I] = Op;
11169 }
11170 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11171 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11172 if (!OpTE->ReuseShuffleIndices.empty())
11173 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11174 OpTE->Scalars.size());
11175 }
11176
11177 return CommonCost - ScalarCost;
11178 }
11179 case Instruction::ExtractValue:
11180 case Instruction::ExtractElement: {
11181 auto GetScalarCost = [&](unsigned Idx) {
11182 if (isa<PoisonValue>(UniqueValues[Idx]))
11183 return InstructionCost(TTI::TCC_Free);
11184
11185 auto *I = cast<Instruction>(UniqueValues[Idx]);
11186 VectorType *SrcVecTy;
11187 if (ShuffleOrOp == Instruction::ExtractElement) {
11188 auto *EE = cast<ExtractElementInst>(I);
11189 SrcVecTy = EE->getVectorOperandType();
11190 } else {
11191 auto *EV = cast<ExtractValueInst>(I);
11192 Type *AggregateTy = EV->getAggregateOperand()->getType();
11193 unsigned NumElts;
11194 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11195 NumElts = ATy->getNumElements();
11196 else
11197 NumElts = AggregateTy->getStructNumElements();
11198 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11199 }
11200 if (I->hasOneUse()) {
11201 Instruction *Ext = I->user_back();
11202 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11203 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11204 // Use getExtractWithExtendCost() to calculate the cost of
11205 // extractelement/ext pair.
11206 InstructionCost Cost = TTI->getExtractWithExtendCost(
11207 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11208 // Subtract the cost of s|zext which is subtracted separately.
11209 Cost -= TTI->getCastInstrCost(
11210 Ext->getOpcode(), Ext->getType(), I->getType(),
11211 TTI::getCastContextHint(Ext), CostKind, Ext);
11212 return Cost;
11213 }
11214 }
11215 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11216 CostKind, *getExtractIndex(I));
11217 };
11218 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11219 return GetCostDiff(GetScalarCost, GetVectorCost);
11220 }
11221 case Instruction::InsertElement: {
11222 assert(E->ReuseShuffleIndices.empty() &&
11223 "Unique insertelements only are expected.");
11224 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11225 unsigned const NumElts = SrcVecTy->getNumElements();
11226 unsigned const NumScalars = VL.size();
11227
11228 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11229
11230 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11231 unsigned OffsetBeg = *getElementIndex(VL.front());
11232 unsigned OffsetEnd = OffsetBeg;
11233 InsertMask[OffsetBeg] = 0;
11234 for (auto [I, V] : enumerate(VL.drop_front())) {
11235 unsigned Idx = *getElementIndex(V);
11236 if (OffsetBeg > Idx)
11237 OffsetBeg = Idx;
11238 else if (OffsetEnd < Idx)
11239 OffsetEnd = Idx;
11240 InsertMask[Idx] = I + 1;
11241 }
11242 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11243 if (NumOfParts > 0 && NumOfParts < NumElts)
11244 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11245 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11246 VecScalarsSz;
11247 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11248 unsigned InsertVecSz = std::min<unsigned>(
11249 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11250 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11251 bool IsWholeSubvector =
11252 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11253 // Check if we can safely insert a subvector. If it is not possible, just
11254 // generate a whole-sized vector and shuffle the source vector and the new
11255 // subvector.
11256 if (OffsetBeg + InsertVecSz > VecSz) {
11257 // Align OffsetBeg to generate correct mask.
11258 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11259 InsertVecSz = VecSz;
11260 }
11261
11262 APInt DemandedElts = APInt::getZero(NumElts);
11263 // TODO: Add support for Instruction::InsertValue.
11264 SmallVector<int> Mask;
11265 if (!E->ReorderIndices.empty()) {
11266 inversePermutation(E->ReorderIndices, Mask);
11267 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11268 } else {
11269 Mask.assign(VecSz, PoisonMaskElem);
11270 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11271 }
11272 bool IsIdentity = true;
11273 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11274 Mask.swap(PrevMask);
11275 for (unsigned I = 0; I < NumScalars; ++I) {
11276 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11277 DemandedElts.setBit(InsertIdx);
11278 IsIdentity &= InsertIdx - OffsetBeg == I;
11279 Mask[InsertIdx - OffsetBeg] = I;
11280 }
11281 assert(Offset < NumElts && "Failed to find vector index offset");
11282
11283 InstructionCost Cost = 0;
11284 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11285 /*Insert*/ true, /*Extract*/ false,
11286 CostKind);
11287
11288 // First cost - resize to actual vector size if not identity shuffle or
11289 // need to shift the vector.
11290 // Do not calculate the cost if the actual size is the register size and
11291 // we can merge this shuffle with the following SK_Select.
11292 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11293 if (!IsIdentity)
11294 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
11295 InsertVecTy, Mask);
11296 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11297 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11298 }));
11299 // Second cost - permutation with subvector, if some elements are from the
11300 // initial vector or inserting a subvector.
11301 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11302 // subvector of ActualVecTy.
11303 SmallBitVector InMask =
11304 isUndefVector(FirstInsert->getOperand(0),
11305 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11306 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11307 if (InsertVecSz != VecSz) {
11308 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11309 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11310 CostKind, OffsetBeg - Offset, InsertVecTy);
11311 } else {
11312 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11313 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11314 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11315 I <= End; ++I)
11316 if (Mask[I] != PoisonMaskElem)
11317 Mask[I] = I + VecSz;
11318 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11319 Mask[I] =
11320 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11321 Cost +=
11322 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11323 }
11324 }
11325 return Cost;
11326 }
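// Illustration (annotation, assumed example, not from the upstream source):
// with NumElts == 8 and scalars inserted at offsets 2..5, DemandedElts marks
// bits 2-5 and the scalar insert overhead for those lanes is credited back;
// the vectorized form is then priced either as an SK_InsertSubvector of a
// 4-wide subvector at offset 2 or as a two-source permute when the subvector
// cannot be placed wholesale into the destination vector.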
11327 case Instruction::ZExt:
11328 case Instruction::SExt:
11329 case Instruction::FPToUI:
11330 case Instruction::FPToSI:
11331 case Instruction::FPExt:
11332 case Instruction::PtrToInt:
11333 case Instruction::IntToPtr:
11334 case Instruction::SIToFP:
11335 case Instruction::UIToFP:
11336 case Instruction::Trunc:
11337 case Instruction::FPTrunc:
11338 case Instruction::BitCast: {
11339 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11340 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11341 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11342 unsigned Opcode = ShuffleOrOp;
11343 unsigned VecOpcode = Opcode;
11344 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11345 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11346 // Check if the values are candidates to demote.
11347 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11348 if (SrcIt != MinBWs.end()) {
11349 SrcBWSz = SrcIt->second.first;
11350 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11351 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11352 SrcVecTy =
11353 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11354 }
11355 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11356 if (BWSz == SrcBWSz) {
11357 VecOpcode = Instruction::BitCast;
11358 } else if (BWSz < SrcBWSz) {
11359 VecOpcode = Instruction::Trunc;
11360 } else if (It != MinBWs.end()) {
11361 assert(BWSz > SrcBWSz && "Invalid cast!");
11362 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11363 } else if (SrcIt != MinBWs.end()) {
11364 assert(BWSz > SrcBWSz && "Invalid cast!");
11365 VecOpcode =
11366 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11367 }
11368 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11369 !SrcIt->second.second) {
11370 VecOpcode = Instruction::UIToFP;
11371 }
11372 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11373 assert(Idx == 0 && "Expected 0 index only");
11374 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11375 VL0->getOperand(0)->getType(),
11376 TTI::getCastContextHint(VL0), CostKind, VL0);
11377 };
11378 auto GetVectorCost = [=](InstructionCost CommonCost) {
11379 // Do not count cost here if minimum bitwidth is in effect and it is just
11380 // a bitcast (here it is just a noop).
11381 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11382 return CommonCost;
11383 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11384 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11385
11386 bool IsArithmeticExtendedReduction =
11387 E->Idx == 0 && UserIgnoreList &&
11388 all_of(*UserIgnoreList, [](Value *V) {
11389 auto *I = cast<Instruction>(V);
11390 return is_contained({Instruction::Add, Instruction::FAdd,
11391 Instruction::Mul, Instruction::FMul,
11392 Instruction::And, Instruction::Or,
11393 Instruction::Xor},
11394 I->getOpcode());
11395 });
11396 if (IsArithmeticExtendedReduction &&
11397 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11398 return CommonCost;
11399 return CommonCost +
11400 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11401 VecOpcode == Opcode ? VI : nullptr);
11402 };
11403 return GetCostDiff(GetScalarCost, GetVectorCost);
11404 }
11405 case Instruction::FCmp:
11406 case Instruction::ICmp:
11407 case Instruction::Select: {
11408 CmpPredicate VecPred, SwappedVecPred;
11409 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11410 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11411 match(VL0, MatchCmp))
11412 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11413 else
11414 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11415 ? CmpInst::BAD_FCMP_PREDICATE
11416 : CmpInst::BAD_ICMP_PREDICATE;
11417 auto GetScalarCost = [&](unsigned Idx) {
11418 if (isa<PoisonValue>(UniqueValues[Idx]))
11419 return InstructionCost(TTI::TCC_Free);
11420
11421 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11422 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11423 ? CmpInst::BAD_FCMP_PREDICATE
11424 : CmpInst::BAD_ICMP_PREDICATE;
11425 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11426 // FIXME: Use CmpPredicate::getMatching here.
11427 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11428 !match(VI, MatchCmp)) ||
11429 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11430 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11431 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11432 ? CmpInst::BAD_FCMP_PREDICATE
11433 : CmpInst::BAD_ICMP_PREDICATE;
11434
11435 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
11436 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11437 CostKind, getOperandInfo(VI->getOperand(0)),
11438 getOperandInfo(VI->getOperand(1)), VI);
11439 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11440 if (IntrinsicCost.isValid())
11441 ScalarCost = IntrinsicCost;
11442
11443 return ScalarCost;
11444 };
11445 auto GetVectorCost = [&](InstructionCost CommonCost) {
11446 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11447
11448 InstructionCost VecCost =
11449 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11450 CostKind, getOperandInfo(E->getOperand(0)),
11451 getOperandInfo(E->getOperand(1)), VL0);
11452 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11453 auto *CondType =
11454 getWidenedType(SI->getCondition()->getType(), VL.size());
11455 unsigned CondNumElements = CondType->getNumElements();
11456 unsigned VecTyNumElements = getNumElements(VecTy);
11457 assert(VecTyNumElements >= CondNumElements &&
11458 VecTyNumElements % CondNumElements == 0 &&
11459 "Cannot vectorize Instruction::Select");
11460 if (CondNumElements != VecTyNumElements) {
11461 // When the return type is i1 but the source is fixed vector type, we
11462 // need to duplicate the condition value.
11463 VecCost += ::getShuffleCost(
11464 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11465 createReplicatedMask(VecTyNumElements / CondNumElements,
11466 CondNumElements));
11467 }
11468 }
11469 return VecCost + CommonCost;
11470 };
11471 return GetCostDiff(GetScalarCost, GetVectorCost);
11472 }
11473 case TreeEntry::MinMax: {
11474 auto GetScalarCost = [&](unsigned Idx) {
11475 return GetMinMaxCost(OrigScalarTy);
11476 };
11477 auto GetVectorCost = [&](InstructionCost CommonCost) {
11478 InstructionCost VecCost = GetMinMaxCost(VecTy);
11479 return VecCost + CommonCost;
11480 };
11481 return GetCostDiff(GetScalarCost, GetVectorCost);
11482 }
11483 case Instruction::FNeg:
11484 case Instruction::Add:
11485 case Instruction::FAdd:
11486 case Instruction::Sub:
11487 case Instruction::FSub:
11488 case Instruction::Mul:
11489 case Instruction::FMul:
11490 case Instruction::UDiv:
11491 case Instruction::SDiv:
11492 case Instruction::FDiv:
11493 case Instruction::URem:
11494 case Instruction::SRem:
11495 case Instruction::FRem:
11496 case Instruction::Shl:
11497 case Instruction::LShr:
11498 case Instruction::AShr:
11499 case Instruction::And:
11500 case Instruction::Or:
11501 case Instruction::Xor: {
11502 auto GetScalarCost = [&](unsigned Idx) {
11503 if (isa<PoisonValue>(UniqueValues[Idx]))
11504 return InstructionCost(TTI::TCC_Free);
11505
11506 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11507 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11508 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11509 TTI::OperandValueInfo Op2Info =
11510 TTI::getOperandInfo(VI->getOperand(OpIdx));
11511 SmallVector<const Value *> Operands(VI->operand_values());
11512 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11513 Op1Info, Op2Info, Operands, VI);
11514 };
11515 auto GetVectorCost = [=](InstructionCost CommonCost) {
11516 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11517 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11518 ArrayRef<Value *> Ops = E->getOperand(I);
11519 if (all_of(Ops, [&](Value *Op) {
11520 auto *CI = dyn_cast<ConstantInt>(Op);
11521 return CI && CI->getValue().countr_one() >= It->second.first;
11522 }))
11523 return CommonCost;
11524 }
11525 }
11526 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11527 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11528 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11529 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11530 Op2Info, {}, nullptr, TLI) +
11531 CommonCost;
11532 };
11533 return GetCostDiff(GetScalarCost, GetVectorCost);
11534 }
11535 case Instruction::GetElementPtr: {
11536 return CommonCost + GetGEPCostDiff(VL, VL0);
11537 }
11538 case Instruction::Load: {
11539 auto GetScalarCost = [&](unsigned Idx) {
11540 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11541 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11542 VI->getAlign(), VI->getPointerAddressSpace(),
11543 CostKind, TTI::OperandValueInfo(), VI);
11544 };
11545 auto *LI0 = cast<LoadInst>(VL0);
11546 auto GetVectorCost = [&](InstructionCost CommonCost) {
11547 InstructionCost VecLdCost;
11548 switch (E->State) {
11549 case TreeEntry::Vectorize:
11550 if (unsigned Factor = E->getInterleaveFactor()) {
11551 VecLdCost = TTI->getInterleavedMemoryOpCost(
11552 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11553 LI0->getPointerAddressSpace(), CostKind);
11554
11555 } else {
11556 VecLdCost = TTI->getMemoryOpCost(
11557 Instruction::Load, VecTy, LI0->getAlign(),
11558 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11559 }
11560 break;
11561 case TreeEntry::StridedVectorize: {
11562 Align CommonAlignment =
11563 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11564 VecLdCost = TTI->getStridedMemoryOpCost(
11565 Instruction::Load, VecTy, LI0->getPointerOperand(),
11566 /*VariableMask=*/false, CommonAlignment, CostKind);
11567 break;
11568 }
11569 case TreeEntry::ScatterVectorize: {
11570 Align CommonAlignment =
11571 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11572 VecLdCost = TTI->getGatherScatterOpCost(
11573 Instruction::Load, VecTy, LI0->getPointerOperand(),
11574 /*VariableMask=*/false, CommonAlignment, CostKind);
11575 break;
11576 }
11577 case TreeEntry::CombinedVectorize:
11578 case TreeEntry::NeedToGather:
11579 llvm_unreachable("Unexpected vectorization state.");
11580 }
11581 return VecLdCost + CommonCost;
11582 };
11583
11584 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11586 // If this node generates a masked gather load, then it is not a terminal node.
11586 // Hence address operand cost is estimated separately.
11587 if (E->State == TreeEntry::ScatterVectorize)
11588 return Cost;
11589
11590 // Estimate cost of GEPs since this tree node is a terminator.
11591 SmallVector<Value *> PointerOps(VL.size());
11592 for (auto [I, V] : enumerate(VL))
11593 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11594 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11595 }
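// Illustration (annotation, not from the upstream source): the three load
// shapes above map to different TTI hooks: consecutive loads use
// getMemoryOpCost (or getInterleavedMemoryOpCost for interleave groups),
// strided loads use getStridedMemoryOpCost, and gathers of arbitrary
// pointers use getGatherScatterOpCost; the address (GEP) cost difference is
// only added for terminal, non-gather nodes.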
11596 case Instruction::Store: {
11597 bool IsReorder = !E->ReorderIndices.empty();
11598 auto GetScalarCost = [=](unsigned Idx) {
11599 auto *VI = cast<StoreInst>(VL[Idx]);
11600 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11601 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11602 VI->getAlign(), VI->getPointerAddressSpace(),
11603 CostKind, OpInfo, VI);
11604 };
11605 auto *BaseSI =
11606 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11607 auto GetVectorCost = [=](InstructionCost CommonCost) {
11608 // We know that we can merge the stores. Calculate the cost.
11609 InstructionCost VecStCost;
11610 if (E->State == TreeEntry::StridedVectorize) {
11611 Align CommonAlignment =
11612 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11613 VecStCost = TTI->getStridedMemoryOpCost(
11614 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11615 /*VariableMask=*/false, CommonAlignment, CostKind);
11616 } else {
11617 assert(E->State == TreeEntry::Vectorize &&
11618 "Expected either strided or consecutive stores.");
11619 if (unsigned Factor = E->getInterleaveFactor()) {
11620 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11621 "No reused shuffles expected");
11622 CommonCost = 0;
11623 VecStCost = TTI->getInterleavedMemoryOpCost(
11624 Instruction::Store, VecTy, Factor, std::nullopt,
11625 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11626 } else {
11627 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11628 VecStCost = TTI->getMemoryOpCost(
11629 Instruction::Store, VecTy, BaseSI->getAlign(),
11630 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11631 }
11632 }
11633 return VecStCost + CommonCost;
11634 };
11635 SmallVector<Value *> PointerOps(VL.size());
11636 for (auto [I, V] : enumerate(VL)) {
11637 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11638 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11639 }
11640
11641 return GetCostDiff(GetScalarCost, GetVectorCost) +
11642 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11643 }
11644 case Instruction::Call: {
11645 auto GetScalarCost = [&](unsigned Idx) {
11646 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11647 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11648 if (ID != Intrinsic::not_intrinsic) {
11649 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11650 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11651 }
11652 return TTI->getCallInstrCost(CI->getCalledFunction(),
11653 CI->getFunctionType()->getReturnType(),
11654 CI->getFunctionType()->params(), CostKind);
11655 };
11656 auto GetVectorCost = [=](InstructionCost CommonCost) {
11657 auto *CI = cast<CallInst>(VL0);
11658 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11659 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
11660 CI, ID, VecTy->getNumElements(),
11661 It != MinBWs.end() ? It->second.first : 0, TTI);
11662 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11663 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11664 };
11665 return GetCostDiff(GetScalarCost, GetVectorCost);
11666 }
11667 case Instruction::ShuffleVector: {
11668 if (!SLPReVec || E->isAltShuffle())
11669 assert(E->isAltShuffle() &&
11670 ((Instruction::isBinaryOp(E->getOpcode()) &&
11671 Instruction::isBinaryOp(E->getAltOpcode())) ||
11672 (Instruction::isCast(E->getOpcode()) &&
11673 Instruction::isCast(E->getAltOpcode())) ||
11674 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11675 "Invalid Shuffle Vector Operand");
11676 // Try to find the previous shuffle node with the same operands and same
11677 // main/alternate ops.
11678 auto TryFindNodeWithEqualOperands = [=]() {
11679 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11680 if (TE.get() == E)
11681 break;
11682 if (TE->isAltShuffle() &&
11683 ((TE->getOpcode() == E->getOpcode() &&
11684 TE->getAltOpcode() == E->getAltOpcode()) ||
11685 (TE->getOpcode() == E->getAltOpcode() &&
11686 TE->getAltOpcode() == E->getOpcode())) &&
11687 TE->hasEqualOperands(*E))
11688 return true;
11689 }
11690 return false;
11691 };
11692 auto GetScalarCost = [&](unsigned Idx) {
11693 if (isa<PoisonValue>(UniqueValues[Idx]))
11694 return InstructionCost(TTI::TCC_Free);
11695
11696 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11697 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11698 (void)E;
11699 return TTI->getInstructionCost(VI, CostKind);
11700 };
11701 // Need to clear CommonCost since the final shuffle cost is included into
11702 // vector cost.
11703 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11704 // VecCost is equal to sum of the cost of creating 2 vectors
11705 // and the cost of creating shuffle.
11706 InstructionCost VecCost = 0;
11707 if (TryFindNodeWithEqualOperands()) {
11708 LLVM_DEBUG({
11709 dbgs() << "SLP: diamond match for alternate node found.\n";
11710 E->dump();
11711 });
11712 // No need to add new vector costs here since we're going to reuse
11713 // same main/alternate vector ops, just do different shuffling.
11714 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11715 VecCost =
11716 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11717 VecCost +=
11718 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11719 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11720 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11721 VecCost = TTIRef.getCmpSelInstrCost(
11722 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11723 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11724 VL0);
11725 VecCost += TTIRef.getCmpSelInstrCost(
11726 E->getOpcode(), VecTy, MaskTy,
11727 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11728 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11729 E->getAltOp());
11730 } else {
11731 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11732 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11733 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11734 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11735 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11736 unsigned SrcBWSz =
11737 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11738 if (SrcIt != MinBWs.end()) {
11739 SrcBWSz = SrcIt->second.first;
11740 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11741 SrcTy = getWidenedType(SrcSclTy, VL.size());
11742 }
11743 if (BWSz <= SrcBWSz) {
11744 if (BWSz < SrcBWSz)
11745 VecCost =
11746 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11747 TTI::CastContextHint::None, CostKind);
11748 LLVM_DEBUG({
11749 dbgs()
11750 << "SLP: alternate extension, which should be truncated.\n";
11751 E->dump();
11752 });
11753 return VecCost;
11754 }
11755 }
11756 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11757 TTI::CastContextHint::None, CostKind);
11758 VecCost +=
11759 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11760 TTI::CastContextHint::None, CostKind);
11761 }
11762 SmallVector<int> Mask;
11763 E->buildAltOpShuffleMask(
11764 [&](Instruction *I) {
11765 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11766 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11767 *TLI);
11768 },
11769 Mask);
11770 VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
11771 FinalVecTy, Mask, CostKind);
11772 // Patterns like [fadd,fsub] can be combined into a single instruction
11773 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11774 // need to take into account their order when looking for the most used
11775 // order.
11776 unsigned Opcode0 = E->getOpcode();
11777 unsigned Opcode1 = E->getAltOpcode();
11778 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11779 // If this pattern is supported by the target then we consider the
11780 // order.
11781 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11782 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11783 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11784 return AltVecCost < VecCost ? AltVecCost : VecCost;
11785 }
11786 // TODO: Check the reverse order too.
11787 return VecCost;
11788 };
11789 if (SLPReVec && !E->isAltShuffle())
11790 return GetCostDiff(
11791 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11792 // If a group uses mask in order, the shufflevector can be
11793 // eliminated by instcombine. Then the cost is 0.
11794 assert(isa<ShuffleVectorInst>(VL.front()) &&
11795 "Not supported shufflevector usage.");
11796 auto *SV = cast<ShuffleVectorInst>(VL.front());
11797 unsigned SVNumElements =
11798 cast<FixedVectorType>(SV->getOperand(0)->getType())
11799 ->getNumElements();
11800 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11801 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11802 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11803 int NextIndex = 0;
11804 if (!all_of(Group, [&](Value *V) {
11805 assert(isa<ShuffleVectorInst>(V) &&
11806 "Not supported shufflevector usage.");
11807 auto *SV = cast<ShuffleVectorInst>(V);
11808 int Index;
11809 [[maybe_unused]] bool IsExtractSubvectorMask =
11810 SV->isExtractSubvectorMask(Index);
11811 assert(IsExtractSubvectorMask &&
11812 "Not supported shufflevector usage.");
11813 if (NextIndex != Index)
11814 return false;
11815 NextIndex += SV->getShuffleMask().size();
11816 return true;
11817 }))
11818 return ::getShuffleCost(
11819 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
11820 calculateShufflevectorMask(E->Scalars));
11821 }
11822 return TTI::TCC_Free;
11823 });
11824 return GetCostDiff(GetScalarCost, GetVectorCost);
11825 }
11826 case Instruction::Freeze:
11827 return CommonCost;
11828 default:
11829 llvm_unreachable("Unknown instruction");
11830 }
11831}
11832
11833bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11834 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11835 << VectorizableTree.size() << " is fully vectorizable.\n");
11836
11837 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11838 SmallVector<int> Mask;
11839 return TE->isGather() &&
11840 !any_of(TE->Scalars,
11841 [this](Value *V) { return EphValues.contains(V); }) &&
11842 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11843 TE->Scalars.size() < Limit ||
11844 ((TE->getOpcode() == Instruction::ExtractElement ||
11845 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11846 isFixedVectorShuffle(TE->Scalars, Mask)) ||
11847 (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
11848 any_of(TE->Scalars, IsaPred<LoadInst>));
11849 };
11850
11851 // We only handle trees of heights 1 and 2.
11852 if (VectorizableTree.size() == 1 &&
11853 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11854 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11855 (ForReduction &&
11856 AreVectorizableGathers(VectorizableTree[0].get(),
11857 VectorizableTree[0]->Scalars.size()) &&
11858 VectorizableTree[0]->getVectorFactor() > 2)))
11859 return true;
11860
11861 if (VectorizableTree.size() != 2)
11862 return false;
11863
11864 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11865 // with the second gather nodes if they have fewer scalar operands than
11866 // the initial tree element (it may be profitable to shuffle the second gather)
11867 // or they are extractelements, which form a shuffle.
11868 SmallVector<int> Mask;
11869 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11870 AreVectorizableGathers(VectorizableTree[1].get(),
11871 VectorizableTree[0]->Scalars.size()))
11872 return true;
11873
11874 // Gathering cost would be too much for tiny trees.
11875 if (VectorizableTree[0]->isGather() ||
11876 (VectorizableTree[1]->isGather() &&
11877 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11878 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11879 return false;
11880
11881 return true;
11882}
11883
11884static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11885 TargetTransformInfo *TTI,
11886 bool MustMatchOrInst) {
11887 // Look past the root to find a source value. Arbitrarily follow the
11888 // path through operand 0 of any 'or'. Also, peek through optional
11889 // shift-left-by-multiple-of-8-bits.
11890 Value *ZextLoad = Root;
11891 const APInt *ShAmtC;
11892 bool FoundOr = false;
11893 while (!isa<ConstantExpr>(ZextLoad) &&
11894 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11895 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
11896 ShAmtC->urem(8) == 0))) {
11897 auto *BinOp = cast<BinaryOperator>(ZextLoad);
11898 ZextLoad = BinOp->getOperand(0);
11899 if (BinOp->getOpcode() == Instruction::Or)
11900 FoundOr = true;
11901 }
11902 // Check if the input is an extended load of the required or/shift expression.
11903 Value *Load;
11904 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11905 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
11906 return false;
11907
11908 // Require that the total load bit width is a legal integer type.
11909 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
11910 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
11911 Type *SrcTy = Load->getType();
11912 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
11913 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
11914 return false;
11915
11916 // Everything matched - assume that we can fold the whole sequence using
11917 // load combining.
11918 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
11919 << *(cast<Instruction>(Root)) << "\n");
11920
11921 return true;
11922}
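// Illustration (annotation, assumed example, not from the upstream source):
// the pattern recognized above is the classic byte-combine idiom, e.g. for
// NumElts == 4 loads of i8:
//   zext(l0) | (zext(l1) << 8) | (zext(l2) << 16) | (zext(l3) << 24)
// which the backend can fold into a single i32 load, so SLP leaves it alone
// when the combined width (here 32 bits) is a legal integer type.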
11923
11924 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
11925 if (RdxKind != RecurKind::Or)
11926 return false;
11927
11928 unsigned NumElts = VectorizableTree[0]->Scalars.size();
11929 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11930 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
11931 /* MatchOr */ false);
11932}
11933
11934 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
11935 // Peek through a final sequence of stores and check if all operations are
11936 // likely to be load-combined.
11937 unsigned NumElts = Stores.size();
11938 for (Value *Scalar : Stores) {
11939 Value *X;
11940 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
11941 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
11942 return false;
11943 }
11944 return true;
11945}
11946
11947bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
11948 if (!DebugCounter::shouldExecute(VectorizedGraphs))
11949 return true;
11950
11951 // Graph is empty - do nothing.
11952 if (VectorizableTree.empty()) {
11953 assert(ExternalUses.empty() && "We shouldn't have any external users");
11954
11955 return true;
11956 }
11957
11958 // No need to vectorize inserts of gathered values.
11959 if (VectorizableTree.size() == 2 &&
11960 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
11961 VectorizableTree[1]->isGather() &&
11962 (VectorizableTree[1]->getVectorFactor() <= 2 ||
11963 !(isSplat(VectorizableTree[1]->Scalars) ||
11964 allConstant(VectorizableTree[1]->Scalars))))
11965 return true;
11966
11967 // If the graph includes only PHI nodes and gathers, it is definitely not
11968 // profitable for the vectorization, we can skip it, if the cost threshold is
11969 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
11970 // gathers/buildvectors.
11971 constexpr int Limit = 4;
11972 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
11973 !VectorizableTree.empty() &&
11974 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11975 return (TE->isGather() &&
11976 TE->getOpcode() != Instruction::ExtractElement &&
11977 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
11978 TE->getOpcode() == Instruction::PHI;
11979 }))
11980 return true;
11981
11982 // We can vectorize the tree if its size is greater than or equal to the
11983 // minimum size specified by the MinTreeSize command line option.
11984 if (VectorizableTree.size() >= MinTreeSize)
11985 return false;
11986
11987 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
11988 // can vectorize it if we can prove it fully vectorizable.
11989 if (isFullyVectorizableTinyTree(ForReduction))
11990 return false;
11991
11992 // Check if any of the gather node forms an insertelement buildvector
11993 // somewhere.
11994 bool IsAllowedSingleBVNode =
11995 VectorizableTree.size() > 1 ||
11996 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
11997 !VectorizableTree.front()->isAltShuffle() &&
11998 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
11999 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12000 allSameBlock(VectorizableTree.front()->Scalars));
12001 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12002 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12003 return isa<ExtractElementInst, UndefValue>(V) ||
12004 (IsAllowedSingleBVNode &&
12005 !V->hasNUsesOrMore(UsesLimit) &&
12006 any_of(V->users(), IsaPred<InsertElementInst>));
12007 });
12008 }))
12009 return false;
12010
12011 if (VectorizableTree.back()->isGather() &&
12012 VectorizableTree.back()->isAltShuffle() &&
12013 VectorizableTree.back()->getVectorFactor() > 2 &&
12014 allSameBlock(VectorizableTree.back()->Scalars) &&
12015 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12016 TTI->getScalarizationOverhead(
12017 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12018 VectorizableTree.back()->getVectorFactor()),
12019 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12020 /*Insert=*/true, /*Extract=*/false,
12021 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
12022 return false;
12023
12024 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12025 // vectorizable.
12026 return true;
12027}
12028
12029 bool BoUpSLP::isTreeNotExtendable() const {
12030 if (getCanonicalGraphSize() != getTreeSize()) {
12031 constexpr unsigned SmallTree = 3;
12032 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12033 getCanonicalGraphSize() <= SmallTree &&
12034 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12035 [](const std::unique_ptr<TreeEntry> &TE) {
12036 return TE->isGather() &&
12037 TE->getOpcode() == Instruction::Load &&
12038 !allSameBlock(TE->Scalars);
12039 }) == 1)
12040 return true;
12041 return false;
12042 }
12043 bool Res = false;
12044 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12045 TreeEntry &E = *VectorizableTree[Idx];
12046 if (!E.isGather())
12047 continue;
12048 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12049 return false;
12050 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12051 continue;
12052 Res = true;
12053 }
12054 return Res;
12055}
12056
12057 InstructionCost BoUpSLP::getSpillCost() const {
12058 // Walk from the bottom of the tree to the top, tracking which values are
12059 // live. When we see a call instruction that is not part of our tree,
12060 // query TTI to see if there is a cost to keeping values live over it
12061 // (for example, if spills and fills are required).
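// Roughly: for every real call (i.e. not a cheap or assume-like intrinsic)
// found between two vectorized bundles, we charge
// TTI->getCostOfKeepingLiveOverCall for the tree values live across it,
// widened to BundleWidth lanes.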
12062 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12063 InstructionCost Cost = 0;
12064
12065 SmallPtrSet<Instruction *, 4> LiveValues;
12066 Instruction *PrevInst = nullptr;
12067
12068 // The entries in VectorizableTree are not necessarily ordered by their
12069 // position in basic blocks. Collect them and order them by dominance so later
12070 // instructions are guaranteed to be visited first. For instructions in
12071 // different basic blocks, we only scan to the beginning of the block, so
12072 // their order does not matter, as long as all instructions in a basic block
12073 // are grouped together. Using dominance ensures a deterministic order.
12074 SmallVector<Instruction *, 16> OrderedScalars;
12075 for (const auto &TEPtr : VectorizableTree) {
12076 if (TEPtr->State != TreeEntry::Vectorize)
12077 continue;
12078 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12079 if (!Inst)
12080 continue;
12081 OrderedScalars.push_back(Inst);
12082 }
12083 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12084 auto *NodeA = DT->getNode(A->getParent());
12085 auto *NodeB = DT->getNode(B->getParent());
12086 assert(NodeA && "Should only process reachable instructions");
12087 assert(NodeB && "Should only process reachable instructions");
12088 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12089 "Different nodes should have different DFS numbers");
12090 if (NodeA != NodeB)
12091 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12092 return B->comesBefore(A);
12093 });
12094
12095 for (Instruction *Inst : OrderedScalars) {
12096 if (!PrevInst) {
12097 PrevInst = Inst;
12098 continue;
12099 }
12100
12101 // Update LiveValues.
12102 LiveValues.erase(PrevInst);
12103 for (auto &J : PrevInst->operands()) {
12104 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12105 LiveValues.insert(cast<Instruction>(&*J));
12106 }
12107
12108 LLVM_DEBUG({
12109 dbgs() << "SLP: #LV: " << LiveValues.size();
12110 for (auto *X : LiveValues)
12111 dbgs() << " " << X->getName();
12112 dbgs() << ", Looking at ";
12113 Inst->dump();
12114 });
12115
12116 // Now find the sequence of instructions between PrevInst and Inst.
12117 unsigned NumCalls = 0;
12118 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12119 PrevInstIt =
12120 PrevInst->getIterator().getReverse();
12121 while (InstIt != PrevInstIt) {
12122 if (PrevInstIt == PrevInst->getParent()->rend()) {
12123 PrevInstIt = Inst->getParent()->rbegin();
12124 continue;
12125 }
12126
12127 auto NoCallIntrinsic = [this](Instruction *I) {
12128 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12129 if (II->isAssumeLikeIntrinsic())
12130 return true;
12131 FastMathFlags FMF;
12132 SmallVector<Type *, 4> Tys;
12133 for (auto &ArgOp : II->args())
12134 Tys.push_back(ArgOp->getType());
12135 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12136 FMF = FPMO->getFastMathFlags();
12137 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12138 FMF);
12139 InstructionCost IntrCost =
12140 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12141 InstructionCost CallCost = TTI->getCallInstrCost(
12142 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12143 if (IntrCost < CallCost)
12144 return true;
12145 }
12146 return false;
12147 };
12148
12149 // Debug information does not impact spill cost.
12150 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12151 &*PrevInstIt != PrevInst)
12152 NumCalls++;
12153
12154 ++PrevInstIt;
12155 }
12156
12157 if (NumCalls) {
12158 SmallVector<Type *, 4> V;
12159 for (auto *II : LiveValues) {
12160 auto *ScalarTy = II->getType();
12161 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12162 ScalarTy = VectorTy->getElementType();
12163 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12164 }
12165 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12166 }
12167
12168 PrevInst = Inst;
12169 }
12170
12171 return Cost;
12172}
12173
12174/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the
12175/// buildvector sequence.
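/// For example (illustrative): given the chain
///   %i0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <2 x i32> %i0, i32 %b, i32 1
/// isFirstInsertElement(%i0, %i1) returns true and
/// isFirstInsertElement(%i1, %i0) returns false.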
12176 static bool isFirstInsertElement(const InsertElementInst *IE1,
12177 const InsertElementInst *IE2) {
12178 if (IE1 == IE2)
12179 return false;
12180 const auto *I1 = IE1;
12181 const auto *I2 = IE2;
12182 const InsertElementInst *PrevI1;
12183 const InsertElementInst *PrevI2;
12184 unsigned Idx1 = *getElementIndex(IE1);
12185 unsigned Idx2 = *getElementIndex(IE2);
12186 do {
12187 if (I2 == IE1)
12188 return true;
12189 if (I1 == IE2)
12190 return false;
12191 PrevI1 = I1;
12192 PrevI2 = I2;
12193 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12194 getElementIndex(I1).value_or(Idx2) != Idx2)
12195 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12196 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12197 getElementIndex(I2).value_or(Idx1) != Idx1)
12198 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12199 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12200 llvm_unreachable("Two different buildvectors not expected.");
12201}
12202
12203namespace {
12204/// Returns the incoming Value * if the requested type is Value * too, or a
12205/// default value otherwise.
12206struct ValueSelect {
12207 template <typename U>
12208 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12209 return V;
12210 }
12211 template <typename U>
12212 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12213 return U();
12214 }
12215};
12216} // namespace
12217
12218/// Does the analysis of the provided shuffle masks and performs the requested
12219/// actions on the vectors with the given shuffle masks. It tries to do it in
12220/// several steps.
12221/// 1. If the Base vector is not an undef vector, resize the very first mask to
12222/// have a common VF and perform the action for 2 input vectors (including the
12223/// non-undef Base). Other shuffle masks are combined with the result of the
12224/// first stage and processed as a shuffle of 2 elements.
12225/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12226/// the action only for 1 vector with the given mask, if it is not the identity
12227/// mask.
12228/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12229/// vectors, combining the masks properly between the steps.
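/// For example (illustrative): with an undef Base and two inputs V1 and V2 of
/// VF 4 carrying the masks {0, 1, poison, poison} and {poison, poison, 0, 1},
/// the masks are merged into the single two-source mask {0, 1, 4, 5} and
/// Action({0, 1, 4, 5}, {V1, V2}) is invoked.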
12230template <typename T>
12231 static T *performExtractsShuffleAction(
12232 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12233 function_ref<unsigned(T *)> GetVF,
12234 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12235 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
12236 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12237 SmallVector<int> Mask(ShuffleMask.begin()->second);
12238 auto VMIt = std::next(ShuffleMask.begin());
12239 T *Prev = nullptr;
12240 SmallBitVector UseMask =
12241 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12242 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12243 if (!IsBaseUndef.all()) {
12244 // Base is not undef, need to combine it with the next subvectors.
12245 std::pair<T *, bool> Res =
12246 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12247 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12248 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12249 if (Mask[Idx] == PoisonMaskElem)
12250 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12251 else
12252 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12253 }
12254 auto *V = ValueSelect::get<T *>(Base);
12255 (void)V;
12256 assert((!V || GetVF(V) == Mask.size()) &&
12257 "Expected base vector of VF number of elements.");
12258 Prev = Action(Mask, {nullptr, Res.first});
12259 } else if (ShuffleMask.size() == 1) {
12260 // Base is undef and only 1 vector is shuffled - perform the action only for
12261 // single vector, if the mask is not the identity mask.
12262 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12263 /*ForSingleMask=*/true);
12264 if (Res.second)
12265 // Identity mask is found.
12266 Prev = Res.first;
12267 else
12268 Prev = Action(Mask, {ShuffleMask.begin()->first});
12269 } else {
12270 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
12271 // shuffles step by step, combining the shuffles between the steps.
12272 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12273 unsigned Vec2VF = GetVF(VMIt->first);
12274 if (Vec1VF == Vec2VF) {
12275 // No need to resize the input vectors since they are of the same size, we
12276 // can shuffle them directly.
12277 ArrayRef<int> SecMask = VMIt->second;
12278 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12279 if (SecMask[I] != PoisonMaskElem) {
12280 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12281 Mask[I] = SecMask[I] + Vec1VF;
12282 }
12283 }
12284 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12285 } else {
12286 // Vectors of different sizes - resize and reshuffle.
12287 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12288 /*ForSingleMask=*/false);
12289 std::pair<T *, bool> Res2 =
12290 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12291 ArrayRef<int> SecMask = VMIt->second;
12292 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12293 if (Mask[I] != PoisonMaskElem) {
12294 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12295 if (Res1.second)
12296 Mask[I] = I;
12297 } else if (SecMask[I] != PoisonMaskElem) {
12298 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12299 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12300 }
12301 }
12302 Prev = Action(Mask, {Res1.first, Res2.first});
12303 }
12304 VMIt = std::next(VMIt);
12305 }
12306 bool IsBaseNotUndef = !IsBaseUndef.all();
12307 (void)IsBaseNotUndef;
12308 // Perform requested actions for the remaining masks/vectors.
12309 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12310 // Shuffle other input vectors, if any.
12311 std::pair<T *, bool> Res =
12312 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12313 ArrayRef<int> SecMask = VMIt->second;
12314 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12315 if (SecMask[I] != PoisonMaskElem) {
12316 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12317 "Multiple uses of scalars.");
12318 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12319 } else if (Mask[I] != PoisonMaskElem) {
12320 Mask[I] = I;
12321 }
12322 }
12323 Prev = Action(Mask, {Prev, Res.first});
12324 }
12325 return Prev;
12326}
12327
12328namespace {
12329/// Data type for handling buildvector sequences with the reused scalars from
12330/// other tree entries.
12331template <typename T> struct ShuffledInsertData {
12332 /// List of insertelements to be replaced by shuffles.
12333 SmallVector<InsertElementInst *> InsertElements;
12334 /// The parent vectors and shuffle mask for the given list of inserts.
12335 MapVector<T, SmallVector<int>> ValueMasks;
12336};
12337} // namespace
12338
12339 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12340 InstructionCost Cost = 0;
12341 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12342 << VectorizableTree.size() << ".\n");
12343
12344 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12345
12346 SmallPtrSet<Value *, 4> CheckedExtracts;
12347 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12348 TreeEntry &TE = *VectorizableTree[I];
12349 // No need to count the cost for combined entries - they are combined, so
12350 // just skip their cost.
12351 if (TE.State == TreeEntry::CombinedVectorize) {
12352 LLVM_DEBUG(
12353 dbgs() << "SLP: Skipping cost for combined node that starts with "
12354 << *TE.Scalars[0] << ".\n";
12355 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12356 continue;
12357 }
12358 if (TE.isGather()) {
12359 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12360 E && E->getVectorFactor() == TE.getVectorFactor() &&
12361 E->isSame(TE.Scalars)) {
12362 // Some gather nodes might be exactly the same as some vectorizable
12363 // nodes after reordering; we need to handle that.
12364 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12365 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12366 << "SLP: Current total cost = " << Cost << "\n");
12367 continue;
12368 }
12369 }
12370
12371 // Exclude the cost of gathered-loads nodes which are not used. These nodes
12372 // were built as part of the final attempt to vectorize gathered loads.
12373 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12374 "Expected gather nodes with users only.");
12375
12376 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12377 Cost += C;
12378 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12379 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12380 << "SLP: Current total cost = " << Cost << "\n");
12381 }
12382
12383 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12384 InstructionCost ExtractCost = 0;
12385 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
12386 SmallVector<APInt> DemandedElts;
12387 SmallDenseSet<Value *, 4> UsedInserts;
12388 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
12389 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12390 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
12391 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12392 // Keep track of each {Scalar, Index, User} tuple.
12393 // On AArch64, this helps in fusing a mov instruction, associated with
12394 // extractelement, with fmul in the backend so that extractelement is free.
12395 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
12396 for (ExternalUser &EU : ExternalUses) {
12397 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12398 }
12399 for (ExternalUser &EU : ExternalUses) {
12400 // Uses by ephemeral values are free (because the ephemeral value will be
12401 // removed prior to code generation, and so the extraction will be
12402 // removed as well).
12403 if (EphValues.count(EU.User))
12404 continue;
12405
12406 // The user is in an unreachable block, in an EH pad (rarely executed), or
12407 // in a block terminated by an unreachable instruction.
12408 if (BasicBlock *UserParent =
12409 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12410 UserParent &&
12411 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12412 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12413 continue;
12414
12415 // We only add extract cost once for the same scalar.
12416 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12417 !ExtractCostCalculated.insert(EU.Scalar).second)
12418 continue;
12419
12420 // No extract cost for vector "scalar"
12421 if (isa<FixedVectorType>(EU.Scalar->getType()))
12422 continue;
12423
12424 // If found user is an insertelement, do not calculate extract cost but try
12425 // to detect it as a final shuffled/identity match.
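// I.e., if external users rebuild lanes of this bundle through a chain of
// insertelements, record a shuffle mask and the demanded elements so the whole
// chain can be costed (and later emitted) as a single shuffle of the
// vectorized value instead of per-lane extracts.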
12426 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12427 VU && VU->getOperand(1) == EU.Scalar) {
12428 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12429 if (!UsedInserts.insert(VU).second)
12430 continue;
12431 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12432 if (InsertIdx) {
12433 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12434 auto *It = find_if(
12435 ShuffledInserts,
12436 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12437 // Checks if 2 insertelements are from the same buildvector.
12438 InsertElementInst *VecInsert = Data.InsertElements.front();
12439 return areTwoInsertFromSameBuildVector(
12440 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12441 Value *Op0 = II->getOperand(0);
12442 if (getTreeEntry(II) && !getTreeEntry(Op0))
12443 return nullptr;
12444 return Op0;
12445 });
12446 });
12447 int VecId = -1;
12448 if (It == ShuffledInserts.end()) {
12449 auto &Data = ShuffledInserts.emplace_back();
12450 Data.InsertElements.emplace_back(VU);
12451 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12452 VecId = ShuffledInserts.size() - 1;
12453 auto It = MinBWs.find(ScalarTE);
12454 if (It != MinBWs.end() &&
12455 VectorCasts
12456 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12457 .second) {
12458 unsigned BWSz = It->second.first;
12459 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12460 unsigned VecOpcode;
12461 if (DstBWSz < BWSz)
12462 VecOpcode = Instruction::Trunc;
12463 else
12464 VecOpcode =
12465 It->second.second ? Instruction::SExt : Instruction::ZExt;
12468 VecOpcode, FTy,
12469 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12470 FTy->getNumElements()),
12472 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12473 << " for extending externally used vector with "
12474 "non-equal minimum bitwidth.\n");
12475 Cost += C;
12476 }
12477 } else {
12478 if (isFirstInsertElement(VU, It->InsertElements.front()))
12479 It->InsertElements.front() = VU;
12480 VecId = std::distance(ShuffledInserts.begin(), It);
12481 }
12482 int InIdx = *InsertIdx;
12483 SmallVectorImpl<int> &Mask =
12484 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12485 if (Mask.empty())
12486 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12487 Mask[InIdx] = EU.Lane;
12488 DemandedElts[VecId].setBit(InIdx);
12489 continue;
12490 }
12491 }
12492 }
12493
12494 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12495 // If we plan to rewrite the tree in a smaller type, we will need to sign
12496 // extend the extracted value back to the original type. Here, we account
12497 // for the extract and the added cost of the sign extend if needed.
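// E.g., if the bundle was narrowed to i16 but the scalar is used as i32, each
// externally used lane is charged as an extract from the <N x i16> vector plus
// a sext/zext back to i32 (via getExtractWithExtendCost).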
12498 InstructionCost ExtraCost = TTI::TCC_Free;
12499 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12500 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12501 auto It = MinBWs.find(Entry);
12502 if (It != MinBWs.end()) {
12503 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12504 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12505 ? Instruction::ZExt
12506 : Instruction::SExt;
12507 VecTy = getWidenedType(MinTy, BundleWidth);
12508 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12509 VecTy, EU.Lane);
12510 } else {
12511 ExtraCost =
12512 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12513 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12514 }
12515 // Leave the scalar instructions as is if they are cheaper than extracts.
12516 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12517 Entry->getOpcode() == Instruction::Load) {
12518 // Checks if the user of the external scalar is a phi in the loop body.
12519 auto IsPhiInLoop = [&](const ExternalUser &U) {
12520 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12521 auto *I = cast<Instruction>(U.Scalar);
12522 const Loop *L = LI->getLoopFor(Phi->getParent());
12523 return L && (Phi->getParent() == I->getParent() ||
12524 L == LI->getLoopFor(I->getParent()));
12525 }
12526 return false;
12527 };
12528 if (!ValueToExtUses) {
12529 ValueToExtUses.emplace();
12530 for_each(enumerate(ExternalUses), [&](const auto &P) {
12531 // Ignore phis in loops.
12532 if (IsPhiInLoop(P.value()))
12533 return;
12534
12535 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12536 });
12537 }
12538 // Can use the original instruction if no operands are vectorized or they
12539 // are already marked as externally used.
12540 auto *Inst = cast<Instruction>(EU.Scalar);
12541 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12542 auto OperandIsScalar = [&](Value *V) {
12543 if (!getTreeEntry(V)) {
12544 // Some extractelements might not be vectorized, but
12545 // transformed into a shuffle and removed from the function;
12546 // consider that here.
12547 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12548 return !EE->hasOneUse() || !MustGather.contains(EE);
12549 return true;
12550 }
12551 return ValueToExtUses->contains(V);
12552 };
12553 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12554 bool CanBeUsedAsScalarCast = false;
12555 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12556 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12557 Op && all_of(Op->operands(), OperandIsScalar)) {
12558 InstructionCost OpCost =
12559 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12560 ? TTI->getInstructionCost(Op, CostKind)
12561 : 0;
12562 if (ScalarCost + OpCost <= ExtraCost) {
12563 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12564 ScalarCost += OpCost;
12565 }
12566 }
12567 }
12568 if (CanBeUsedAsScalar) {
12569 bool KeepScalar = ScalarCost <= ExtraCost;
12570 // Try to keep the original scalar if the user is a phi node from the same
12571 // block as the root phis currently being vectorized. This keeps better
12572 // ordering info for the PHIs being vectorized.
12573 bool IsProfitablePHIUser =
12574 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12575 VectorizableTree.front()->Scalars.size() > 2)) &&
12576 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12577 !Inst->hasNUsesOrMore(UsesLimit) &&
12578 none_of(Inst->users(),
12579 [&](User *U) {
12580 auto *PHIUser = dyn_cast<PHINode>(U);
12581 return (!PHIUser ||
12582 PHIUser->getParent() !=
12583 cast<Instruction>(
12584 VectorizableTree.front()->getMainOp())
12585 ->getParent()) &&
12586 !getTreeEntry(U);
12587 }) &&
12588 count_if(Entry->Scalars, [&](Value *V) {
12589 return ValueToExtUses->contains(V);
12590 }) <= 2;
12591 if (IsProfitablePHIUser) {
12592 KeepScalar = true;
12593 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12594 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12595 (!GatheredLoadsEntriesFirst.has_value() ||
12596 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12597 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12598 return ValueToExtUses->contains(V);
12599 });
12600 auto It = ExtractsCount.find(Entry);
12601 if (It != ExtractsCount.end()) {
12602 assert(ScalarUsesCount >= It->getSecond().size() &&
12603 "Expected total number of external uses not less than "
12604 "number of scalar uses.");
12605 ScalarUsesCount -= It->getSecond().size();
12606 }
12607 // Keep the original scalar if the number of externally used instructions in
12608 // the same entry is not a power of 2. It may help to do some extra
12609 // vectorization for now.
12610 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12611 }
12612 if (KeepScalar) {
12613 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12614 for_each(Inst->operands(), [&](Value *V) {
12615 auto It = ValueToExtUses->find(V);
12616 if (It != ValueToExtUses->end()) {
12617 // Replace all uses to avoid compiler crash.
12618 ExternalUses[It->second].User = nullptr;
12619 }
12620 });
12621 ExtraCost = ScalarCost;
12622 if (!IsPhiInLoop(EU))
12623 ExtractsCount[Entry].insert(Inst);
12624 if (CanBeUsedAsScalarCast) {
12625 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12626 // Update the users of the operands of the cast operand to avoid
12627 // compiler crash.
12628 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12629 for_each(IOp->operands(), [&](Value *V) {
12630 auto It = ValueToExtUses->find(V);
12631 if (It != ValueToExtUses->end()) {
12632 // Replace all uses to avoid compiler crash.
12633 ExternalUses[It->second].User = nullptr;
12634 }
12635 });
12636 }
12637 }
12638 }
12639 }
12640 }
12641
12642 ExtractCost += ExtraCost;
12643 }
12644 // Insert external uses for the operands of casts that will be emitted as
12645 // scalars instead of extractelements.
12646 for (Value *V : ScalarOpsFromCasts) {
12647 ExternalUsesAsOriginalScalar.insert(V);
12648 if (const TreeEntry *E = getTreeEntry(V)) {
12649 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12650 }
12651 }
12652 // Add reduced value cost, if resized.
12653 if (!VectorizedVals.empty()) {
12654 const TreeEntry &Root = *VectorizableTree.front();
12655 auto BWIt = MinBWs.find(&Root);
12656 if (BWIt != MinBWs.end()) {
12657 Type *DstTy = Root.Scalars.front()->getType();
12658 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12659 unsigned SrcSz =
12660 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12661 if (OriginalSz != SrcSz) {
12662 unsigned Opcode = Instruction::Trunc;
12663 if (OriginalSz > SrcSz)
12664 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12665 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12666 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12667 assert(SLPReVec && "Only supported by REVEC.");
12668 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12669 }
12670 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12671 TTI::CastContextHint::None,
12672 TTI::TCK_RecipThroughput);
12673 }
12674 }
12675 }
12676
12677 InstructionCost SpillCost = getSpillCost();
12678 Cost += SpillCost + ExtractCost;
12679 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12680 bool) {
12681 InstructionCost C = 0;
12682 unsigned VF = Mask.size();
12683 unsigned VecVF = TE->getVectorFactor();
12684 if (VF != VecVF &&
12685 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12686 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12687 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12688 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12689 OrigMask.begin());
12690 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12691 getWidenedType(TE->getMainOp()->getType(), VecVF),
12692 OrigMask);
12693 LLVM_DEBUG(
12694 dbgs() << "SLP: Adding cost " << C
12695 << " for final shuffle of insertelement external users.\n";
12696 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12697 Cost += C;
12698 return std::make_pair(TE, true);
12699 }
12700 return std::make_pair(TE, false);
12701 };
12702 // Calculate the cost of the reshuffled vectors, if any.
12703 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12704 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12705 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12706 unsigned VF = 0;
12707 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12708 ArrayRef<const TreeEntry *> TEs) {
12709 assert((TEs.size() == 1 || TEs.size() == 2) &&
12710 "Expected exactly 1 or 2 tree entries.");
12711 if (TEs.size() == 1) {
12712 if (VF == 0)
12713 VF = TEs.front()->getVectorFactor();
12714 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12715 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12716 !all_of(enumerate(Mask), [=](const auto &Data) {
12717 return Data.value() == PoisonMaskElem ||
12718 (Data.index() < VF &&
12719 static_cast<int>(Data.index()) == Data.value());
12720 })) {
12721 InstructionCost C =
12722 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
12723 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12724 << " for final shuffle of insertelement "
12725 "external users.\n";
12726 TEs.front()->dump();
12727 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12728 Cost += C;
12729 }
12730 } else {
12731 if (VF == 0) {
12732 if (TEs.front() &&
12733 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12734 VF = TEs.front()->getVectorFactor();
12735 else
12736 VF = Mask.size();
12737 }
12738 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12739 InstructionCost C =
12740 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
12741 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12742 << " for final shuffle of vector node and external "
12743 "insertelement users.\n";
12744 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12745 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12746 Cost += C;
12747 }
12748 VF = Mask.size();
12749 return TEs.back();
12750 };
12751 (void)performExtractsShuffleAction<const TreeEntry>(
12752 MutableArrayRef(Vector.data(), Vector.size()), Base,
12753 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12754 EstimateShufflesCost);
12755 InstructionCost InsertCost = TTI->getScalarizationOverhead(
12756 cast<FixedVectorType>(
12757 ShuffledInserts[I].InsertElements.front()->getType()),
12758 DemandedElts[I],
12759 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12760 Cost -= InsertCost;
12761 }
12762
12763 // Add the cost for reduced value resize (if required).
12764 if (ReductionBitWidth != 0) {
12765 assert(UserIgnoreList && "Expected reduction tree.");
12766 const TreeEntry &E = *VectorizableTree.front();
12767 auto It = MinBWs.find(&E);
12768 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12769 unsigned SrcSize = It->second.first;
12770 unsigned DstSize = ReductionBitWidth;
12771 unsigned Opcode = Instruction::Trunc;
12772 if (SrcSize < DstSize) {
12773 bool IsArithmeticExtendedReduction =
12774 all_of(*UserIgnoreList, [](Value *V) {
12775 auto *I = cast<Instruction>(V);
12776 return is_contained({Instruction::Add, Instruction::FAdd,
12777 Instruction::Mul, Instruction::FMul,
12778 Instruction::And, Instruction::Or,
12779 Instruction::Xor},
12780 I->getOpcode());
12781 });
12782 if (IsArithmeticExtendedReduction)
12783 Opcode =
12784 Instruction::BitCast; // Handle it by getExtendedReductionCost
12785 else
12786 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12787 }
12788 if (Opcode != Instruction::BitCast) {
12789 auto *SrcVecTy =
12790 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12791 auto *DstVecTy =
12792 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12793 TTI::CastContextHint CCH = getCastContextHint(E);
12794 InstructionCost CastCost;
12795 switch (E.getOpcode()) {
12796 case Instruction::SExt:
12797 case Instruction::ZExt:
12798 case Instruction::Trunc: {
12799 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12800 CCH = getCastContextHint(*OpTE);
12801 break;
12802 }
12803 default:
12804 break;
12805 }
12806 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12807 TTI::TCK_RecipThroughput);
12808 Cost += CastCost;
12809 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12810 << " for final resize for reduction from " << SrcVecTy
12811 << " to " << DstVecTy << "\n";
12812 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12813 }
12814 }
12815 }
12816
12817#ifndef NDEBUG
12818 SmallString<256> Str;
12819 {
12821 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12822 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12823 << "SLP: Total Cost = " << Cost << ".\n";
12824 }
12825 LLVM_DEBUG(dbgs() << Str);
12826 if (ViewSLPTree)
12827 ViewGraph(this, "SLP" + F->getName(), false, Str);
12828#endif
12829
12830 return Cost;
12831}
12832
12833/// Tries to find extractelement instructions with constant indices from a fixed
12834/// vector type and gather such instructions into a bunch, which is highly likely
12835/// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12836/// successful, the matched scalars are replaced by poison values in \p VL for
12837/// future analysis.
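/// For example (illustrative): for VL = { (extractelement %v, 0),
/// (extractelement %v, 1), %a, %b } the two extracts are moved out of \p VL
/// (replaced by poison) and reported as a single-source shuffle of %v with the
/// mask {0, 1, poison, poison}.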
12838std::optional<TTI::ShuffleKind>
12839BoUpSLP::tryToGatherSingleRegisterExtractElements(
12840 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
12841 // Scan list of gathered scalars for extractelements that can be represented
12842 // as shuffles.
12843 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
12844 SmallVector<int> UndefVectorExtracts;
12845 for (int I = 0, E = VL.size(); I < E; ++I) {
12846 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12847 if (!EI) {
12848 if (isa<UndefValue>(VL[I]))
12849 UndefVectorExtracts.push_back(I);
12850 continue;
12851 }
12852 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12853 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12854 continue;
12855 std::optional<unsigned> Idx = getExtractIndex(EI);
12856 // Undefined index.
12857 if (!Idx) {
12858 UndefVectorExtracts.push_back(I);
12859 continue;
12860 }
12861 if (Idx >= VecTy->getNumElements()) {
12862 UndefVectorExtracts.push_back(I);
12863 continue;
12864 }
12865 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12866 ExtractMask.reset(*Idx);
12867 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12868 UndefVectorExtracts.push_back(I);
12869 continue;
12870 }
12871 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12872 }
12873 // Sort the vector operands by the maximum number of uses in extractelements.
12874 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
12875 VectorOpToIdx.takeVector();
12876 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12877 return P1.second.size() > P2.second.size();
12878 });
12879 // Find the best pair of the vectors or a single vector.
12880 const int UndefSz = UndefVectorExtracts.size();
12881 unsigned SingleMax = 0;
12882 unsigned PairMax = 0;
12883 if (!Vectors.empty()) {
12884 SingleMax = Vectors.front().second.size() + UndefSz;
12885 if (Vectors.size() > 1) {
12886 auto *ItNext = std::next(Vectors.begin());
12887 PairMax = SingleMax + ItNext->second.size();
12888 }
12889 }
12890 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12891 return std::nullopt;
12892 // Check if it is better to perform a shuffle of 2 vectors or just of a single
12893 // vector.
12894 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
12895 SmallVector<Value *> GatheredExtracts(
12896 VL.size(), PoisonValue::get(VL.front()->getType()));
12897 if (SingleMax >= PairMax && SingleMax) {
12898 for (int Idx : Vectors.front().second)
12899 std::swap(GatheredExtracts[Idx], VL[Idx]);
12900 } else if (!Vectors.empty()) {
12901 for (unsigned Idx : {0, 1})
12902 for (int Idx : Vectors[Idx].second)
12903 std::swap(GatheredExtracts[Idx], VL[Idx]);
12904 }
12905 // Add extracts from undefs too.
12906 for (int Idx : UndefVectorExtracts)
12907 std::swap(GatheredExtracts[Idx], VL[Idx]);
12908 // Check that the gather of extractelements can be represented as just a
12909 // shuffle of one or two of the vectors the scalars are extracted from.
12910 std::optional<TTI::ShuffleKind> Res =
12911 isFixedVectorShuffle(GatheredExtracts, Mask);
12912 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
12913 // TODO: try to check other subsets if possible.
12914 // Restore the original VL if attempt was not successful.
12915 copy(SavedVL, VL.begin());
12916 return std::nullopt;
12917 }
12918 // Restore unused scalars from mask, if some of the extractelements were not
12919 // selected for shuffle.
12920 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
12921 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
12922 isa<UndefValue>(GatheredExtracts[I])) {
12923 std::swap(VL[I], GatheredExtracts[I]);
12924 continue;
12925 }
12926 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12927 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
12928 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
12929 is_contained(UndefVectorExtracts, I))
12930 continue;
12931 }
12932 return Res;
12933}
12934
12935/// Tries to find extractelement instructions with constant indices from a fixed
12936/// vector type and gather such instructions into a bunch, which is highly likely
12937/// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12938/// successful, the matched scalars are replaced by poison values in \p VL for
12939/// future analysis.
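/// Unlike the single-register variant above, this splits \p VL into NumParts
/// register-sized slices, runs the per-register analysis on each slice and
/// combines the per-part masks into \p Mask.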
12940 SmallVector<std::optional<TTI::ShuffleKind>>
12941BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
12942 SmallVectorImpl<int> &Mask,
12943 unsigned NumParts) const {
12944 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
12945 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
12946 Mask.assign(VL.size(), PoisonMaskElem);
12947 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
12948 for (unsigned Part : seq<unsigned>(NumParts)) {
12949 // Scan list of gathered scalars for extractelements that can be represented
12950 // as shuffles.
12951 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
12952 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
12953 SmallVector<int> SubMask;
12954 std::optional<TTI::ShuffleKind> Res =
12955 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
12956 ShufflesRes[Part] = Res;
12957 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
12958 }
12959 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
12960 return Res.has_value();
12961 }))
12962 ShufflesRes.clear();
12963 return ShufflesRes;
12964}
12965
12966std::optional<TargetTransformInfo::ShuffleKind>
12967BoUpSLP::isGatherShuffledSingleRegisterEntry(
12968 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
12969 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
12970 Entries.clear();
12971 // TODO: currently checking only for Scalars in the tree entry, need to count
12972 // reused elements too for better cost estimation.
12973 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
12974 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
12975 : TE->UserTreeIndices.front();
12976 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
12977 const BasicBlock *TEInsertBlock = nullptr;
12978 // Main node of PHI entries keeps the correct order of operands/incoming
12979 // blocks.
12980 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
12981 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
12982 TEInsertPt = TEInsertBlock->getTerminator();
12983 } else {
12984 TEInsertBlock = TEInsertPt->getParent();
12985 }
12986 if (!DT->isReachableFromEntry(TEInsertBlock))
12987 return std::nullopt;
12988 auto *NodeUI = DT->getNode(TEInsertBlock);
12989 assert(NodeUI && "Should only process reachable instructions");
12990 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
12991 auto CheckOrdering = [&](const Instruction *InsertPt) {
12992 // Argument InsertPt is an instruction where vector code for some other
12993 // tree entry (one that shares one or more scalars with TE) is going to be
12994 // generated. This lambda returns true if insertion point of vector code
12995 // for the TE dominates that point (otherwise dependency is the other way
12996 // around). The other node is not limited to be of a gather kind. Gather
12997 // nodes are not scheduled and their vector code is inserted before their
12998 // first user. If user is PHI, that is supposed to be at the end of a
12999 // predecessor block. Otherwise it is the last instruction among scalars of
13000 // the user node. So, instead of checking dependency between instructions
13001 // themselves, we check dependency between their insertion points for vector
13002 // code (since each scalar instruction ends up as a lane of a vector
13003 // instruction).
13004 const BasicBlock *InsertBlock = InsertPt->getParent();
13005 auto *NodeEUI = DT->getNode(InsertBlock);
13006 if (!NodeEUI)
13007 return false;
13008 assert((NodeUI == NodeEUI) ==
13009 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13010 "Different nodes should have different DFS numbers");
13011 // Check the order of the gather nodes users.
13012 if (TEInsertPt->getParent() != InsertBlock &&
13013 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13014 return false;
13015 if (TEInsertPt->getParent() == InsertBlock &&
13016 TEInsertPt->comesBefore(InsertPt))
13017 return false;
13018 return true;
13019 };
13020 // Find all tree entries used by the gathered values. If no common entries
13021 // found - not a shuffle.
13022 // Here we build a set of tree nodes for each gathered value and try to
13023 // find the intersection between these sets. If we have at least one common
13024 // tree node for each gathered value - we have just a permutation of the
13025 // single vector. If we have 2 different sets, we're in a situation where we
13026 // have a permutation of 2 input vectors.
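// E.g., if every non-constant scalar in VL also lives in vectorized entry E1,
// the gather is a permutation of E1's vector; if the scalars are split between
// E1 and E2, it becomes a two-source shuffle; more than 2 sources falls back
// to a regular gather.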
13027 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
13028 DenseMap<Value *, int> UsedValuesEntry;
13029 for (Value *V : VL) {
13030 if (isConstant(V))
13031 continue;
13032 // Build a list of tree entries where V is used.
13033 SmallPtrSet<const TreeEntry *, 4> VToTEs;
13034 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13035 if (TEPtr == TE || TEPtr->Idx == 0)
13036 continue;
13037 assert(any_of(TEPtr->Scalars,
13038 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13039 "Must contain at least single gathered value.");
13040 assert(TEPtr->UserTreeIndices.size() == 1 &&
13041 "Expected only single user of a gather node.");
13042 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13043
13044 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13045 const Instruction *InsertPt =
13046 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13047 : &getLastInstructionInBundle(UseEI.UserTE);
13048 if (TEInsertPt == InsertPt) {
13049 // If 2 gathers are operands of the same entry (regardless of whether
13050 // the user is a PHI or not), compare operand indices and use the
13051 // earlier one as the base.
13052 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13053 continue;
13054 // If the user instruction is used for some reason in different
13055 // vectorized nodes - make it depend on index.
13056 if (TEUseEI.UserTE != UseEI.UserTE &&
13057 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13058 continue;
13059 }
13060
13061 // Check if the user node of the TE comes after user node of TEPtr,
13062 // otherwise TEPtr depends on TE.
13063 if ((TEInsertBlock != InsertPt->getParent() ||
13064 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13065 !CheckOrdering(InsertPt))
13066 continue;
13067 VToTEs.insert(TEPtr);
13068 }
13069 if (const TreeEntry *VTE = getTreeEntry(V)) {
13070 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13071 if (VTE->State != TreeEntry::Vectorize) {
13072 auto It = MultiNodeScalars.find(V);
13073 if (It == MultiNodeScalars.end())
13074 continue;
13075 VTE = *It->getSecond().begin();
13076 // Iterate through all vectorized nodes.
13077 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13078 return MTE->State == TreeEntry::Vectorize;
13079 });
13080 if (MIt == It->getSecond().end())
13081 continue;
13082 VTE = *MIt;
13083 }
13084 }
13085 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13086 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13087 continue;
13088 VToTEs.insert(VTE);
13089 }
13090 if (VToTEs.empty())
13091 continue;
13092 if (UsedTEs.empty()) {
13093 // The first iteration, just insert the list of nodes to vector.
13094 UsedTEs.push_back(VToTEs);
13095 UsedValuesEntry.try_emplace(V, 0);
13096 } else {
13097 // Need to check if there are any previously used tree nodes which use V.
13098 // If there are no such nodes, consider that we have one more input
13099 // vector.
13100 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13101 unsigned Idx = 0;
13102 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13103 // Do we have a non-empty intersection of previously listed tree entries
13104 // and tree entries using current V?
13105 set_intersect(VToTEs, Set);
13106 if (!VToTEs.empty()) {
13107 // Yes, write the new subset and continue analysis for the next
13108 // scalar.
13109 Set.swap(VToTEs);
13110 break;
13111 }
13112 VToTEs = SavedVToTEs;
13113 ++Idx;
13114 }
13115 // No non-empty intersection found - need to add a second set of possible
13116 // source vectors.
13117 if (Idx == UsedTEs.size()) {
13118 // If the number of input vectors is greater than 2 - not a permutation,
13119 // fallback to the regular gather.
13120 // TODO: support multiple reshuffled nodes.
13121 if (UsedTEs.size() == 2)
13122 continue;
13123 UsedTEs.push_back(SavedVToTEs);
13124 Idx = UsedTEs.size() - 1;
13125 }
13126 UsedValuesEntry.try_emplace(V, Idx);
13127 }
13128 }
13129
13130 if (UsedTEs.empty()) {
13131 Entries.clear();
13132 return std::nullopt;
13133 }
13134
13135 unsigned VF = 0;
13136 if (UsedTEs.size() == 1) {
13137 // Keep the order to avoid non-determinism.
13138 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13139 UsedTEs.front().end());
13140 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13141 return TE1->Idx < TE2->Idx;
13142 });
13143 // Try to find the perfect match in another gather node at first.
13144 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13145 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13146 });
13147 if (It != FirstEntries.end() &&
13148 ((*It)->getVectorFactor() == VL.size() ||
13149 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13150 TE->ReuseShuffleIndices.size() == VL.size() &&
13151 (*It)->isSame(TE->Scalars)))) {
13152 Entries.push_back(*It);
13153 if ((*It)->getVectorFactor() == VL.size()) {
13154 std::iota(std::next(Mask.begin(), Part * VL.size()),
13155 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13156 } else {
13157 SmallVector<int> CommonMask = TE->getCommonMask();
13158 copy(CommonMask, Mask.begin());
13159 }
13160 // Clear undef scalars.
13161 for (unsigned I : seq<unsigned>(VL.size()))
13162 if (isa<PoisonValue>(VL[I]))
13163 Mask[Part * VL.size() + I] = PoisonMaskElem;
13164 return TargetTransformInfo::SK_PermuteSingleSrc;
13165 }
13166 // No perfect match, just shuffle, so choose the first tree node from the
13167 // tree.
13168 Entries.push_back(FirstEntries.front());
13169 } else {
13170 // Try to find nodes with the same vector factor.
13171 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13172 // Keep the order of tree nodes to avoid non-determinism.
13173 DenseMap<int, const TreeEntry *> VFToTE;
13174 for (const TreeEntry *TE : UsedTEs.front()) {
13175 unsigned VF = TE->getVectorFactor();
13176 auto It = VFToTE.find(VF);
13177 if (It != VFToTE.end()) {
13178 if (It->second->Idx > TE->Idx)
13179 It->getSecond() = TE;
13180 continue;
13181 }
13182 VFToTE.try_emplace(VF, TE);
13183 }
13184 // Same, keep the order to avoid non-determinism.
13185 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13186 UsedTEs.back().end());
13187 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13188 return TE1->Idx < TE2->Idx;
13189 });
13190 for (const TreeEntry *TE : SecondEntries) {
13191 auto It = VFToTE.find(TE->getVectorFactor());
13192 if (It != VFToTE.end()) {
13193 VF = It->first;
13194 Entries.push_back(It->second);
13195 Entries.push_back(TE);
13196 break;
13197 }
13198 }
13199 // No 2 source vectors with the same vector factor - just choose 2 with max
13200 // index.
13201 if (Entries.empty()) {
13202 Entries.push_back(*llvm::max_element(
13203 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13204 return TE1->Idx < TE2->Idx;
13205 }));
13206 Entries.push_back(SecondEntries.front());
13207 VF = std::max(Entries.front()->getVectorFactor(),
13208 Entries.back()->getVectorFactor());
13209 }
13210 }
13211
13212 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13213 // Checks if the 2 PHIs are compatible in terms of how likely they are to be
13214 // vectorized together.
13215 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13216 auto *PHI = cast<PHINode>(V);
13217 auto *PHI1 = cast<PHINode>(V1);
13218 // Check that all incoming values are compatible/from the same parent (if
13219 // they are instructions).
13220 // The incoming values are compatible if they all are constants, or
13221 // instructions with the same/alternate opcodes from the same basic block.
13222 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13223 Value *In = PHI->getIncomingValue(I);
13224 Value *In1 = PHI1->getIncomingValue(I);
13225 if (isConstant(In) && isConstant(In1))
13226 continue;
13227 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
13228 return false;
13229 if (cast<Instruction>(In)->getParent() !=
13230 cast<Instruction>(In1)->getParent())
13231 return false;
13232 }
13233 return true;
13234 };
13235 // Check if the value can be ignored during analysis of shuffled gathers.
13236 // We assume it is better to ignore instructions which do not form splats,
13237 // are not vectorized and are not extractelements (those are handled by the
13238 // extractelements processing), or which may form a vector node in the future.
13239 auto MightBeIgnored = [=](Value *V) {
13240 auto *I = dyn_cast<Instruction>(V);
13241 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13242 !isVectorLikeInstWithConstOps(I) &&
13243 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13244 };
13245 // Check that the neighbor instruction may form a full vector node with the
13246 // current instruction V. It is possible if they have the same/alternate
13247 // opcode and the same parent basic block.
13248 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13249 Value *V1 = VL[Idx];
13250 bool UsedInSameVTE = false;
13251 auto It = UsedValuesEntry.find(V1);
13252 if (It != UsedValuesEntry.end())
13253 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13254 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13255 getSameOpcode({V, V1}, *TLI).getOpcode() &&
13256 cast<Instruction>(V)->getParent() ==
13257 cast<Instruction>(V1)->getParent() &&
13258 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13259 };
13260 // Build a shuffle mask for better cost estimation and vector emission.
13261 SmallBitVector UsedIdxs(Entries.size());
13262 SmallVector<std::pair<unsigned, int>> EntryLanes;
13263 for (int I = 0, E = VL.size(); I < E; ++I) {
13264 Value *V = VL[I];
13265 auto It = UsedValuesEntry.find(V);
13266 if (It == UsedValuesEntry.end())
13267 continue;
13268 // Do not try to shuffle scalars if they are constants or instructions
13269 // that can be vectorized as a result of a subsequent buildvector
13270 // vectorization.
13271 if (isConstant(V) || (MightBeIgnored(V) &&
13272 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13273 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13274 continue;
13275 unsigned Idx = It->second;
13276 EntryLanes.emplace_back(Idx, I);
13277 UsedIdxs.set(Idx);
13278 }
13279 // Iterate through all shuffled scalars and select entries which can be used
13280 // for the final shuffle.
13281 SmallVector<const TreeEntry *> TempEntries;
13282 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13283 if (!UsedIdxs.test(I))
13284 continue;
13285 // Fix the entry number for the given scalar. If it is the first entry, set
13286 // Pair.first to 0, otherwise to 1 (we currently select at most 2 nodes).
13287 // These indices are used as the vector offset when calculating the final
13288 // shuffle mask.
13289 for (std::pair<unsigned, int> &Pair : EntryLanes)
13290 if (Pair.first == I)
13291 Pair.first = TempEntries.size();
13292 TempEntries.push_back(Entries[I]);
13293 }
13294 Entries.swap(TempEntries);
13295 if (EntryLanes.size() == Entries.size() &&
13296 !VL.equals(ArrayRef(TE->Scalars)
13297 .slice(Part * VL.size(),
13298 std::min<int>(VL.size(), TE->Scalars.size())))) {
13299 // We may have only 1 or 2 entries here. If the number of scalars is equal
13300 // to the number of entries, there is no need to do the analysis, it is not
13301 // very profitable. Since VL is not the same as TE->Scalars, it means we
13302 // already have some shuffles before it. Cut off the unprofitable case.
13303 Entries.clear();
13304 return std::nullopt;
13305 }
13306 // Build the final mask, check for the identity shuffle, if possible.
13307 bool IsIdentity = Entries.size() == 1;
13308 // Pair.first is the offset to the vector, while Pair.second is the index of
13309 // the scalar in the list.
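// E.g., lane L of the second selected entry is encoded as 1 * VF + L in the
// final mask.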
13310 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13311 unsigned Idx = Part * VL.size() + Pair.second;
13312 Mask[Idx] =
13313 Pair.first * VF +
13314 (ForOrder ? std::distance(
13315 Entries[Pair.first]->Scalars.begin(),
13316 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13317 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13318 IsIdentity &= Mask[Idx] == Pair.second;
13319 }
13320 switch (Entries.size()) {
13321 case 1:
13322 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13323 return TargetTransformInfo::SK_PermuteSingleSrc;
13324 break;
13325 case 2:
13326 if (EntryLanes.size() > 2 || VL.size() <= 2)
13327 return TargetTransformInfo::SK_PermuteTwoSrc;
13328 break;
13329 default:
13330 break;
13331 }
13332 Entries.clear();
13333 // Clear the corresponding mask elements.
13334 std::fill(std::next(Mask.begin(), Part * VL.size()),
13335 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13336 return std::nullopt;
13337}
13338
13339 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
13340BoUpSLP::isGatherShuffledEntry(
13341 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13342 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13343 bool ForOrder) {
13344 assert(NumParts > 0 && NumParts < VL.size() &&
13345 "Expected positive number of registers.");
13346 Entries.clear();
13347 // No need to check for the topmost gather node.
13348 if (TE == VectorizableTree.front().get() &&
13349 (!GatheredLoadsEntriesFirst.has_value() ||
13350 none_of(ArrayRef(VectorizableTree).drop_front(),
13351 [](const std::unique_ptr<TreeEntry> &TE) {
13352 return !TE->isGather();
13353 })))
13354 return {};
13355 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13356 if (TE->isNonPowOf2Vec())
13357 return {};
13358 Mask.assign(VL.size(), PoisonMaskElem);
13359 assert((TE->UserTreeIndices.size() == 1 ||
13360 TE == VectorizableTree.front().get()) &&
13361 "Expected only single user of the gather node.");
13362 assert(VL.size() % NumParts == 0 &&
13363 "Number of scalars must be divisible by NumParts.");
13364 if (!TE->UserTreeIndices.empty() &&
13365 TE->UserTreeIndices.front().UserTE->isGather() &&
13366 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13367 assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
13368 isSplat(TE->Scalars)) &&
13369 "Expected splat or extractelements only node.");
13370 return {};
13371 }
13372 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13373 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13374 for (unsigned Part : seq<unsigned>(NumParts)) {
13375 ArrayRef<Value *> SubVL =
13376 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13377 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13378 std::optional<TTI::ShuffleKind> SubRes =
13379 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13380 ForOrder);
13381 if (!SubRes)
13382 SubEntries.clear();
13383 Res.push_back(SubRes);
13384 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13385 SubEntries.front()->getVectorFactor() == VL.size() &&
13386 (SubEntries.front()->isSame(TE->Scalars) ||
13387 SubEntries.front()->isSame(VL))) {
13388 SmallVector<const TreeEntry *> LocalSubEntries;
13389 LocalSubEntries.swap(SubEntries);
13390 Entries.clear();
13391 Res.clear();
13392 std::iota(Mask.begin(), Mask.end(), 0);
13393 // Clear undef scalars.
13394 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13395 if (isa<PoisonValue>(VL[I]))
13396 Mask[I] = PoisonMaskElem;
13397 Entries.emplace_back(1, LocalSubEntries.front());
13398 Res.push_back(TTI::SK_PermuteSingleSrc);
13399 return Res;
13400 }
13401 }
13402 if (all_of(Res,
13403 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13404 Entries.clear();
13405 return {};
13406 }
13407 return Res;
13408}
13409
13410InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13411 Type *ScalarTy) const {
13412 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13413 bool DuplicateNonConst = false;
13414 // Find the cost of inserting/extracting values from the vector.
13415 // Check if the same elements are inserted several times and count them as
13416 // shuffle candidates.
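// E.g., in VL = {a, b, a, c} the second 'a' is not inserted again; its lane is
// marked in ShuffledElements and filled by the final single-source permute.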
13417 APInt ShuffledElements = APInt::getZero(VL.size());
13418 DenseMap<Value *, unsigned> UniqueElements;
13419 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13420 InstructionCost Cost;
13421 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13422 if (V->getType() != ScalarTy) {
13423 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13424 TTI::CastContextHint::None, CostKind);
13425 V = nullptr;
13426 }
13427 if (!ForPoisonSrc)
13428 Cost +=
13429 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13430 I, Constant::getNullValue(VecTy), V);
13431 };
13432 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13433 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13434 Value *V = VL[I];
13435 // No need to shuffle duplicates for constants.
13436 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13437 ShuffledElements.setBit(I);
13438 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13439 continue;
13440 }
13441
13442 auto Res = UniqueElements.try_emplace(V, I);
13443 if (Res.second) {
13444 EstimateInsertCost(I, V);
13445 ShuffleMask[I] = I;
13446 continue;
13447 }
13448
13449 DuplicateNonConst = true;
13450 ShuffledElements.setBit(I);
13451 ShuffleMask[I] = Res.first->second;
13452 }
13453 if (ForPoisonSrc) {
13454 if (isa<FixedVectorType>(ScalarTy)) {
13455 assert(SLPReVec && "Only supported by REVEC.");
13456 // We don't need to insert elements one by one. Instead, we can insert the
13457 // entire vector into the destination.
13458 Cost = 0;
13459 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13460 for (unsigned I : seq<unsigned>(VL.size()))
13461 if (!ShuffledElements[I])
13463 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13464 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13465 } else {
13466 Cost = TTI->getScalarizationOverhead(VecTy,
13467 /*DemandedElts*/ ~ShuffledElements,
13468 /*Insert*/ true,
13469 /*Extract*/ false, CostKind, VL);
13470 }
13471 }
13472 if (DuplicateNonConst)
13473 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13474 VecTy, ShuffleMask);
13475 return Cost;
13476}
13477
13478Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13479 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13480 if (Res)
13481 return *Res;
13482 // Get the basic block this bundle is in. All instructions in the bundle
13483 // should be in this block (except for extractelement-like instructions with
13484 // constant indices or gathered loads).
13485 auto *Front = E->getMainOp();
13486 auto *BB = Front->getParent();
13487 assert(((GatheredLoadsEntriesFirst.has_value() &&
13488 E->getOpcode() == Instruction::Load && E->isGather() &&
13489 E->Idx < *GatheredLoadsEntriesFirst) ||
13490 all_of(E->Scalars,
13491 [=](Value *V) -> bool {
13492 if (E->getOpcode() == Instruction::GetElementPtr &&
13493 !isa<GetElementPtrInst>(V))
13494 return true;
13495 auto *I = dyn_cast<Instruction>(V);
13496 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13497 isVectorLikeInstWithConstOps(I);
13498 })) &&
13499 "Expected gathered loads or GEPs or instructions from same basic "
13500 "block.");
13501
13502 auto FindLastInst = [&]() {
13503 Instruction *LastInst = Front;
13504 for (Value *V : E->Scalars) {
13505 auto *I = dyn_cast<Instruction>(V);
13506 if (!I)
13507 continue;
13508 if (LastInst->getParent() == I->getParent()) {
13509 if (LastInst->comesBefore(I))
13510 LastInst = I;
13511 continue;
13512 }
13513 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13514 !isa<GetElementPtrInst>(I)) ||
13515 (isVectorLikeInstWithConstOps(LastInst) &&
13516               isVectorLikeInstWithConstOps(I)) ||
13517              (GatheredLoadsEntriesFirst.has_value() &&
13518 E->getOpcode() == Instruction::Load && E->isGather() &&
13519 E->Idx < *GatheredLoadsEntriesFirst)) &&
13520 "Expected vector-like or non-GEP in GEP node insts only.");
13521 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13522 LastInst = I;
13523 continue;
13524 }
13525 if (!DT->isReachableFromEntry(I->getParent()))
13526 continue;
13527 auto *NodeA = DT->getNode(LastInst->getParent());
13528 auto *NodeB = DT->getNode(I->getParent());
13529 assert(NodeA && "Should only process reachable instructions");
13530 assert(NodeB && "Should only process reachable instructions");
13531 assert((NodeA == NodeB) ==
13532 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13533 "Different nodes should have different DFS numbers");
13534 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13535 LastInst = I;
13536 }
13537 BB = LastInst->getParent();
13538 return LastInst;
13539 };
13540
13541 auto FindFirstInst = [&]() {
13542 Instruction *FirstInst = Front;
13543 for (Value *V : E->Scalars) {
13544 auto *I = dyn_cast<Instruction>(V);
13545 if (!I)
13546 continue;
13547 if (FirstInst->getParent() == I->getParent()) {
13548 if (I->comesBefore(FirstInst))
13549 FirstInst = I;
13550 continue;
13551 }
13552 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13553 !isa<GetElementPtrInst>(I)) ||
13554 (isVectorLikeInstWithConstOps(FirstInst) &&
13555               isVectorLikeInstWithConstOps(I))) &&
13556             "Expected vector-like or non-GEP in GEP node insts only.");
13557 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13558 FirstInst = I;
13559 continue;
13560 }
13561 if (!DT->isReachableFromEntry(I->getParent()))
13562 continue;
13563 auto *NodeA = DT->getNode(FirstInst->getParent());
13564 auto *NodeB = DT->getNode(I->getParent());
13565 assert(NodeA && "Should only process reachable instructions");
13566 assert(NodeB && "Should only process reachable instructions");
13567 assert((NodeA == NodeB) ==
13568 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13569 "Different nodes should have different DFS numbers");
13570 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13571 FirstInst = I;
13572 }
13573 return FirstInst;
13574 };
13575
13576 // Set insertpoint for gathered loads to the very first load.
13577 if (GatheredLoadsEntriesFirst.has_value() &&
13578 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13579 E->getOpcode() == Instruction::Load) {
13580 Res = FindFirstInst();
13581 return *Res;
13582 }
13583
13584 // Set the insert point to the beginning of the basic block if the entry
13585 // should not be scheduled.
13586 if (doesNotNeedToSchedule(E->Scalars) ||
13587 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13588 if ((E->getOpcode() == Instruction::GetElementPtr &&
13589 any_of(E->Scalars,
13590 [](Value *V) {
13591 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13592 })) ||
13593 all_of(E->Scalars,
13594 [](Value *V) {
13595 return isa<PoisonValue>(V) ||
13596 (!isVectorLikeInstWithConstOps(V) &&
13597 isUsedOutsideBlock(V));
13598 }) ||
13599 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13600 return isa<ExtractElementInst, UndefValue>(V) ||
13601 areAllOperandsNonInsts(V);
13602 })))
13603 Res = FindLastInst();
13604 else
13605 Res = FindFirstInst();
13606 return *Res;
13607 }
13608
13609 // Find the last instruction. The common case should be that BB has been
13610 // scheduled, and the last instruction is VL.back(). So we start with
13611 // VL.back() and iterate over schedule data until we reach the end of the
13612 // bundle. The end of the bundle is marked by null ScheduleData.
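  // For example, for a bundle {%a, %b, %c}, the walk below starts from the
  // ScheduleData of one member and follows NextInBundle links, so Res ends up
  // at whichever member closes the chain, i.e. the last instruction of the
  // scheduled bundle.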
13613 if (BlocksSchedules.count(BB) && !E->isGather()) {
13614 Value *V = E->isOneOf(E->Scalars.back());
13615    if (doesNotNeedToBeScheduled(V))
13616      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13617 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13618 if (Bundle && Bundle->isPartOfBundle())
13619 for (; Bundle; Bundle = Bundle->NextInBundle)
13620 Res = Bundle->Inst;
13621 }
13622
13623 // LastInst can still be null at this point if there's either not an entry
13624 // for BB in BlocksSchedules or there's no ScheduleData available for
13625 // VL.back(). This can be the case if buildTree_rec aborts for various
13626 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13627 // size is reached, etc.). ScheduleData is initialized in the scheduling
13628 // "dry-run".
13629 //
13630 // If this happens, we can still find the last instruction by brute force. We
13631 // iterate forwards from Front (inclusive) until we either see all
13632 // instructions in the bundle or reach the end of the block. If Front is the
13633 // last instruction in program order, LastInst will be set to Front, and we
13634 // will visit all the remaining instructions in the block.
13635 //
13636 // One of the reasons we exit early from buildTree_rec is to place an upper
13637 // bound on compile-time. Thus, taking an additional compile-time hit here is
13638 // not ideal. However, this should be exceedingly rare since it requires that
13639 // we both exit early from buildTree_rec and that the bundle be out-of-order
13640 // (causing us to iterate all the way to the end of the block).
13641 if (!Res)
13642 Res = FindLastInst();
13643 assert(Res && "Failed to find last instruction in bundle");
13644 return *Res;
13645}
13646
13647void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13648 auto *Front = E->getMainOp();
13649 Instruction *LastInst = &getLastInstructionInBundle(E);
13650 assert(LastInst && "Failed to find last instruction in bundle");
13651 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13652 // If the instruction is PHI, set the insert point after all the PHIs.
13653 bool IsPHI = isa<PHINode>(LastInst);
13654 if (IsPHI)
13655 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13656 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13657 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13658 } else {
13659 // Set the insertion point after the last instruction in the bundle. Set the
13660 // debug location to Front.
13661 Builder.SetInsertPoint(
13662 LastInst->getParent(),
13663        LastInst->getNextNonDebugInstruction()->getIterator());
13664  }
13665 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13666}
13667
13668Value *BoUpSLP::gather(
13669 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13670 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13671  // List of instructions/lanes from the current block and/or the blocks which
13672  // are part of the current loop. These instructions will be inserted at the
13673  // end to make it possible to optimize loops and hoist invariant instructions
13674  // out of the loop's body with better chances for success.
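  // For example, when gathering {%inv0, %loop.val, %inv1} at an insert point
  // inside a loop, the insertelement for %loop.val is deliberately emitted
  // last, so the inserts of the loop-invariant %inv0/%inv1 form a prefix that
  // later passes have a better chance to hoist out of the loop.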
13675  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
13676  SmallSet<int, 4> PostponedIndices;
13677 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13678 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13679    SmallPtrSet<BasicBlock *, 4> Visited;
13680    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13681 InsertBB = InsertBB->getSinglePredecessor();
13682 return InsertBB && InsertBB == InstBB;
13683 };
13684 for (int I = 0, E = VL.size(); I < E; ++I) {
13685 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13686 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13687 getTreeEntry(Inst) ||
13688 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13689 PostponedIndices.insert(I).second)
13690 PostponedInsts.emplace_back(Inst, I);
13691 }
13692
13693 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13694 Type *Ty) {
13695 Value *Scalar = V;
13696 if (Scalar->getType() != Ty) {
13697 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13698 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13699 Value *V = Scalar;
13700 if (auto *CI = dyn_cast<CastInst>(Scalar);
13701 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13702 Value *Op = CI->getOperand(0);
13703 if (auto *IOp = dyn_cast<Instruction>(Op);
13704 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13705 V = Op;
13706 }
13707 Scalar = Builder.CreateIntCast(
13708 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13709 }
13710
13711 Instruction *InsElt;
13712 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13713 assert(SLPReVec && "FixedVectorType is not expected.");
13714 Vec = InsElt = Builder.CreateInsertVector(
13715 Vec->getType(), Vec, Scalar,
13716 Builder.getInt64(Pos * VecTy->getNumElements()));
13717 auto *II = dyn_cast<IntrinsicInst>(InsElt);
13718 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13719 return Vec;
13720 } else {
13721 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13722 InsElt = dyn_cast<InsertElementInst>(Vec);
13723 if (!InsElt)
13724 return Vec;
13725 }
13726 GatherShuffleExtractSeq.insert(InsElt);
13727 CSEBlocks.insert(InsElt->getParent());
13728 // Add to our 'need-to-extract' list.
13729 if (isa<Instruction>(V)) {
13730 if (TreeEntry *Entry = getTreeEntry(V)) {
13731 // Find which lane we need to extract.
13732 User *UserOp = nullptr;
13733 if (Scalar != V) {
13734 if (auto *SI = dyn_cast<Instruction>(Scalar))
13735 UserOp = SI;
13736 } else {
13737 UserOp = InsElt;
13738 }
13739 if (UserOp) {
13740 unsigned FoundLane = Entry->findLaneForValue(V);
13741 ExternalUses.emplace_back(V, UserOp, FoundLane);
13742 }
13743 }
13744 }
13745 return Vec;
13746 };
13747 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13748 Value *Vec = PoisonValue::get(VecTy);
13749 SmallVector<int> NonConsts;
13750  SmallVector<int> Mask(VL.size(), PoisonMaskElem);
13751  std::iota(Mask.begin(), Mask.end(), 0);
13752 Value *OriginalRoot = Root;
13753 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13754 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13755 SV->getOperand(0)->getType() == VecTy) {
13756 Root = SV->getOperand(0);
13757 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
13758 }
13759 // Insert constant values at first.
13760 for (int I = 0, E = VL.size(); I < E; ++I) {
13761 if (PostponedIndices.contains(I))
13762 continue;
13763 if (!isConstant(VL[I])) {
13764 NonConsts.push_back(I);
13765 continue;
13766 }
13767 if (isa<PoisonValue>(VL[I]))
13768 continue;
13769 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13770 Mask[I] = I + E;
13771 }
13772 if (Root) {
13773 if (isa<PoisonValue>(Vec)) {
13774 Vec = OriginalRoot;
13775 } else {
13776 Vec = CreateShuffle(Root, Vec, Mask);
13777 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
13778 OI && OI->hasNUses(0) &&
13779 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13780 return TE->VectorizedValue == OI;
13781 }))
13782 eraseInstruction(OI);
13783 }
13784 }
13785 // Insert non-constant values.
13786 for (int I : NonConsts)
13787 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13788  // Append instructions which are, or may be, part of the loop at the end, to
13789  // make it possible to hoist non-loop-based instructions.
13790 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
13791 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
13792
13793 return Vec;
13794}
13795
13796/// Merges shuffle masks and emits the final shuffle instruction, if required.
13797/// It supports shuffling of 2 input vectors. It implements lazy shuffle
13798/// emission: the actual shuffle instruction is generated only if it is really
13799/// required. Otherwise, the shuffle instruction emission is delayed till the
13800/// end of the process, to reduce the number of emitted instructions and enable
13801/// further analysis/transformations.
13802/// The class will also look through the previously emitted shuffle instructions
13803/// and properly mark indices in the mask as undef.
13804/// For example, given the code
13805/// \code
13806/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
13807/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
13808/// \endcode
13809/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
13810/// look through %s1 and %s2 and emit
13811/// \code
13812/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
13813/// \endcode
13814/// instead.
13815/// If the 2 operands are of different sizes, the smaller one will be resized and
13816/// the mask recalculated properly.
13817/// For example, given the code
13818/// \code
13819/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
13820/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
13821/// \endcode
13822/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
13823/// look through %s1 and %s2 and emit
13824/// \code
13825/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
13826/// \endcode
13827/// instead.
13828class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
13829 bool IsFinalized = false;
13830 /// Combined mask for all applied operands and masks. It is built during
13831 /// analysis and actual emission of shuffle vector instructions.
13832 SmallVector<int> CommonMask;
13833  /// List of operands for the shuffle vector instruction. It holds at most 2
13834  /// operands; if a 3rd one is going to be added, the first 2 are combined into
13835  /// a shuffle with the \p CommonMask mask, the first operand is set to the
13836  /// resulting shuffle and the second operand is set to the newly added
13837  /// operand. The \p CommonMask is transformed in the proper way after that.
13838 SmallVector<Value *, 2> InVectors;
13839 IRBuilderBase &Builder;
13840 BoUpSLP &R;
13841
13842 class ShuffleIRBuilder {
13843 IRBuilderBase &Builder;
13844 /// Holds all of the instructions that we gathered.
13845 SetVector<Instruction *> &GatherShuffleExtractSeq;
13846 /// A list of blocks that we are going to CSE.
13847 DenseSet<BasicBlock *> &CSEBlocks;
13848 /// Data layout.
13849 const DataLayout &DL;
13850
13851 public:
13852 ShuffleIRBuilder(IRBuilderBase &Builder,
13853 SetVector<Instruction *> &GatherShuffleExtractSeq,
13854 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
13855 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
13856 CSEBlocks(CSEBlocks), DL(DL) {}
13857 ~ShuffleIRBuilder() = default;
13858 /// Creates shufflevector for the 2 operands with the given mask.
13859 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
13860 if (V1->getType() != V2->getType()) {
13861        assert(V2->getType()->isIntOrIntVectorTy() &&
13862               V1->getType()->isIntOrIntVectorTy() &&
13863 "Expected integer vector types only.");
13864 if (V1->getType() != V2->getType()) {
13865 if (cast<VectorType>(V2->getType())
13866 ->getElementType()
13867 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
13868 ->getElementType()
13869 ->getIntegerBitWidth())
13870 V2 = Builder.CreateIntCast(
13871 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
13872 else
13873 V1 = Builder.CreateIntCast(
13874 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
13875 }
13876 }
13877 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
13878 if (auto *I = dyn_cast<Instruction>(Vec)) {
13879 GatherShuffleExtractSeq.insert(I);
13880 CSEBlocks.insert(I->getParent());
13881 }
13882 return Vec;
13883 }
13884    /// Creates a permutation of the single vector operand with the given mask,
13885    /// if it is not an identity mask.
13886 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
13887 if (Mask.empty())
13888 return V1;
13889 unsigned VF = Mask.size();
13890 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
13891 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
13892 return V1;
13893 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
13894 if (auto *I = dyn_cast<Instruction>(Vec)) {
13895 GatherShuffleExtractSeq.insert(I);
13896 CSEBlocks.insert(I->getParent());
13897 }
13898 return Vec;
13899 }
13900 Value *createIdentity(Value *V) { return V; }
13901 Value *createPoison(Type *Ty, unsigned VF) {
13902 return PoisonValue::get(getWidenedType(Ty, VF));
13903 }
13904    /// Resizes the 2 input vectors to matching sizes, if they are not equal
13905    /// yet. The smaller vector is resized to the size of the larger vector.
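    /// For example, given a <4 x i32> and an <8 x i32> operand, the <4 x i32>
    /// one is widened with mask <0, 1, 2, 3, poison, poison, poison, poison>.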
13906 void resizeToMatch(Value *&V1, Value *&V2) {
13907 if (V1->getType() == V2->getType())
13908 return;
13909 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13910 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
13911 int VF = std::max(V1VF, V2VF);
13912 int MinVF = std::min(V1VF, V2VF);
13913 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
13914 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
13915 0);
13916 Value *&Op = MinVF == V1VF ? V1 : V2;
13917 Op = Builder.CreateShuffleVector(Op, IdentityMask);
13918 if (auto *I = dyn_cast<Instruction>(Op)) {
13919 GatherShuffleExtractSeq.insert(I);
13920 CSEBlocks.insert(I->getParent());
13921 }
13922 if (MinVF == V1VF)
13923 V1 = Op;
13924 else
13925 V2 = Op;
13926 }
13927 };
13928
13929  /// Smart shuffle instruction emission, walks through shuffle trees and
13930 /// tries to find the best matching vector for the actual shuffle
13931 /// instruction.
13932 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
13933 assert(V1 && "Expected at least one vector value.");
13934 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
13935 R.CSEBlocks, *R.DL);
13936 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
13937 ShuffleBuilder);
13938 }
13939
13940 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
13941 /// shuffle emission.
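  /// For example, if the just-emitted shuffle realized
  /// \code
  ///   Mask = <2, poison, 0, poison>
  /// \endcode
  /// then CommonMask[0] becomes 0 and CommonMask[2] becomes 2, while the
  /// poison lanes keep their previous values.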
13942 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
13943 ArrayRef<int> Mask) {
13944 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13945 if (Mask[Idx] != PoisonMaskElem)
13946 CommonMask[Idx] = Idx;
13947 }
13948
13949 /// Cast value \p V to the vector type with the same number of elements, but
13950 /// the base type \p ScalarTy.
13951 Value *castToScalarTyElem(Value *V,
13952 std::optional<bool> IsSigned = std::nullopt) {
13953 auto *VecTy = cast<VectorType>(V->getType());
13954 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
13955 if (VecTy->getElementType() == ScalarTy->getScalarType())
13956 return V;
13957 return Builder.CreateIntCast(
13958 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
13959 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
13960 }
13961
13962public:
13963  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
13964      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
13965
13966 /// Adjusts extractelements after reusing them.
13967 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13968 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13969 unsigned NumParts, bool &UseVecBaseAsInput) {
13970 UseVecBaseAsInput = false;
13971 SmallPtrSet<Value *, 4> UniqueBases;
13972 Value *VecBase = nullptr;
13973 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13974 if (!E->ReorderIndices.empty()) {
13975 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13976 E->ReorderIndices.end());
13977 reorderScalars(VL, ReorderMask);
13978 }
13979 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
13980 int Idx = Mask[I];
13981 if (Idx == PoisonMaskElem)
13982 continue;
13983 auto *EI = cast<ExtractElementInst>(VL[I]);
13984 VecBase = EI->getVectorOperand();
13985 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
13986 VecBase = TE->VectorizedValue;
13987 assert(VecBase && "Expected vectorized value.");
13988 UniqueBases.insert(VecBase);
13989      // If the only use is vectorized, the extractelement itself can be
13990      // deleted.
13991 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
13992 (NumParts != 1 && count(VL, EI) > 1) ||
13993 any_of(EI->users(), [&](User *U) {
13994 const TreeEntry *UTE = R.getTreeEntry(U);
13995 return !UTE || R.MultiNodeScalars.contains(U) ||
13996 (isa<GetElementPtrInst>(U) &&
13997 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
13998 count_if(R.VectorizableTree,
13999 [&](const std::unique_ptr<TreeEntry> &TE) {
14000 return any_of(TE->UserTreeIndices,
14001 [&](const EdgeInfo &Edge) {
14002 return Edge.UserTE == UTE;
14003 }) &&
14004 is_contained(VL, EI);
14005 }) != 1;
14006 }))
14007 continue;
14008 R.eraseInstruction(EI);
14009 }
14010 if (NumParts == 1 || UniqueBases.size() == 1) {
14011 assert(VecBase && "Expected vectorized value.");
14012 return castToScalarTyElem(VecBase);
14013 }
14014 UseVecBaseAsInput = true;
14015 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14016 for (auto [I, Idx] : enumerate(Mask))
14017 if (Idx != PoisonMaskElem)
14018 Idx = I;
14019 };
14020    // Perform a multi-register vector shuffle, joining the parts into a single
14021    // virtual long vector.
14022    // Each part needs to be shuffled independently and then all the parts are
14023    // inserted into a long virtual vector register, forming the original vector.
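    // For example, with NumParts == 2 and 8 extracted lanes coming from two
    // 4-wide source registers, lanes 0-3 and lanes 4-7 are shuffled with
    // independent sub-masks and the two sub-results are then combined into the
    // virtual 8-element vector by another two-source shuffle.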
14024 Value *Vec = nullptr;
14025 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14026 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14027 for (unsigned Part : seq<unsigned>(NumParts)) {
14028 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14029 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14030 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14031 constexpr int MaxBases = 2;
14032 SmallVector<Value *, MaxBases> Bases(MaxBases);
14033 auto VLMask = zip(SubVL, SubMask);
14034 const unsigned VF = std::accumulate(
14035 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14036 if (std::get<1>(D) == PoisonMaskElem)
14037 return S;
14038 Value *VecOp =
14039 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14040 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14041 VecOp = TE->VectorizedValue;
14042 assert(VecOp && "Expected vectorized value.");
14043 const unsigned Size =
14044 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14045 return std::max(S, Size);
14046 });
14047 for (const auto [V, I] : VLMask) {
14048 if (I == PoisonMaskElem)
14049 continue;
14050 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14051 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14052 VecOp = TE->VectorizedValue;
14053 assert(VecOp && "Expected vectorized value.");
14054 VecOp = castToScalarTyElem(VecOp);
14055 Bases[I / VF] = VecOp;
14056 }
14057 if (!Bases.front())
14058 continue;
14059 Value *SubVec;
14060 if (Bases.back()) {
14061 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14062 TransformToIdentity(SubMask);
14063 } else {
14064 SubVec = Bases.front();
14065 }
14066 if (!Vec) {
14067 Vec = SubVec;
14068 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14069 [&](unsigned P) {
14070 ArrayRef<int> SubMask =
14071 Mask.slice(P * SliceSize,
14072 getNumElems(Mask.size(),
14073 SliceSize, P));
14074 return all_of(SubMask, [](int Idx) {
14075 return Idx == PoisonMaskElem;
14076 });
14077 })) &&
14078 "Expected first part or all previous parts masked.");
14079 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14080 } else {
14081 unsigned NewVF =
14082 cast<FixedVectorType>(Vec->getType())->getNumElements();
14083 if (Vec->getType() != SubVec->getType()) {
14084 unsigned SubVecVF =
14085 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14086 NewVF = std::max(NewVF, SubVecVF);
14087 }
14088 // Adjust SubMask.
14089 for (int &Idx : SubMask)
14090 if (Idx != PoisonMaskElem)
14091 Idx += NewVF;
14092 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14093 Vec = createShuffle(Vec, SubVec, VecMask);
14094 TransformToIdentity(VecMask);
14095 }
14096 }
14097 copy(VecMask, Mask.begin());
14098 return Vec;
14099 }
14100 /// Checks if the specified entry \p E needs to be delayed because of its
14101 /// dependency nodes.
14102 std::optional<Value *>
14103 needToDelay(const TreeEntry *E,
14104              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14105    // No need to delay emission if all deps are ready.
14106 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14107 return all_of(
14108 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14109 }))
14110 return std::nullopt;
14111    // Postpone gather emission; it will be emitted after the end of the
14112    // process to keep the correct order.
14113 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14114 return Builder.CreateAlignedLoad(
14115 ResVecTy,
14116        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14117        MaybeAlign());
14118 }
14119 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14120 /// shuffling.
14121 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14122 Value *V1 = E1.VectorizedValue;
14123 if (V1->getType()->isIntOrIntVectorTy())
14124 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14125 if (isa<PoisonValue>(V))
14126 return false;
14127 return !isKnownNonNegative(
14128 V, SimplifyQuery(*R.DL));
14129 }));
14130 Value *V2 = E2.VectorizedValue;
14131 if (V2->getType()->isIntOrIntVectorTy())
14132 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14133 if (isa<PoisonValue>(V))
14134 return false;
14135 return !isKnownNonNegative(
14136 V, SimplifyQuery(*R.DL));
14137 }));
14138 add(V1, V2, Mask);
14139 }
14140 /// Adds single input vector (in form of tree entry) and the mask for its
14141 /// shuffling.
14142 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14143 Value *V1 = E1.VectorizedValue;
14144 if (V1->getType()->isIntOrIntVectorTy())
14145 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14146 if (isa<PoisonValue>(V))
14147 return false;
14148 return !isKnownNonNegative(
14149 V, SimplifyQuery(*R.DL));
14150 }));
14151 add(V1, Mask);
14152 }
14153 /// Adds 2 input vectors and the mask for their shuffling.
14154 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14155 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14156 assert(isa<FixedVectorType>(V1->getType()) &&
14157 isa<FixedVectorType>(V2->getType()) &&
14158 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14159 V1 = castToScalarTyElem(V1);
14160 V2 = castToScalarTyElem(V2);
14161 if (InVectors.empty()) {
14162 InVectors.push_back(V1);
14163 InVectors.push_back(V2);
14164 CommonMask.assign(Mask.begin(), Mask.end());
14165 return;
14166 }
14167 Value *Vec = InVectors.front();
14168 if (InVectors.size() == 2) {
14169 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14170 transformMaskAfterShuffle(CommonMask, CommonMask);
14171 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14172 Mask.size()) {
14173 Vec = createShuffle(Vec, nullptr, CommonMask);
14174 transformMaskAfterShuffle(CommonMask, CommonMask);
14175 }
14176 V1 = createShuffle(V1, V2, Mask);
14177 unsigned VF = std::max(getVF(V1), getVF(Vec));
14178 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14179 if (Mask[Idx] != PoisonMaskElem)
14180 CommonMask[Idx] = Idx + VF;
14181 InVectors.front() = Vec;
14182 if (InVectors.size() == 2)
14183 InVectors.back() = V1;
14184 else
14185 InVectors.push_back(V1);
14186 }
14187  /// Adds one more input vector and the mask for the shuffling.
14188 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14189 assert(isa<FixedVectorType>(V1->getType()) &&
14190 "castToScalarTyElem expects V1 to be FixedVectorType");
14191 V1 = castToScalarTyElem(V1);
14192 if (InVectors.empty()) {
14193 InVectors.push_back(V1);
14194 CommonMask.assign(Mask.begin(), Mask.end());
14195 return;
14196 }
14197 const auto *It = find(InVectors, V1);
14198 if (It == InVectors.end()) {
14199 if (InVectors.size() == 2 ||
14200 InVectors.front()->getType() != V1->getType()) {
14201 Value *V = InVectors.front();
14202 if (InVectors.size() == 2) {
14203 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14204 transformMaskAfterShuffle(CommonMask, CommonMask);
14205 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14206 CommonMask.size()) {
14207 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14208 transformMaskAfterShuffle(CommonMask, CommonMask);
14209 }
14210 unsigned VF = std::max(CommonMask.size(), Mask.size());
14211 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14212 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14213 CommonMask[Idx] =
14214 V->getType() != V1->getType()
14215 ? Idx + VF
14216 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14217 ->getNumElements();
14218 if (V->getType() != V1->getType())
14219 V1 = createShuffle(V1, nullptr, Mask);
14220 InVectors.front() = V;
14221 if (InVectors.size() == 2)
14222 InVectors.back() = V1;
14223 else
14224 InVectors.push_back(V1);
14225 return;
14226 }
14227      // Check if the second vector is required when the used elements are
14228      // already taken from the first one.
14229 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14230 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14231 InVectors.push_back(V1);
14232 break;
14233 }
14234 }
14235 int VF = getVF(V1);
14236 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14237 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14238 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14239 }
14240  /// Adds one more input vector and the mask for the shuffling.
14241  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14242 SmallVector<int> NewMask;
14243 inversePermutation(Order, NewMask);
14244 add(V1, NewMask);
14245 }
14246 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14247 Value *Root = nullptr) {
14248 return R.gather(VL, Root, ScalarTy,
14249 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14250 return createShuffle(V1, V2, Mask);
14251 });
14252 }
14253 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14254 /// Finalize emission of the shuffles.
14255  /// \param Action the action (if any) to be performed before the final
14256  /// application of the \p ExtMask mask.
14257  Value *
14258  finalize(ArrayRef<int> ExtMask,
14259 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14260 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14261 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14262 IsFinalized = true;
14263 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14264 SmallVector<int> NewExtMask(ExtMask);
14265 if (ScalarTyNumElements != 1) {
14266 assert(SLPReVec && "FixedVectorType is not expected.");
14267 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14268 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14269 ExtMask = NewExtMask;
14270 }
14271 if (Action) {
14272 Value *Vec = InVectors.front();
14273 if (InVectors.size() == 2) {
14274 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14275 InVectors.pop_back();
14276 } else {
14277 Vec = createShuffle(Vec, nullptr, CommonMask);
14278 }
14279 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14280 if (CommonMask[Idx] != PoisonMaskElem)
14281 CommonMask[Idx] = Idx;
14282 assert(VF > 0 &&
14283 "Expected vector length for the final value before action.");
14284 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14285 if (VecVF < VF) {
14286 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14287 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14288 Vec = createShuffle(Vec, nullptr, ResizeMask);
14289 }
14290 Action(Vec, CommonMask);
14291 InVectors.front() = Vec;
14292 }
14293 if (!SubVectors.empty()) {
14294 Value *Vec = InVectors.front();
14295 if (InVectors.size() == 2) {
14296 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14297 InVectors.pop_back();
14298 } else {
14299 Vec = createShuffle(Vec, nullptr, CommonMask);
14300 }
14301 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14302 if (CommonMask[Idx] != PoisonMaskElem)
14303 CommonMask[Idx] = Idx;
14304 auto CreateSubVectors = [&](Value *Vec,
14305 SmallVectorImpl<int> &CommonMask) {
14306 for (auto [E, Idx] : SubVectors) {
14307 Value *V = E->VectorizedValue;
14308 if (V->getType()->isIntOrIntVectorTy())
14309 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14310 if (isa<PoisonValue>(V))
14311 return false;
14312 return !isKnownNonNegative(
14313 V, SimplifyQuery(*R.DL));
14314 }));
14315 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14316 const unsigned SubVecVF =
14317 cast<FixedVectorType>(V->getType())->getNumElements();
14318 if (InsertionIndex % SubVecVF == 0) {
14319 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
14320 Builder.getInt64(InsertionIndex));
14321 } else {
14322            // Create a shuffle; insertvector requires that the index is a
14323            // multiple of the subvector's length.
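            // For example, inserting a 4-element subvector at element index 6
            // cannot use llvm.vector.insert (the index must be 0, 4, 8, ...),
            // so a two-source shufflevector that places the subvector lanes at
            // the required offset is emitted instead.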
14324 const unsigned VecVF =
14325 cast<FixedVectorType>(Vec->getType())->getNumElements();
14326            SmallVector<int> Mask(VecVF, PoisonMaskElem);
14327            std::iota(Mask.begin(), Mask.end(), 0);
14328 for (unsigned I : seq<unsigned>(
14329 InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
14330 Mask[I] = I - Idx + VecVF;
14331 Vec = createShuffle(Vec, V, Mask);
14332 }
14333 if (!CommonMask.empty()) {
14334 std::iota(
14335 std::next(CommonMask.begin(), InsertionIndex),
14336 std::next(CommonMask.begin(),
14337 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14338 InsertionIndex);
14339 }
14340 }
14341 return Vec;
14342 };
14343 if (SubVectorsMask.empty()) {
14344 Vec = CreateSubVectors(Vec, CommonMask);
14345 } else {
14346 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14347 copy(SubVectorsMask, SVMask.begin());
14348 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14349 if (I2 != PoisonMaskElem) {
14350 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14351 I1 = I2 + CommonMask.size();
14352 }
14353 }
14354 Value *InsertVec =
14355 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14356 Vec = createShuffle(InsertVec, Vec, SVMask);
14357 for (unsigned I : seq<unsigned>(CommonMask.size())) {
14358 if (SVMask[I] != PoisonMaskElem)
14359 CommonMask[I] = I;
14360 }
14361 }
14362 InVectors.front() = Vec;
14363 }
14364
14365 if (!ExtMask.empty()) {
14366 if (CommonMask.empty()) {
14367 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14368 } else {
14369 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14370 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14371 if (ExtMask[I] == PoisonMaskElem)
14372 continue;
14373 NewMask[I] = CommonMask[ExtMask[I]];
14374 }
14375 CommonMask.swap(NewMask);
14376 }
14377 }
14378 if (CommonMask.empty()) {
14379 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14380 return InVectors.front();
14381 }
14382 if (InVectors.size() == 2)
14383 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14384 return createShuffle(InVectors.front(), nullptr, CommonMask);
14385 }
14386
14387  ~ShuffleInstructionBuilder() {
14388    assert((IsFinalized || CommonMask.empty()) &&
14389 "Shuffle construction must be finalized.");
14390 }
14391};
14392
14393BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14394 unsigned NodeIdx) {
14395 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14396 InstructionsState S = getSameOpcode(VL, *TLI);
14397 // Special processing for GEPs bundle, which may include non-gep values.
14398 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
14399 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14400 if (It != VL.end())
14401 S = getSameOpcode(*It, *TLI);
14402 }
14403 if (!S.getOpcode())
14404 return nullptr;
14405 auto CheckSameVE = [&](const TreeEntry *VE) {
14406 return VE->isSame(VL) &&
14407 (any_of(VE->UserTreeIndices,
14408 [E, NodeIdx](const EdgeInfo &EI) {
14409 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14410 }) ||
14411 any_of(VectorizableTree,
14412 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14413 return TE->isOperandGatherNode(
14414 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14415 VE->isSame(TE->Scalars);
14416 }));
14417 };
14418 TreeEntry *VE = getTreeEntry(S.getMainOp());
14419 if (VE && CheckSameVE(VE))
14420 return VE;
14421 auto It = MultiNodeScalars.find(S.getMainOp());
14422 if (It != MultiNodeScalars.end()) {
14423 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14424 return TE != VE && CheckSameVE(TE);
14425 });
14426 if (I != It->getSecond().end())
14427 return *I;
14428 }
14429 return nullptr;
14430}
14431
14432Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14433 bool PostponedPHIs) {
14434 ValueList &VL = E->getOperand(NodeIdx);
14435 const unsigned VF = VL.size();
14436 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14437 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14438 // V may be affected by MinBWs.
14439 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14440 // factor is the number of elements, not their type.
14441 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14442 unsigned NumElements = getNumElements(VL.front()->getType());
14443 ShuffleInstructionBuilder ShuffleBuilder(
14444 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14445 : ScalarTy,
14446 Builder, *this);
14447 ShuffleBuilder.add(V, Mask);
14448      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14449          E->CombinedEntriesWithIndices.size());
14450 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14451 [&](const auto &P) {
14452 return std::make_pair(VectorizableTree[P.first].get(),
14453 P.second);
14454 });
14455 assert((E->CombinedEntriesWithIndices.empty() ||
14456 E->ReorderIndices.empty()) &&
14457 "Expected either combined subnodes or reordering");
14458 return ShuffleBuilder.finalize({}, SubVectors, {});
14459 };
14460 Value *V = vectorizeTree(VE, PostponedPHIs);
14461 if (VF * getNumElements(VL[0]->getType()) !=
14462 cast<FixedVectorType>(V->getType())->getNumElements()) {
14463 if (!VE->ReuseShuffleIndices.empty()) {
14464 // Reshuffle to get only unique values.
14465 // If some of the scalars are duplicated in the vectorization
14466 // tree entry, we do not vectorize them but instead generate a
14467 // mask for the reuses. But if there are several users of the
14468 // same entry, they may have different vectorization factors.
14469 // This is especially important for PHI nodes. In this case, we
14470 // need to adapt the resulting instruction for the user
14471 // vectorization factor and have to reshuffle it again to take
14472 // only unique elements of the vector. Without this code the
14473 // function incorrectly returns reduced vector instruction with
14474 // the same elements, not with the unique ones.
14475
14476 // block:
14477 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14478 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14479 // ... (use %2)
14480 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14481 // br %block
14482        SmallVector<int> Mask(VF, PoisonMaskElem);
14483        for (auto [I, V] : enumerate(VL)) {
14484 if (isa<PoisonValue>(V))
14485 continue;
14486 Mask[I] = VE->findLaneForValue(V);
14487 }
14488 V = FinalShuffle(V, Mask);
14489 } else {
14490 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14491 "Expected vectorization factor less "
14492 "than original vector size.");
14493 SmallVector<int> UniformMask(VF, 0);
14494 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14495 V = FinalShuffle(V, UniformMask);
14496 }
14497 }
14498    // Need to update the operand gather node if the operand is not actually a
14499    // vectorized node but a buildvector/gather node which matches one of the
14500    // vectorized nodes.
14501 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14502 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14503 }) == VE->UserTreeIndices.end()) {
14504 auto *It =
14505 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14506 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14507 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14508 });
14509 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14510 (*It)->VectorizedValue = V;
14511 }
14512 return V;
14513 }
14514
14515  // Find the corresponding gather entry and vectorize it.
14516  // This allows the tree/graph transformations to be more accurate and checks
14517  // the correctness of the transformations in many cases.
14518 auto *I = find_if(VectorizableTree,
14519 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14520 return TE->isOperandGatherNode({E, NodeIdx});
14521 });
14522 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14523 assert(I->get()->UserTreeIndices.size() == 1 &&
14524 "Expected only single user for the gather node.");
14525 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14526 return vectorizeTree(I->get(), PostponedPHIs);
14527}
14528
14529template <typename BVTy, typename ResTy, typename... Args>
14530ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14531 Args &...Params) {
14532 assert(E->isGather() && "Expected gather node.");
14533 unsigned VF = E->getVectorFactor();
14534
14535 bool NeedFreeze = false;
14536 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14537 E->ReuseShuffleIndices.end());
14538 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14539 // Clear values, to be replaced by insertvector instructions.
14540 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14541 for_each(MutableArrayRef(GatheredScalars)
14542 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14543 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14544  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14545      E->CombinedEntriesWithIndices.size());
14546 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14547 [&](const auto &P) {
14548 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14549 });
14550 // Build a mask out of the reorder indices and reorder scalars per this
14551 // mask.
14552 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14553 E->ReorderIndices.end());
14554 if (!ReorderMask.empty())
14555 reorderScalars(GatheredScalars, ReorderMask);
14556 SmallVector<int> SubVectorsMask;
14557 inversePermutation(E->ReorderIndices, SubVectorsMask);
14558 // Transform non-clustered elements in the mask to poison (-1).
14559 // "Clustered" operations will be reordered using this mask later.
14560 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14561 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14562 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14563 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14564 } else {
14565 SubVectorsMask.clear();
14566 }
14567 SmallVector<Value *> StoredGS(GatheredScalars);
14568 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14569 unsigned I, unsigned SliceSize,
14570 bool IsNotPoisonous) {
14571 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14572 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14573 }))
14574 return false;
14575 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14576 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14577 if (UserTE->getNumOperands() != 2)
14578 return false;
14579 if (!IsNotPoisonous) {
14580 auto *It =
14581 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14582 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14583 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14584 }) != TE->UserTreeIndices.end();
14585 });
14586 if (It == VectorizableTree.end())
14587 return false;
14588 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14589 if (!(*It)->ReorderIndices.empty()) {
14590 inversePermutation((*It)->ReorderIndices, ReorderMask);
14591 reorderScalars(GS, ReorderMask);
14592 }
14593 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14594 Value *V0 = std::get<0>(P);
14595 Value *V1 = std::get<1>(P);
14596 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14597 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14598 is_contained(E->Scalars, V1));
14599 }))
14600 return false;
14601 }
14602 int Idx;
14603 if ((Mask.size() < InputVF &&
14605 Idx == 0) ||
14606 (Mask.size() == InputVF &&
14607 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14608 std::iota(
14609 std::next(Mask.begin(), I * SliceSize),
14610 std::next(Mask.begin(),
14611 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14612 0);
14613 } else {
14614 unsigned IVal =
14615 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14616 std::fill(
14617 std::next(Mask.begin(), I * SliceSize),
14618 std::next(Mask.begin(),
14619 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14620 IVal);
14621 }
14622 return true;
14623 };
14624 BVTy ShuffleBuilder(ScalarTy, Params...);
14625 ResTy Res = ResTy();
14626  SmallVector<int> Mask;
14627  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14628  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14629  Value *ExtractVecBase = nullptr;
14630  bool UseVecBaseAsInput = false;
14631  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14632  SmallVector<SmallVector<const TreeEntry *>> Entries;
14633  Type *OrigScalarTy = GatheredScalars.front()->getType();
14634 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14635 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14636 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14637 VecTy->getNumElements() % NumParts != 0 ||
14638      !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14639                                VecTy->getNumElements() / NumParts))
14640 NumParts = 1;
14641 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14642 // Check for gathered extracts.
14643 bool Resized = false;
14644 ExtractShuffles =
14645 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14646 if (!ExtractShuffles.empty()) {
14647 SmallVector<const TreeEntry *> ExtractEntries;
14648 for (auto [Idx, I] : enumerate(ExtractMask)) {
14649 if (I == PoisonMaskElem)
14650 continue;
14651 if (const auto *TE = getTreeEntry(
14652 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14653 ExtractEntries.push_back(TE);
14654 }
14655 if (std::optional<ResTy> Delayed =
14656 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14657 // Delay emission of gathers which are not ready yet.
14658 PostponedGathers.insert(E);
14659        // Postpone gather emission; it will be emitted after the end of the
14660        // process to keep the correct order.
14661 return *Delayed;
14662 }
14663 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14664 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14665 ExtractVecBase = VecBase;
14666 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14667 if (VF == VecBaseTy->getNumElements() &&
14668 GatheredScalars.size() != VF) {
14669 Resized = true;
14670 GatheredScalars.append(VF - GatheredScalars.size(),
14671 PoisonValue::get(OrigScalarTy));
14672 }
14673 }
14674 }
14675    // Gather extracts only after we check for fully matched gathers.
14676 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
14677 ((E->getOpcode() == Instruction::Load ||
14678 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14679 any_of(E->Scalars,
14680 [this](Value *V) {
14681 return isa<LoadInst>(V) && getTreeEntry(V);
14682 })) ||
14683 E->isAltShuffle() ||
14684 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14685 isSplat(E->Scalars) ||
14686 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14687 GatherShuffles =
14688 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14689 }
14690 if (!GatherShuffles.empty()) {
14691 if (std::optional<ResTy> Delayed =
14692 ShuffleBuilder.needToDelay(E, Entries)) {
14693 // Delay emission of gathers which are not ready yet.
14694 PostponedGathers.insert(E);
14695        // Postpone gather emission; it will be emitted after the end of the
14696        // process to keep the correct order.
14697 return *Delayed;
14698 }
14699 if (GatherShuffles.size() == 1 &&
14700 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14701 Entries.front().front()->isSame(E->Scalars)) {
14702 // Perfect match in the graph, will reuse the previously vectorized
14703 // node. Cost is 0.
14704 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14705 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14706 // Restore the mask for previous partially matched values.
14707 Mask.resize(E->Scalars.size());
14708 const TreeEntry *FrontTE = Entries.front().front();
14709 if (FrontTE->ReorderIndices.empty() &&
14710 ((FrontTE->ReuseShuffleIndices.empty() &&
14711 E->Scalars.size() == FrontTE->Scalars.size()) ||
14712 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14713 std::iota(Mask.begin(), Mask.end(), 0);
14714 } else {
14715 for (auto [I, V] : enumerate(E->Scalars)) {
14716 if (isa<PoisonValue>(V)) {
14717            Mask[I] = PoisonMaskElem;
14718            continue;
14719 }
14720 Mask[I] = FrontTE->findLaneForValue(V);
14721 }
14722 }
14723 ShuffleBuilder.add(*FrontTE, Mask);
14724 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
14725 SubVectorsMask);
14726 return Res;
14727 }
14728 if (!Resized) {
14729 if (GatheredScalars.size() != VF &&
14730 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14731 return any_of(TEs, [&](const TreeEntry *TE) {
14732 return TE->getVectorFactor() == VF;
14733 });
14734 }))
14735 GatheredScalars.append(VF - GatheredScalars.size(),
14736 PoisonValue::get(OrigScalarTy));
14737 }
14738 // Remove shuffled elements from list of gathers.
14739 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14740 if (Mask[I] != PoisonMaskElem)
14741 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14742 }
14743 }
14744 }
14745 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14746 SmallVectorImpl<int> &ReuseMask,
14747 bool IsRootPoison) {
14748    // For splats we can emit broadcasts instead of gathers, so try to find
14749    // such sequences.
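    // For example, a hypothetical bundle {%x, %x, %x, %x} is packed by placing
    // %x into lane 0 and recording the reuse mask <0, 0, 0, 0>, so one
    // insertelement plus a broadcast-like shuffle replaces four insertions.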
14750 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14751 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14752 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14753 SmallVector<int> UndefPos;
14754 DenseMap<Value *, unsigned> UniquePositions;
14755 // Gather unique non-const values and all constant values.
14756 // For repeated values, just shuffle them.
14757 int NumNonConsts = 0;
14758 int SinglePos = 0;
14759 for (auto [I, V] : enumerate(Scalars)) {
14760 if (isa<UndefValue>(V)) {
14761 if (!isa<PoisonValue>(V)) {
14762 ReuseMask[I] = I;
14763 UndefPos.push_back(I);
14764 }
14765 continue;
14766 }
14767 if (isConstant(V)) {
14768 ReuseMask[I] = I;
14769 continue;
14770 }
14771 ++NumNonConsts;
14772 SinglePos = I;
14773 Value *OrigV = V;
14774 Scalars[I] = PoisonValue::get(OrigScalarTy);
14775 if (IsSplat) {
14776 Scalars.front() = OrigV;
14777 ReuseMask[I] = 0;
14778 } else {
14779 const auto Res = UniquePositions.try_emplace(OrigV, I);
14780 Scalars[Res.first->second] = OrigV;
14781 ReuseMask[I] = Res.first->second;
14782 }
14783 }
14784 if (NumNonConsts == 1) {
14785 // Restore single insert element.
14786 if (IsSplat) {
14787 ReuseMask.assign(VF, PoisonMaskElem);
14788 std::swap(Scalars.front(), Scalars[SinglePos]);
14789 if (!UndefPos.empty() && UndefPos.front() == 0)
14790 Scalars.front() = UndefValue::get(OrigScalarTy);
14791 }
14792 ReuseMask[SinglePos] = SinglePos;
14793 } else if (!UndefPos.empty() && IsSplat) {
14794 // For undef values, try to replace them with the simple broadcast.
14795 // We can do it if the broadcasted value is guaranteed to be
14796 // non-poisonous, or by freezing the incoming scalar value first.
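      // For example, for {undef, %x, undef, %x} where %x is known not to be
      // poison, the undef lanes simply reuse the lane holding %x (after
      // packing, the reuse mask is <0, 0, 0, 0>); if no such scalar exists, the
      // undef lanes become poison and the whole gather is frozen afterwards.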
14797 auto *It = find_if(Scalars, [this, E](Value *V) {
14798 return !isa<UndefValue>(V) &&
14799 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
14800 (E->UserTreeIndices.size() == 1 &&
14801 any_of(V->uses(), [E](const Use &U) {
14802                          // Check if the value is already used in the same
14803                          // operation in one of the nodes.
14804 return E->UserTreeIndices.front().EdgeIdx !=
14805 U.getOperandNo() &&
14806 is_contained(
14807 E->UserTreeIndices.front().UserTE->Scalars,
14808 U.getUser());
14809 })));
14810 });
14811 if (It != Scalars.end()) {
14812 // Replace undefs by the non-poisoned scalars and emit broadcast.
14813 int Pos = std::distance(Scalars.begin(), It);
14814 for (int I : UndefPos) {
14815 // Set the undef position to the non-poisoned scalar.
14816 ReuseMask[I] = Pos;
14817            // Replace the undef with poison; in the mask it has already been
14818            // replaced by the non-poisoned scalar.
14819 if (I != Pos)
14820 Scalars[I] = PoisonValue::get(OrigScalarTy);
14821 }
14822 } else {
14823 // Replace undefs by the poisons, emit broadcast and then emit
14824 // freeze.
14825 for (int I : UndefPos) {
14826 ReuseMask[I] = PoisonMaskElem;
14827 if (isa<UndefValue>(Scalars[I]))
14828 Scalars[I] = PoisonValue::get(OrigScalarTy);
14829 }
14830 NeedFreeze = true;
14831 }
14832 }
14833 };
14834 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
14835 bool IsNonPoisoned = true;
14836 bool IsUsedInExpr = true;
14837 Value *Vec1 = nullptr;
14838 if (!ExtractShuffles.empty()) {
14839      // Gather of extractelements can be represented as just a shuffle of
14840      // the single/two vectors from which the scalars are extracted.
14841 // Find input vectors.
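      // For example, a gather of
      //   {extractelement %v1, 0; extractelement %v2, 1; extractelement %v1, 3}
      // needs no insertelements at all: it becomes a single shufflevector of
      // %v1 and %v2 with the corresponding two-source mask.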
14842 Value *Vec2 = nullptr;
14843 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
14844 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
14845 ExtractMask[I] = PoisonMaskElem;
14846 }
14847 if (UseVecBaseAsInput) {
14848 Vec1 = ExtractVecBase;
14849 } else {
14850 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
14851 if (ExtractMask[I] == PoisonMaskElem)
14852 continue;
14853 if (isa<UndefValue>(E->Scalars[I]))
14854 continue;
14855 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
14856 Value *VecOp = EI->getVectorOperand();
14857 if (const auto *TE = getTreeEntry(VecOp))
14858 if (TE->VectorizedValue)
14859 VecOp = TE->VectorizedValue;
14860 if (!Vec1) {
14861 Vec1 = VecOp;
14862 } else if (Vec1 != VecOp) {
14863 assert((!Vec2 || Vec2 == VecOp) &&
14864 "Expected only 1 or 2 vectors shuffle.");
14865 Vec2 = VecOp;
14866 }
14867 }
14868 }
14869 if (Vec2) {
14870 IsUsedInExpr = false;
14871 IsNonPoisoned &=
14872            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
14873        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
14874 } else if (Vec1) {
14875 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
14876 IsUsedInExpr &= FindReusedSplat(
14877 ExtractMask,
14878 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
14879 ExtractMask.size(), IsNotPoisonedVec);
14880 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
14881 IsNonPoisoned &= IsNotPoisonedVec;
14882 } else {
14883 IsUsedInExpr = false;
14884 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
14885 /*ForExtracts=*/true);
14886 }
14887 }
14888 if (!GatherShuffles.empty()) {
14889 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
14890 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14891 for (const auto [I, TEs] : enumerate(Entries)) {
14892 if (TEs.empty()) {
14893 assert(!GatherShuffles[I] &&
14894 "No shuffles with empty entries list expected.");
14895 continue;
14896 }
14897 assert((TEs.size() == 1 || TEs.size() == 2) &&
14898 "Expected shuffle of 1 or 2 entries.");
14899 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
14900 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
14901 VecMask.assign(VecMask.size(), PoisonMaskElem);
14902 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
14903 if (TEs.size() == 1) {
14904 bool IsNotPoisonedVec =
14905 TEs.front()->VectorizedValue
14906 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
14907 : true;
14908 IsUsedInExpr &=
14909 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
14910 SliceSize, IsNotPoisonedVec);
14911 ShuffleBuilder.add(*TEs.front(), VecMask);
14912 IsNonPoisoned &= IsNotPoisonedVec;
14913 } else {
14914 IsUsedInExpr = false;
14915 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
14916 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
14917 IsNonPoisoned &=
14918 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
14919 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
14920 }
14921 }
14922 }
14923 // Try to figure out best way to combine values: build a shuffle and insert
14924 // elements or just build several shuffles.
14925 // Insert non-constant scalars.
14926 SmallVector<Value *> NonConstants(GatheredScalars);
14927 int EMSz = ExtractMask.size();
14928 int MSz = Mask.size();
14929  // Try to build a constant vector and shuffle with it only if we currently
14930  // have a single permutation and more than 1 scalar constant.
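  // For example, for gathered scalars {%a, 3, %b, 7} a constant vector
  // <poison, 3, poison, 7> can be materialized for free and the non-constant
  // %a/%b inserted or shuffled in afterwards.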
14931 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
14932 bool IsIdentityShuffle =
14933 ((UseVecBaseAsInput ||
14934 all_of(ExtractShuffles,
14935 [](const std::optional<TTI::ShuffleKind> &SK) {
14936 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
14937 TTI::SK_PermuteSingleSrc;
14938 })) &&
14939 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
14940 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
14941 (!GatherShuffles.empty() &&
14942 all_of(GatherShuffles,
14943 [](const std::optional<TTI::ShuffleKind> &SK) {
14944 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
14945 TTI::SK_PermuteSingleSrc;
14946 }) &&
14947 none_of(Mask, [&](int I) { return I >= MSz; }) &&
14948 ShuffleVectorInst::isIdentityMask(Mask, MSz));
14949 bool EnoughConstsForShuffle =
14950 IsSingleShuffle &&
14951 (none_of(GatheredScalars,
14952 [](Value *V) {
14953 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14954 }) ||
14955 any_of(GatheredScalars,
14956 [](Value *V) {
14957 return isa<Constant>(V) && !isa<UndefValue>(V);
14958 })) &&
14959 (!IsIdentityShuffle ||
14960 (GatheredScalars.size() == 2 &&
14961 any_of(GatheredScalars,
14962 [](Value *V) { return !isa<UndefValue>(V); })) ||
14963 count_if(GatheredScalars, [](Value *V) {
14964 return isa<Constant>(V) && !isa<PoisonValue>(V);
14965 }) > 1);
14966 // The NonConstants array contains just the non-constant values; GatheredScalars
14967 // contains only the constants used to build the final vector, which is then shuffled.
14968 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
14969 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
14970 NonConstants[I] = PoisonValue::get(OrigScalarTy);
14971 else
14972 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14973 }
14974 // Generate constants for final shuffle and build a mask for them.
14975 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
14976 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
14977 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
14978 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
14979 ShuffleBuilder.add(BV, BVMask);
14980 }
14981 if (all_of(NonConstants, [=](Value *V) {
14982 return isa<PoisonValue>(V) ||
14983 (IsSingleShuffle && ((IsIdentityShuffle &&
14984 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
14985 }))
14986 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
14987 SubVectorsMask);
14988 else
14989 Res = ShuffleBuilder.finalize(
14990 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
14991 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
14992 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
14993 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
14994 });
14995 } else if (!allConstant(GatheredScalars)) {
14996 // Gather unique scalars and all constants.
14997 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
14998 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
14999 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15000 ShuffleBuilder.add(BV, ReuseMask);
15001 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15002 SubVectorsMask);
15003 } else {
15004 // Gather all constants.
15005 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15006 for (auto [I, V] : enumerate(GatheredScalars)) {
15007 if (!isa<PoisonValue>(V))
15008 Mask[I] = I;
15009 }
15010 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15011 ShuffleBuilder.add(BV, Mask);
15012 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15013 SubVectorsMask);
15014 }
15015
15016 if (NeedFreeze)
15017 Res = ShuffleBuilder.createFreeze(Res);
15018 return Res;
15019}
15020
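// Vectorize any combined sub-entries of the gather node first, then build the
// node itself via processBuildVector using the ShuffleInstructionBuilder.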
15021Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15022 bool PostponedPHIs) {
15023 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15024 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15025 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15026 Builder, *this);
15027}
15028
15029/// \returns \p I after propagating metadata from \p VL only for instructions in
15030/// \p VL.
15031 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15032 SmallVector<Value *, 2> Insts;
15033 for (Value *V : VL)
15034 if (isa<Instruction>(V))
15035 Insts.push_back(V);
15036 return llvm::propagateMetadata(Inst, Insts);
15037}
15038
15039Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15040 IRBuilderBase::InsertPointGuard Guard(Builder);
15041
15042 if (E->VectorizedValue &&
15043 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15044 E->isAltShuffle())) {
15045 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15046 return E->VectorizedValue;
15047 }
15048
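// Determine the scalar and vector types for this entry. If the entry was
// marked for bit-width minimization (MinBWs), the scalar type is replaced by
// the narrower integer type before the vector type is widened from it.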
15049 Value *V = E->Scalars.front();
15050 Type *ScalarTy = V->getType();
15051 if (!isa<CmpInst>(V))
15052 ScalarTy = getValueType(V);
15053 auto It = MinBWs.find(E);
15054 if (It != MinBWs.end()) {
15055 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15056 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15057 if (VecTy)
15058 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15059 }
15060 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15061 if (E->isGather()) {
15062 // Set insert point for non-reduction initial nodes.
15063 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
15064 setInsertPointAfterBundle(E);
15065 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15066 E->VectorizedValue = Vec;
15067 return Vec;
15068 }
15069
15070 bool IsReverseOrder =
15071 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
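// FinalShuffle applies the entry's reordering and reuse-shuffle information
// (plus any combined sub-vectors) to a newly created vector before it is
// recorded as the entry's vectorized value.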
15072 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15073 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15074 if (E->getOpcode() == Instruction::Store &&
15075 E->State == TreeEntry::Vectorize) {
15076 ArrayRef<int> Mask =
15077 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15078 E->ReorderIndices.size());
15079 ShuffleBuilder.add(V, Mask);
15080 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15081 ShuffleBuilder.addOrdered(V, {});
15082 } else {
15083 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15084 }
15085 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15086 E->CombinedEntriesWithIndices.size());
15087 transform(
15088 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15089 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15090 });
15091 assert(
15092 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15093 "Expected either combined subnodes or reordering");
15094 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15095 };
15096
15097 assert(!E->isGather() && "Unhandled state");
15098 unsigned ShuffleOrOp =
15099 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15100 Instruction *VL0 = E->getMainOp();
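// Returns true if the operand at Idx has to be treated as signed when casting
// to the (possibly narrowed) vector type: prefer the MinBWs record, otherwise
// fall back to a non-negativity check of the operand's scalars.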
15101 auto GetOperandSignedness = [&](unsigned Idx) {
15102 const TreeEntry *OpE = getOperandEntry(E, Idx);
15103 bool IsSigned = false;
15104 auto It = MinBWs.find(OpE);
15105 if (It != MinBWs.end())
15106 IsSigned = It->second.second;
15107 else
15108 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15109 if (isa<PoisonValue>(R))
15110 return false;
15111 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15112 });
15113 return IsSigned;
15114 };
15115 switch (ShuffleOrOp) {
15116 case Instruction::PHI: {
15117 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15118 E != VectorizableTree.front().get() ||
15119 !E->UserTreeIndices.empty()) &&
15120 "PHI reordering is free.");
15121 if (PostponedPHIs && E->VectorizedValue)
15122 return E->VectorizedValue;
15123 auto *PH = cast<PHINode>(VL0);
15124 Builder.SetInsertPoint(PH->getParent(),
15125 PH->getParent()->getFirstNonPHIIt());
15126 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15127 if (PostponedPHIs || !E->VectorizedValue) {
15128 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15129 E->PHI = NewPhi;
15130 Value *V = NewPhi;
15131
15132 // Adjust insertion point once all PHI's have been generated.
15133 Builder.SetInsertPoint(PH->getParent(),
15134 PH->getParent()->getFirstInsertionPt());
15135 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15136
15137 V = FinalShuffle(V, E);
15138
15139 E->VectorizedValue = V;
15140 if (PostponedPHIs)
15141 return V;
15142 }
15143 PHINode *NewPhi = cast<PHINode>(E->PHI);
15144 // If the phi node is already fully emitted, exit.
15145 if (NewPhi->getNumIncomingValues() != 0)
15146 return NewPhi;
15147
15148 // PHINodes may have multiple entries from the same block. We want to
15149 // visit every block once.
15150 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15151
15152 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15153 ValueList Operands;
15154 BasicBlock *IBB = PH->getIncomingBlock(I);
15155
15156 // Stop emission if all incoming values are generated.
15157 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15158 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15159 return NewPhi;
15160 }
15161
15162 if (!VisitedBBs.insert(IBB).second) {
15163 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15164 continue;
15165 }
15166
15167 Builder.SetInsertPoint(IBB->getTerminator());
15168 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15169 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15170 if (VecTy != Vec->getType()) {
15171 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15172 MinBWs.contains(getOperandEntry(E, I))) &&
15173 "Expected item in MinBWs.");
15174 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15175 }
15176 NewPhi->addIncoming(Vec, IBB);
15177 }
15178
15179 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15180 "Invalid number of incoming values");
15181 assert(E->VectorizedValue && "Expected vectorized value.");
15182 return E->VectorizedValue;
15183 }
15184
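// The scalars of this node are extractelements from a single source vector;
// reuse that vector (or its vectorized counterpart) and only apply the final
// shuffle.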
15185 case Instruction::ExtractElement: {
15186 Value *V = E->getSingleOperand(0);
15187 if (const TreeEntry *TE = getTreeEntry(V))
15188 V = TE->VectorizedValue;
15189 setInsertPointAfterBundle(E);
15190 V = FinalShuffle(V, E);
15191 E->VectorizedValue = V;
15192 return V;
15193 }
15194 case Instruction::ExtractValue: {
15195 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15196 Builder.SetInsertPoint(LI);
15197 Value *Ptr = LI->getPointerOperand();
15198 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15199 Value *NewV = ::propagateMetadata(V, E->Scalars);
15200 NewV = FinalShuffle(NewV, E);
15201 E->VectorizedValue = NewV;
15202 return NewV;
15203 }
15204 case Instruction::InsertElement: {
15205 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15206 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15207 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15208 ArrayRef<Value *> Op = E->getOperand(1);
15209 Type *ScalarTy = Op.front()->getType();
15210 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15211 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15212 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15213 assert(Res.first > 0 && "Expected item in MinBWs.");
15214 V = Builder.CreateIntCast(
15215 V,
15216 getWidenedType(
15217 ScalarTy,
15218 cast<FixedVectorType>(V->getType())->getNumElements()),
15219 Res.second);
15220 }
15221
15222 // Create InsertVector shuffle if necessary
15223 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15224 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15225 }));
15226 const unsigned NumElts =
15227 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15228 const unsigned NumScalars = E->Scalars.size();
15229
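// Offset is the position of the first inserted element within the destination
// vector; the masks built below are expressed relative to it.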
15230 unsigned Offset = *getElementIndex(VL0);
15231 assert(Offset < NumElts && "Failed to find vector index offset");
15232
15233 // Create shuffle to resize vector
15234 SmallVector<int> Mask;
15235 if (!E->ReorderIndices.empty()) {
15236 inversePermutation(E->ReorderIndices, Mask);
15237 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15238 } else {
15239 Mask.assign(NumElts, PoisonMaskElem);
15240 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15241 }
15242 // Create InsertVector shuffle if necessary
15243 bool IsIdentity = true;
15244 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15245 Mask.swap(PrevMask);
15246 for (unsigned I = 0; I < NumScalars; ++I) {
15247 Value *Scalar = E->Scalars[PrevMask[I]];
15248 unsigned InsertIdx = *getElementIndex(Scalar);
15249 IsIdentity &= InsertIdx - Offset == I;
15250 Mask[InsertIdx - Offset] = I;
15251 }
15252 if (!IsIdentity || NumElts != NumScalars) {
15253 Value *V2 = nullptr;
15254 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
15255 SmallVector<int> InsertMask(Mask);
15256 if (NumElts != NumScalars && Offset == 0) {
15257 // Follow all insert element instructions from the current buildvector
15258 // sequence.
15259 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15260 do {
15261 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15262 if (!InsertIdx)
15263 break;
15264 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15265 InsertMask[*InsertIdx] = *InsertIdx;
15266 if (!Ins->hasOneUse())
15267 break;
15268 Ins = dyn_cast_or_null<InsertElementInst>(
15269 Ins->getUniqueUndroppableUser());
15270 } while (Ins);
15271 SmallBitVector UseMask =
15272 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15273 SmallBitVector IsFirstPoison =
15274 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15275 SmallBitVector IsFirstUndef =
15276 isUndefVector(FirstInsert->getOperand(0), UseMask);
15277 if (!IsFirstPoison.all()) {
15278 unsigned Idx = 0;
15279 for (unsigned I = 0; I < NumElts; I++) {
15280 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15281 IsFirstUndef.test(I)) {
15282 if (IsVNonPoisonous) {
15283 InsertMask[I] = I < NumScalars ? I : 0;
15284 continue;
15285 }
15286 if (!V2)
15287 V2 = UndefValue::get(V->getType());
15288 if (Idx >= NumScalars)
15289 Idx = NumScalars - 1;
15290 InsertMask[I] = NumScalars + Idx;
15291 ++Idx;
15292 } else if (InsertMask[I] != PoisonMaskElem &&
15293 Mask[I] == PoisonMaskElem) {
15294 InsertMask[I] = PoisonMaskElem;
15295 }
15296 }
15297 } else {
15298 InsertMask = Mask;
15299 }
15300 }
15301 if (!V2)
15302 V2 = PoisonValue::get(V->getType());
15303 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15304 if (auto *I = dyn_cast<Instruction>(V)) {
15305 GatherShuffleExtractSeq.insert(I);
15306 CSEBlocks.insert(I->getParent());
15307 }
15308 }
15309
15310 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15311 for (unsigned I = 0; I < NumElts; I++) {
15312 if (Mask[I] != PoisonMaskElem)
15313 InsertMask[Offset + I] = I;
15314 }
15315 SmallBitVector UseMask =
15316 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15317 SmallBitVector IsFirstUndef =
15318 isUndefVector(FirstInsert->getOperand(0), UseMask);
15319 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15320 NumElts != NumScalars) {
15321 if (IsFirstUndef.all()) {
15322 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15323 SmallBitVector IsFirstPoison =
15324 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15325 if (!IsFirstPoison.all()) {
15326 for (unsigned I = 0; I < NumElts; I++) {
15327 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15328 InsertMask[I] = I + NumElts;
15329 }
15330 }
15331 V = Builder.CreateShuffleVector(
15332 V,
15333 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15334 : FirstInsert->getOperand(0),
15335 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15336 if (auto *I = dyn_cast<Instruction>(V)) {
15337 GatherShuffleExtractSeq.insert(I);
15338 CSEBlocks.insert(I->getParent());
15339 }
15340 }
15341 } else {
15342 SmallBitVector IsFirstPoison =
15343 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15344 for (unsigned I = 0; I < NumElts; I++) {
15345 if (InsertMask[I] == PoisonMaskElem)
15346 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15347 else
15348 InsertMask[I] += NumElts;
15349 }
15350 V = Builder.CreateShuffleVector(
15351 FirstInsert->getOperand(0), V, InsertMask,
15352 cast<Instruction>(E->Scalars.back())->getName());
15353 if (auto *I = dyn_cast<Instruction>(V)) {
15354 GatherShuffleExtractSeq.insert(I);
15355 CSEBlocks.insert(I->getParent());
15356 }
15357 }
15358 }
15359
15360 ++NumVectorInstructions;
15361 E->VectorizedValue = V;
15362 return V;
15363 }
15364 case Instruction::ZExt:
15365 case Instruction::SExt:
15366 case Instruction::FPToUI:
15367 case Instruction::FPToSI:
15368 case Instruction::FPExt:
15369 case Instruction::PtrToInt:
15370 case Instruction::IntToPtr:
15371 case Instruction::SIToFP:
15372 case Instruction::UIToFP:
15373 case Instruction::Trunc:
15374 case Instruction::FPTrunc:
15375 case Instruction::BitCast: {
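// The vector cast opcode may differ from the scalar one when the source or
// destination was narrowed via MinBWs; it is recomputed below as a bitcast,
// trunc, or sign/zero extension based on the effective bit widths.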
15376 setInsertPointAfterBundle(E);
15377
15378 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15379 if (E->VectorizedValue) {
15380 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15381 return E->VectorizedValue;
15382 }
15383
15384 auto *CI = cast<CastInst>(VL0);
15385 Instruction::CastOps VecOpcode = CI->getOpcode();
15386 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15387 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15388 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15389 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15390 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15391 // Check if the values are candidates to demote.
15392 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15393 if (SrcIt != MinBWs.end())
15394 SrcBWSz = SrcIt->second.first;
15395 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15396 if (BWSz == SrcBWSz) {
15397 VecOpcode = Instruction::BitCast;
15398 } else if (BWSz < SrcBWSz) {
15399 VecOpcode = Instruction::Trunc;
15400 } else if (It != MinBWs.end()) {
15401 assert(BWSz > SrcBWSz && "Invalid cast!");
15402 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15403 } else if (SrcIt != MinBWs.end()) {
15404 assert(BWSz > SrcBWSz && "Invalid cast!");
15405 VecOpcode =
15406 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15407 }
15408 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15409 !SrcIt->second.second) {
15410 VecOpcode = Instruction::UIToFP;
15411 }
15412 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15413 ? InVec
15414 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15415 V = FinalShuffle(V, E);
15416
15417 E->VectorizedValue = V;
15418 ++NumVectorInstructions;
15419 return V;
15420 }
15421 case Instruction::FCmp:
15422 case Instruction::ICmp: {
15423 setInsertPointAfterBundle(E);
15424
15425 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15426 if (E->VectorizedValue) {
15427 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15428 return E->VectorizedValue;
15429 }
15430 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15431 if (E->VectorizedValue) {
15432 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15433 return E->VectorizedValue;
15434 }
15435 if (L->getType() != R->getType()) {
15436 assert((getOperandEntry(E, 0)->isGather() ||
15437 getOperandEntry(E, 1)->isGather() ||
15438 MinBWs.contains(getOperandEntry(E, 0)) ||
15439 MinBWs.contains(getOperandEntry(E, 1))) &&
15440 "Expected item in MinBWs.");
15441 if (cast<VectorType>(L->getType())
15442 ->getElementType()
15443 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15444 ->getElementType()
15445 ->getIntegerBitWidth()) {
15446 Type *CastTy = R->getType();
15447 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15448 } else {
15449 Type *CastTy = L->getType();
15450 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15451 }
15452 }
15453
15454 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15455 Value *V = Builder.CreateCmp(P0, L, R);
15456 propagateIRFlags(V, E->Scalars, VL0);
15457 // Do not cast for cmps.
15458 VecTy = cast<FixedVectorType>(V->getType());
15459 V = FinalShuffle(V, E);
15460
15461 E->VectorizedValue = V;
15462 ++NumVectorInstructions;
15463 return V;
15464 }
15465 case Instruction::Select: {
15466 setInsertPointAfterBundle(E);
15467
15468 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15469 if (E->VectorizedValue) {
15470 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15471 return E->VectorizedValue;
15472 }
15473 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15474 if (E->VectorizedValue) {
15475 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15476 return E->VectorizedValue;
15477 }
15478 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15479 if (E->VectorizedValue) {
15480 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15481 return E->VectorizedValue;
15482 }
15483 if (True->getType() != VecTy || False->getType() != VecTy) {
15484 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15485 getOperandEntry(E, 2)->isGather() ||
15486 MinBWs.contains(getOperandEntry(E, 1)) ||
15487 MinBWs.contains(getOperandEntry(E, 2))) &&
15488 "Expected item in MinBWs.");
15489 if (True->getType() != VecTy)
15490 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15491 if (False->getType() != VecTy)
15492 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15493 }
15494
15495 unsigned CondNumElements = getNumElements(Cond->getType());
15496 unsigned TrueNumElements = getNumElements(True->getType());
15497 assert(TrueNumElements >= CondNumElements &&
15498 TrueNumElements % CondNumElements == 0 &&
15499 "Cannot vectorize Instruction::Select");
15500 assert(TrueNumElements == getNumElements(False->getType()) &&
15501 "Cannot vectorize Instruction::Select");
15502 if (CondNumElements != TrueNumElements) {
15503 // When the return type is i1 but the source is fixed vector type, we
15504 // need to duplicate the condition value.
15505 Cond = Builder.CreateShuffleVector(
15506 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15507 CondNumElements));
15508 }
15509 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15510 "Cannot vectorize Instruction::Select");
15511 Value *V = Builder.CreateSelect(Cond, True, False);
15512 V = FinalShuffle(V, E);
15513
15514 E->VectorizedValue = V;
15515 ++NumVectorInstructions;
15516 return V;
15517 }
15518 case Instruction::FNeg: {
15519 setInsertPointAfterBundle(E);
15520
15521 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15522
15523 if (E->VectorizedValue) {
15524 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15525 return E->VectorizedValue;
15526 }
15527
15528 Value *V = Builder.CreateUnOp(
15529 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15530 propagateIRFlags(V, E->Scalars, VL0);
15531 if (auto *I = dyn_cast<Instruction>(V))
15532 V = ::propagateMetadata(I, E->Scalars);
15533
15534 V = FinalShuffle(V, E);
15535
15536 E->VectorizedValue = V;
15537 ++NumVectorInstructions;
15538
15539 return V;
15540 }
15541 case Instruction::Freeze: {
15542 setInsertPointAfterBundle(E);
15543
15544 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15545
15546 if (E->VectorizedValue) {
15547 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15548 return E->VectorizedValue;
15549 }
15550
15551 if (Op->getType() != VecTy) {
15552 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15553 MinBWs.contains(getOperandEntry(E, 0))) &&
15554 "Expected item in MinBWs.");
15555 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15556 }
15557 Value *V = Builder.CreateFreeze(Op);
15558 V = FinalShuffle(V, E);
15559
15560 E->VectorizedValue = V;
15561 ++NumVectorInstructions;
15562
15563 return V;
15564 }
15565 case Instruction::Add:
15566 case Instruction::FAdd:
15567 case Instruction::Sub:
15568 case Instruction::FSub:
15569 case Instruction::Mul:
15570 case Instruction::FMul:
15571 case Instruction::UDiv:
15572 case Instruction::SDiv:
15573 case Instruction::FDiv:
15574 case Instruction::URem:
15575 case Instruction::SRem:
15576 case Instruction::FRem:
15577 case Instruction::Shl:
15578 case Instruction::LShr:
15579 case Instruction::AShr:
15580 case Instruction::And:
15581 case Instruction::Or:
15582 case Instruction::Xor: {
15583 setInsertPointAfterBundle(E);
15584
15585 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15586 if (E->VectorizedValue) {
15587 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15588 return E->VectorizedValue;
15589 }
15590 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15591 if (E->VectorizedValue) {
15592 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15593 return E->VectorizedValue;
15594 }
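// Special case for 'and' with a minimized bit width: if one operand is all
// constants with at least It->second.first trailing ones, the 'and' is a no-op
// on the demoted width and the other operand can be used directly.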
15595 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15596 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15597 ArrayRef<Value *> Ops = E->getOperand(I);
15598 if (all_of(Ops, [&](Value *Op) {
15599 auto *CI = dyn_cast<ConstantInt>(Op);
15600 return CI && CI->getValue().countr_one() >= It->second.first;
15601 })) {
15602 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15603 E->VectorizedValue = V;
15604 ++NumVectorInstructions;
15605 return V;
15606 }
15607 }
15608 }
15609 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15610 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15611 getOperandEntry(E, 1)->isGather() ||
15612 MinBWs.contains(getOperandEntry(E, 0)) ||
15613 MinBWs.contains(getOperandEntry(E, 1))) &&
15614 "Expected item in MinBWs.");
15615 if (LHS->getType() != VecTy)
15616 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15617 if (RHS->getType() != VecTy)
15618 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15619 }
15620
15621 Value *V = Builder.CreateBinOp(
15622 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15623 RHS);
15624 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15625 if (auto *I = dyn_cast<Instruction>(V)) {
15626 V = ::propagateMetadata(I, E->Scalars);
15627 // Drop nuw flags for abs(sub(commutative), true).
15628 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15629 any_of(E->Scalars, [](Value *V) {
15630 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15631 }))
15632 I->setHasNoUnsignedWrap(/*b=*/false);
15633 }
15634
15635 V = FinalShuffle(V, E);
15636
15637 E->VectorizedValue = V;
15638 ++NumVectorInstructions;
15639
15640 return V;
15641 }
15642 case Instruction::Load: {
15643 // Loads are inserted at the head of the tree because we don't want to
15644 // sink them all the way down past store instructions.
15645 setInsertPointAfterBundle(E);
15646
15647 LoadInst *LI = cast<LoadInst>(VL0);
15648 Instruction *NewLI;
15649 Value *PO = LI->getPointerOperand();
15650 if (E->State == TreeEntry::Vectorize) {
15651 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15652 } else if (E->State == TreeEntry::StridedVectorize) {
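// Strided load: if the distance between the first and last pointer is known,
// use a constant stride, otherwise compute a run-time stride, and emit
// llvm.experimental.vp.strided.load with the common alignment of the scalars.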
15653 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15654 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15655 PO = IsReverseOrder ? PtrN : Ptr0;
15656 std::optional<int> Diff = getPointersDiff(
15657 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15658 Type *StrideTy = DL->getIndexType(PO->getType());
15659 Value *StrideVal;
15660 if (Diff) {
15661 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15662 StrideVal =
15663 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15664 DL->getTypeAllocSize(ScalarTy));
15665 } else {
15666 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15667 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15668 return cast<LoadInst>(V)->getPointerOperand();
15669 });
15670 OrdersType Order;
15671 std::optional<Value *> Stride =
15672 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15673 &*Builder.GetInsertPoint());
15674 Value *NewStride =
15675 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15676 StrideVal = Builder.CreateMul(
15677 NewStride,
15678 ConstantInt::get(
15679 StrideTy,
15680 (IsReverseOrder ? -1 : 1) *
15681 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15682 }
15683 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15684 auto *Inst = Builder.CreateIntrinsic(
15685 Intrinsic::experimental_vp_strided_load,
15686 {VecTy, PO->getType(), StrideTy},
15687 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15688 Builder.getInt32(E->Scalars.size())});
15689 Inst->addParamAttr(
15690 /*ArgNo=*/0,
15691 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15692 NewLI = Inst;
15693 } else {
15694 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
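// Non-consecutive loads: vectorize the pointer operand and emit a masked
// gather using the minimum alignment of the scalar loads.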
15695 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15696 if (E->VectorizedValue) {
15697 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15698 return E->VectorizedValue;
15699 }
15700 if (isa<FixedVectorType>(ScalarTy)) {
15701 assert(SLPReVec && "FixedVectorType is not expected.");
15702 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
15703 // to expand VecPtr if ScalarTy is a vector type.
15704 unsigned ScalarTyNumElements =
15705 cast<FixedVectorType>(ScalarTy)->getNumElements();
15706 unsigned VecTyNumElements =
15707 cast<FixedVectorType>(VecTy)->getNumElements();
15708 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15709 "Cannot expand getelementptr.");
15710 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15711 SmallVector<Constant *> Indices(VecTyNumElements);
15712 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15713 return Builder.getInt64(I % ScalarTyNumElements);
15714 });
15715 VecPtr = Builder.CreateGEP(
15716 VecTy->getElementType(),
15717 Builder.CreateShuffleVector(
15718 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15719 ConstantVector::get(Indices));
15720 }
15721 // Use the minimum alignment of the gathered loads.
15722 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15723 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15724 }
15725 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15726
15727 V = FinalShuffle(V, E);
15728 E->VectorizedValue = V;
15729 ++NumVectorInstructions;
15730 return V;
15731 }
15732 case Instruction::Store: {
15733 auto *SI = cast<StoreInst>(VL0);
15734
15735 setInsertPointAfterBundle(E);
15736
15737 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15738 if (VecValue->getType() != VecTy)
15739 VecValue =
15740 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15741 VecValue = FinalShuffle(VecValue, E);
15742
15743 Value *Ptr = SI->getPointerOperand();
15744 Instruction *ST;
15745 if (E->State == TreeEntry::Vectorize) {
15746 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15747 } else {
15748 assert(E->State == TreeEntry::StridedVectorize &&
15749 "Expected either strided or consecutive stores.");
15750 if (!E->ReorderIndices.empty()) {
15751 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15752 Ptr = SI->getPointerOperand();
15753 }
15754 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15755 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15756 auto *Inst = Builder.CreateIntrinsic(
15757 Intrinsic::experimental_vp_strided_store,
15758 {VecTy, Ptr->getType(), StrideTy},
15759 {VecValue, Ptr,
15760 ConstantInt::get(
15761 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15762 Builder.getAllOnesMask(VecTy->getElementCount()),
15763 Builder.getInt32(E->Scalars.size())});
15764 Inst->addParamAttr(
15765 /*ArgNo=*/1,
15766 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15767 ST = Inst;
15768 }
15769
15770 Value *V = ::propagateMetadata(ST, E->Scalars);
15771
15772 E->VectorizedValue = V;
15773 ++NumVectorInstructions;
15774 return V;
15775 }
15776 case Instruction::GetElementPtr: {
15777 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15778 setInsertPointAfterBundle(E);
15779
15780 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15781 if (E->VectorizedValue) {
15782 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15783 return E->VectorizedValue;
15784 }
15785
15786 SmallVector<Value *> OpVecs;
15787 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
15788 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
15789 if (E->VectorizedValue) {
15790 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15791 return E->VectorizedValue;
15792 }
15793 OpVecs.push_back(OpVec);
15794 }
15795
15796 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
15797 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
15798 SmallVector<Value *> GEPs;
15799 for (Value *V : E->Scalars) {
15800 if (isa<GetElementPtrInst>(V))
15801 GEPs.push_back(V);
15802 }
15803 V = ::propagateMetadata(I, GEPs);
15804 }
15805
15806 V = FinalShuffle(V, E);
15807
15808 E->VectorizedValue = V;
15809 ++NumVectorInstructions;
15810
15811 return V;
15812 }
15813 case Instruction::Call: {
15814 CallInst *CI = cast<CallInst>(VL0);
15815 setInsertPointAfterBundle(E);
15816
15817 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15818
15819 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15820 CI, ID, VecTy->getNumElements(),
15821 It != MinBWs.end() ? It->second.first : 0, TTI);
15822 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15823 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
15824 VecCallCosts.first <= VecCallCosts.second;
15825
15826 Value *ScalarArg = nullptr;
15827 SmallVector<Value *> OpVecs;
15828 SmallVector<Type *, 2> TysForDecl;
15829 // Add return type if intrinsic is overloaded on it.
15830 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
15831 TysForDecl.push_back(VecTy);
15832 auto *CEI = cast<CallInst>(VL0);
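// Arguments that the intrinsic requires to stay scalar are passed through
// unvectorized; all other arguments are vectorized and, if the tree was
// narrowed, cast to the expected vector type.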
15833 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
15834 ValueList OpVL;
15835 // Some intrinsics have scalar arguments. This argument should not be
15836 // vectorized.
15837 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
15838 ScalarArg = CEI->getArgOperand(I);
15839 // if decided to reduce bitwidth of abs intrinsic, it second argument
15840 // must be set false (do not return poison, if value issigned min).
15841 if (ID == Intrinsic::abs && It != MinBWs.end() &&
15842 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
15843 ScalarArg = Builder.getFalse();
15844 OpVecs.push_back(ScalarArg);
15845 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
15846 TysForDecl.push_back(ScalarArg->getType());
15847 continue;
15848 }
15849
15850 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
15851 if (E->VectorizedValue) {
15852 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15853 return E->VectorizedValue;
15854 }
15855 ScalarArg = CEI->getArgOperand(I);
15856 if (cast<VectorType>(OpVec->getType())->getElementType() !=
15857 ScalarArg->getType()->getScalarType() &&
15858 It == MinBWs.end()) {
15859 auto *CastTy =
15860 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
15861 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
15862 } else if (It != MinBWs.end()) {
15863 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
15864 }
15865 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
15866 OpVecs.push_back(OpVec);
15867 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
15868 TysForDecl.push_back(OpVec->getType());
15869 }
15870
15871 Function *CF;
15872 if (!UseIntrinsic) {
15873 VFShape Shape =
15874 VFShape::get(CI->getFunctionType(),
15875 ElementCount::getFixed(
15876 static_cast<unsigned>(VecTy->getNumElements())),
15877 false /*HasGlobalPred*/);
15878 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
15879 } else {
15880 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
15881 }
15882
15883 SmallVector<OperandBundleDef, 1> OpBundles;
15884 CI->getOperandBundlesAsDefs(OpBundles);
15885 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
15886
15887 propagateIRFlags(V, E->Scalars, VL0);
15888 V = FinalShuffle(V, E);
15889
15890 E->VectorizedValue = V;
15891 ++NumVectorInstructions;
15892 return V;
15893 }
15894 case Instruction::ShuffleVector: {
15895 Value *V;
15896 if (SLPReVec && !E->isAltShuffle()) {
15897 setInsertPointAfterBundle(E);
15898 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
15899 if (E->VectorizedValue) {
15900 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15901 return E->VectorizedValue;
15902 }
15903 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
15904 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
15905 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
15906 "Not supported shufflevector usage.");
15907 SmallVector<int> NewMask(ThisMask.size());
15908 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
15909 return SVSrc->getShuffleMask()[Mask];
15910 });
15911 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
15912 } else {
15913 V = Builder.CreateShuffleVector(Src, ThisMask);
15914 }
15915 propagateIRFlags(V, E->Scalars, VL0);
15916 if (auto *I = dyn_cast<Instruction>(V))
15917 V = ::propagateMetadata(I, E->Scalars);
15918 V = FinalShuffle(V, E);
15919 } else {
15920 assert(E->isAltShuffle() &&
15921 ((Instruction::isBinaryOp(E->getOpcode()) &&
15922 Instruction::isBinaryOp(E->getAltOpcode())) ||
15923 (Instruction::isCast(E->getOpcode()) &&
15924 Instruction::isCast(E->getAltOpcode())) ||
15925 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15926 "Invalid Shuffle Vector Operand");
15927
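// For alternate-opcode nodes, both the main and the alternate operation are
// created over the whole vector and then blended with a shufflevector whose
// mask is produced by buildAltOpShuffleMask.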
15928 Value *LHS = nullptr, *RHS = nullptr;
15929 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
15930 setInsertPointAfterBundle(E);
15931 LHS = vectorizeOperand(E, 0, PostponedPHIs);
15932 if (E->VectorizedValue) {
15933 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15934 return E->VectorizedValue;
15935 }
15936 RHS = vectorizeOperand(E, 1, PostponedPHIs);
15937 } else {
15938 setInsertPointAfterBundle(E);
15939 LHS = vectorizeOperand(E, 0, PostponedPHIs);
15940 }
15941 if (E->VectorizedValue) {
15942 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15943 return E->VectorizedValue;
15944 }
15945 if (LHS && RHS &&
15946 ((Instruction::isBinaryOp(E->getOpcode()) &&
15947 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
15948 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
15949 assert((It != MinBWs.end() ||
15950 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
15951 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
15952 MinBWs.contains(getOperandEntry(E, 0)) ||
15953 MinBWs.contains(getOperandEntry(E, 1))) &&
15954 "Expected item in MinBWs.");
15955 Type *CastTy = VecTy;
15956 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
15957 if (cast<VectorType>(LHS->getType())
15958 ->getElementType()
15959 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
15960 ->getElementType()
15961 ->getIntegerBitWidth())
15962 CastTy = RHS->getType();
15963 else
15964 CastTy = LHS->getType();
15965 }
15966 if (LHS->getType() != CastTy)
15967 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
15968 if (RHS->getType() != CastTy)
15969 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
15970 }
15971
15972 Value *V0, *V1;
15973 if (Instruction::isBinaryOp(E->getOpcode())) {
15974 V0 = Builder.CreateBinOp(
15975 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
15976 V1 = Builder.CreateBinOp(
15977 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
15978 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15979 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
15980 auto *AltCI = cast<CmpInst>(E->getAltOp());
15981 CmpInst::Predicate AltPred = AltCI->getPredicate();
15982 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
15983 } else {
15984 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
15985 unsigned SrcBWSz = DL->getTypeSizeInBits(
15986 cast<VectorType>(LHS->getType())->getElementType());
15987 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15988 if (BWSz <= SrcBWSz) {
15989 if (BWSz < SrcBWSz)
15990 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
15991 assert(LHS->getType() == VecTy &&
15992 "Expected same type as operand.");
15993 if (auto *I = dyn_cast<Instruction>(LHS))
15994 LHS = ::propagateMetadata(I, E->Scalars);
15995 LHS = FinalShuffle(LHS, E);
15996 E->VectorizedValue = LHS;
15997 ++NumVectorInstructions;
15998 return LHS;
15999 }
16000 }
16001 V0 = Builder.CreateCast(
16002 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16003 V1 = Builder.CreateCast(
16004 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16005 }
16006 // Add V0 and V1 to later analysis to try to find and remove matching
16007 // instruction, if any.
16008 for (Value *V : {V0, V1}) {
16009 if (auto *I = dyn_cast<Instruction>(V)) {
16010 GatherShuffleExtractSeq.insert(I);
16011 CSEBlocks.insert(I->getParent());
16012 }
16013 }
16014
16015 // Create shuffle to take alternate operations from the vector.
16016 // Also, gather up main and alt scalar ops to propagate IR flags to
16017 // each vector operation.
16018 ValueList OpScalars, AltScalars;
16019 SmallVector<int> Mask;
16020 E->buildAltOpShuffleMask(
16021 [E, this](Instruction *I) {
16022 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16023 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16024 *TLI);
16025 },
16026 Mask, &OpScalars, &AltScalars);
16027
16028 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16029 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16030 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16031 // Drop nuw flags for abs(sub(commutative), true).
16032 if (auto *I = dyn_cast<Instruction>(Vec);
16033 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16034 any_of(E->Scalars, [](Value *V) {
16035 if (isa<PoisonValue>(V))
16036 return false;
16037 auto *IV = cast<Instruction>(V);
16038 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16039 }))
16040 I->setHasNoUnsignedWrap(/*b=*/false);
16041 };
16042 DropNuwFlag(V0, E->getOpcode());
16043 DropNuwFlag(V1, E->getAltOpcode());
16044
16045 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16046 assert(SLPReVec && "FixedVectorType is not expected.");
16047 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16048 }
16049 V = Builder.CreateShuffleVector(V0, V1, Mask);
16050 if (auto *I = dyn_cast<Instruction>(V)) {
16051 V = ::propagateMetadata(I, E->Scalars);
16052 GatherShuffleExtractSeq.insert(I);
16053 CSEBlocks.insert(I->getParent());
16054 }
16055 }
16056
16057 E->VectorizedValue = V;
16058 ++NumVectorInstructions;
16059
16060 return V;
16061 }
16062 default:
16063 llvm_unreachable("unknown inst");
16064 }
16065 return nullptr;
16066}
16067
16068 Value *BoUpSLP::vectorizeTree() {
16069 ExtraValueToDebugLocsMap ExternallyUsedValues;
16070 return vectorizeTree(ExternallyUsedValues);
16071}
16072
16073Value *
16074 BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16075 Instruction *ReductionRoot) {
16076 // All blocks must be scheduled before any instructions are inserted.
16077 for (auto &BSIter : BlocksSchedules) {
16078 scheduleBlock(BSIter.second.get());
16079 }
16080 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
16081 // need to rebuild it.
16082 EntryToLastInstruction.clear();
16083
16084 if (ReductionRoot)
16085 Builder.SetInsertPoint(ReductionRoot->getParent(),
16086 ReductionRoot->getIterator());
16087 else
16088 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16089
16090 // Emit gathered loads first to emit better code for the users of those
16091 // gathered loads.
16092 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16093 if (GatheredLoadsEntriesFirst.has_value() &&
16094 TE->Idx >= *GatheredLoadsEntriesFirst &&
16095 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16096 assert((!TE->UserTreeIndices.empty() ||
16097 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16098 "Expected gathered load node.");
16099 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16100 }
16101 }
16102 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
16103 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16104 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16105 if (TE->State == TreeEntry::Vectorize &&
16106 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16107 TE->VectorizedValue)
16108 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16109 // Run through the list of postponed gathers and emit them, replacing the temp
16110 // emitted allocas with actual vector instructions.
16111 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16112 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16113 for (const TreeEntry *E : PostponedNodes) {
16114 auto *TE = const_cast<TreeEntry *>(E);
16115 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16116 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16117 TE->UserTreeIndices.front().EdgeIdx)) &&
16118 VecTE->isSame(TE->Scalars))
16119 // Found a gather node that is exactly the same as one of the
16120 // vectorized nodes. This may happen after reordering.
16121 continue;
16122 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16123 TE->VectorizedValue = nullptr;
16124 auto *UserI =
16125 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16126 // If the user is a PHI node, its vector code has to be inserted right before
16127 // the block terminator. Since the node was delayed, there were some unresolved
16128 // dependencies at the moment the stub instruction was emitted. If any of these
16129 // dependencies turns out to be an operand of another PHI coming from this same
16130 // block, the position of the stub instruction becomes invalid. This is because
16131 // the source vector that is supposed to feed this gather node was inserted at
16132 // the end of the block [after the stub instruction]. So we need to adjust the
16133 // insertion point again to the end of the block.
16134 if (isa<PHINode>(UserI)) {
16135 // Insert before all users.
16136 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16137 for (User *U : PrevVec->users()) {
16138 if (U == UserI)
16139 continue;
16140 auto *UI = dyn_cast<Instruction>(U);
16141 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16142 continue;
16143 if (UI->comesBefore(InsertPt))
16144 InsertPt = UI;
16145 }
16146 Builder.SetInsertPoint(InsertPt);
16147 } else {
16148 Builder.SetInsertPoint(PrevVec);
16149 }
16150 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16151 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16152 if (auto *VecI = dyn_cast<Instruction>(Vec);
16153 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16154 Builder.GetInsertPoint()->comesBefore(VecI))
16155 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16156 Builder.GetInsertPoint());
16157 if (Vec->getType() != PrevVec->getType()) {
16158 assert(Vec->getType()->isIntOrIntVectorTy() &&
16159 PrevVec->getType()->isIntOrIntVectorTy() &&
16160 "Expected integer vector types only.");
16161 std::optional<bool> IsSigned;
16162 for (Value *V : TE->Scalars) {
16163 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16164 auto It = MinBWs.find(BaseTE);
16165 if (It != MinBWs.end()) {
16166 IsSigned = IsSigned.value_or(false) || It->second.second;
16167 if (*IsSigned)
16168 break;
16169 }
16170 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16171 auto It = MinBWs.find(MNTE);
16172 if (It != MinBWs.end()) {
16173 IsSigned = IsSigned.value_or(false) || It->second.second;
16174 if (*IsSigned)
16175 break;
16176 }
16177 }
16178 if (IsSigned.value_or(false))
16179 break;
16180 // Scan through gather nodes.
16181 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16182 auto It = MinBWs.find(BVE);
16183 if (It != MinBWs.end()) {
16184 IsSigned = IsSigned.value_or(false) || It->second.second;
16185 if (*IsSigned)
16186 break;
16187 }
16188 }
16189 if (IsSigned.value_or(false))
16190 break;
16191 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16192 IsSigned =
16193 IsSigned.value_or(false) ||
16194 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16195 continue;
16196 }
16197 if (IsSigned.value_or(false))
16198 break;
16199 }
16200 }
16201 if (IsSigned.value_or(false)) {
16202 // Final attempt - check user node.
16203 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16204 if (It != MinBWs.end())
16205 IsSigned = It->second.second;
16206 }
16207 assert(IsSigned &&
16208 "Expected user node or perfect diamond match in MinBWs.");
16209 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16210 }
16211 PrevVec->replaceAllUsesWith(Vec);
16212 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16213 // Replace the stub vector node, if it was used before for one of the
16214 // buildvector nodes already.
16215 auto It = PostponedValues.find(PrevVec);
16216 if (It != PostponedValues.end()) {
16217 for (TreeEntry *VTE : It->getSecond())
16218 VTE->VectorizedValue = Vec;
16219 }
16220 eraseInstruction(PrevVec);
16221 }
16222
16223 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16224 << " values.\n");
16225
16226 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16227 // Maps vector instruction to original insertelement instruction
16228 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16229 // Maps extract Scalar to the corresponding extractelement instruction in the
16230 // basic block. Only one extractelement per block should be emitted.
16231 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16232 ScalarToEEs;
16233 SmallDenseSet<Value *, 4> UsedInserts;
16234 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
16235 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16236 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16237 // Extract all of the elements with the external uses.
16238 for (const auto &ExternalUse : ExternalUses) {
16239 Value *Scalar = ExternalUse.Scalar;
16240 llvm::User *User = ExternalUse.User;
16241
16242 // Skip users that we already RAUW. This happens when one instruction
16243 // has multiple uses of the same value.
16244 if (User && !is_contained(Scalar->users(), User))
16245 continue;
16246 TreeEntry *E = getTreeEntry(Scalar);
16247 assert(E && "Invalid scalar");
16248 assert(!E->isGather() && "Extracting from a gather list");
16249 // Non-instruction pointers are not deleted, just skip them.
16250 if (E->getOpcode() == Instruction::GetElementPtr &&
16251 !isa<GetElementPtrInst>(Scalar))
16252 continue;
16253
16254 Value *Vec = E->VectorizedValue;
16255 assert(Vec && "Can't find vectorizable value");
16256
16257 Value *Lane = Builder.getInt32(ExternalUse.Lane);
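// Emits (or reuses) an extractelement of Scalar from its vectorized value
// and, if the tree was narrowed, casts the extracted value back to the
// original scalar type.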
16258 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16259 if (Scalar->getType() != Vec->getType()) {
16260 Value *Ex = nullptr;
16261 Value *ExV = nullptr;
16262 auto *Inst = dyn_cast<Instruction>(Scalar);
16263 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16264 auto It = ScalarToEEs.find(Scalar);
16265 if (It != ScalarToEEs.end()) {
16266 // No need to emit many extracts, just move the only one in the
16267 // current block.
16268 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16269 : Builder.GetInsertBlock());
16270 if (EEIt != It->second.end()) {
16271 Value *PrevV = EEIt->second.first;
16272 if (auto *I = dyn_cast<Instruction>(PrevV);
16273 I && !ReplaceInst &&
16274 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16275 Builder.GetInsertPoint()->comesBefore(I)) {
16276 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16277 Builder.GetInsertPoint());
16278 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16279 CI->moveAfter(I);
16280 }
16281 Ex = PrevV;
16282 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16283 }
16284 }
16285 if (!Ex) {
16286 // "Reuse" the existing extract to improve final codegen.
16287 if (ReplaceInst) {
16288 // Leave the instruction as is if extracting it is cheaper and all
16289 // its operands are scalar.
16290 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16291 IgnoredExtracts.insert(EE);
16292 Ex = EE;
16293 } else {
16294 auto *CloneInst = Inst->clone();
16295 CloneInst->insertBefore(Inst);
16296 if (Inst->hasName())
16297 CloneInst->takeName(Inst);
16298 Ex = CloneInst;
16299 }
16300 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16301 ES && isa<Instruction>(Vec)) {
16302 Value *V = ES->getVectorOperand();
16303 auto *IVec = cast<Instruction>(Vec);
16304 if (const TreeEntry *ETE = getTreeEntry(V))
16305 V = ETE->VectorizedValue;
16306 if (auto *IV = dyn_cast<Instruction>(V);
16307 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16308 IV->comesBefore(IVec))
16309 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16310 else
16311 Ex = Builder.CreateExtractElement(Vec, Lane);
16312 } else if (auto *VecTy =
16313 dyn_cast<FixedVectorType>(Scalar->getType())) {
16314 assert(SLPReVec && "FixedVectorType is not expected.");
16315 unsigned VecTyNumElements = VecTy->getNumElements();
16316 // When REVEC is enabled, we need to extract a vector.
16317 // Note: The element size of Scalar may be different from the
16318 // element size of Vec.
16319 Ex = Builder.CreateExtractVector(
16320 getWidenedType(Vec->getType()->getScalarType(),
16321 VecTyNumElements),
16322 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
16323 } else {
16324 Ex = Builder.CreateExtractElement(Vec, Lane);
16325 }
16326 // If necessary, sign-extend or zero-extend ScalarRoot
16327 // to the larger type.
16328 ExV = Ex;
16329 if (Scalar->getType() != Ex->getType())
16330 ExV = Builder.CreateIntCast(
16331 Ex, Scalar->getType(),
16332 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16333 auto *I = dyn_cast<Instruction>(Ex);
16334 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16335 : &F->getEntryBlock(),
16336 std::make_pair(Ex, ExV));
16337 }
16338 // The then-branch of the previous if may produce constants, since
16339 // operand 0 might be a constant.
16340 if (auto *ExI = dyn_cast<Instruction>(Ex);
16341 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16342 GatherShuffleExtractSeq.insert(ExI);
16343 CSEBlocks.insert(ExI->getParent());
16344 }
16345 return ExV;
16346 }
16347 assert(isa<FixedVectorType>(Scalar->getType()) &&
16348 isa<InsertElementInst>(Scalar) &&
16349 "In-tree scalar of vector type is not insertelement?");
16350 auto *IE = cast<InsertElementInst>(Scalar);
16351 VectorToInsertElement.try_emplace(Vec, IE);
16352 return Vec;
16353 };
16354 // If User == nullptr, the Scalar remains as scalar in vectorized
16355 // instructions or is used as extra arg. Generate ExtractElement instruction
16356 // and update the record for this scalar in ExternallyUsedValues.
16357 if (!User) {
16358 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16359 continue;
16360 assert((ExternallyUsedValues.count(Scalar) ||
16361 Scalar->hasNUsesOrMore(UsesLimit) ||
16362 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16363 any_of(Scalar->users(),
16364 [&](llvm::User *U) {
16365 if (ExternalUsesAsOriginalScalar.contains(U))
16366 return true;
16367 TreeEntry *UseEntry = getTreeEntry(U);
16368 return UseEntry &&
16369 (UseEntry->State == TreeEntry::Vectorize ||
16370 UseEntry->State ==
16371 TreeEntry::StridedVectorize) &&
16372 (E->State == TreeEntry::Vectorize ||
16373 E->State == TreeEntry::StridedVectorize) &&
16374 doesInTreeUserNeedToExtract(
16375 Scalar, getRootEntryInstruction(*UseEntry),
16376 TLI, TTI);
16377 })) &&
16378 "Scalar with nullptr User must be registered in "
16379 "ExternallyUsedValues map or remain as scalar in vectorized "
16380 "instructions");
16381 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16382 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16383 if (PHI->getParent()->isLandingPad())
16384 Builder.SetInsertPoint(
16385 PHI->getParent(),
16386 std::next(
16387 PHI->getParent()->getLandingPadInst()->getIterator()));
16388 else
16389 Builder.SetInsertPoint(PHI->getParent(),
16390 PHI->getParent()->getFirstNonPHIIt());
16391 } else {
16392 Builder.SetInsertPoint(VecI->getParent(),
16393 std::next(VecI->getIterator()));
16394 }
16395 } else {
16396 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16397 }
16398 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16399 // Required to update internally referenced instructions.
16400 if (Scalar != NewInst) {
16401 assert((!isa<ExtractElementInst>(Scalar) ||
16402 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16403 "Extractelements should not be replaced.");
16404 Scalar->replaceAllUsesWith(NewInst);
16405 }
16406 continue;
16407 }
16408
16409 if (auto *VU = dyn_cast<InsertElementInst>(User);
16410 VU && VU->getOperand(1) == Scalar) {
16411 // Skip if the scalar is another vector op or Vec is not an instruction.
16412 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16413 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16414 if (!UsedInserts.insert(VU).second)
16415 continue;
16416 // Need to use original vector, if the root is truncated.
16417 auto BWIt = MinBWs.find(E);
16418 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16419 auto *ScalarTy = FTy->getElementType();
16420 auto Key = std::make_pair(Vec, ScalarTy);
16421 auto VecIt = VectorCasts.find(Key);
16422 if (VecIt == VectorCasts.end()) {
16423 IRBuilderBase::InsertPointGuard Guard(Builder);
16424 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16425 if (IVec->getParent()->isLandingPad())
16426 Builder.SetInsertPoint(IVec->getParent(),
16427 std::next(IVec->getParent()
16428 ->getLandingPadInst()
16429 ->getIterator()));
16430 else
16431 Builder.SetInsertPoint(
16432 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16433 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16434 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16435 }
16436 Vec = Builder.CreateIntCast(
16437 Vec,
16438 getWidenedType(
16439 ScalarTy,
16440 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16441 BWIt->second.second);
16442 VectorCasts.try_emplace(Key, Vec);
16443 } else {
16444 Vec = VecIt->second;
16445 }
16446 }
16447
16448 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16449 if (InsertIdx) {
16450 auto *It = find_if(
16451 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16452 // Checks if 2 insertelements are from the same buildvector.
16453 InsertElementInst *VecInsert = Data.InsertElements.front();
16454 return areTwoInsertFromSameBuildVector(
16455 VU, VecInsert,
16456 [](InsertElementInst *II) { return II->getOperand(0); });
16457 });
16458 unsigned Idx = *InsertIdx;
16459 if (It == ShuffledInserts.end()) {
16460 (void)ShuffledInserts.emplace_back();
16461 It = std::next(ShuffledInserts.begin(),
16462 ShuffledInserts.size() - 1);
16463 }
16464 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16465 if (Mask.empty())
16466 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16467 Mask[Idx] = ExternalUse.Lane;
16468 It->InsertElements.push_back(cast<InsertElementInst>(User));
16469 continue;
16470 }
16471 }
16472 }
16473 }
16474
16475 // Generate extracts for out-of-tree users.
16476 // Find the insertion point for the extractelement lane.
16477 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16478 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16479 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16480 if (PH->getIncomingValue(I) == Scalar) {
16481 Instruction *IncomingTerminator =
16482 PH->getIncomingBlock(I)->getTerminator();
16483 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16484 Builder.SetInsertPoint(VecI->getParent(),
16485 std::next(VecI->getIterator()));
16486 } else {
16487 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16488 }
16489 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16490 PH->setOperand(I, NewInst);
16491 }
16492 }
16493 } else {
16494 Builder.SetInsertPoint(cast<Instruction>(User));
16495 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16496 User->replaceUsesOfWith(Scalar, NewInst);
16497 }
16498 } else {
16499 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16500 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16501 User->replaceUsesOfWith(Scalar, NewInst);
16502 }
16503
16504 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16505 }
16506
16507 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16508 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16509 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16510 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16511 for (int I = 0, E = Mask.size(); I < E; ++I) {
16512 if (Mask[I] < VF)
16513 CombinedMask1[I] = Mask[I];
16514 else
16515 CombinedMask2[I] = Mask[I] - VF;
16516 }
16517 ShuffleInstructionBuilder ShuffleBuilder(
16518 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16519 ShuffleBuilder.add(V1, CombinedMask1);
16520 if (V2)
16521 ShuffleBuilder.add(V2, CombinedMask2);
16522 return ShuffleBuilder.finalize({}, {}, {});
16523 };
16524
16525 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16526 bool ForSingleMask) {
16527 unsigned VF = Mask.size();
16528 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16529 if (VF != VecVF) {
16530 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16531 Vec = CreateShuffle(Vec, nullptr, Mask);
16532 return std::make_pair(Vec, true);
16533 }
16534 if (!ForSingleMask) {
16535 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16536 for (unsigned I = 0; I < VF; ++I) {
16537 if (Mask[I] != PoisonMaskElem)
16538 ResizeMask[Mask[I]] = Mask[I];
16539 }
16540 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16541 }
16542 }
16543
16544 return std::make_pair(Vec, false);
16545 };
16546 // Perform shuffling of the vectorize tree entries for better handling of
16547 // external extracts.
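 // Note: each ShuffledInserts entry groups the insertelement chain of one
 // buildvector; its ValueMasks record, for every source vector, which lane
 // feeds each insert position, so the chain can be rebuilt below from
 // shuffles instead of per-element extracts.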
16548 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16549 // Find the first and the last instruction in the list of insertelements.
16550 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16551 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16552 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16553 Builder.SetInsertPoint(LastInsert);
16554 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16555 Value *NewInst = performExtractsShuffleAction<Value>(
16556 MutableArrayRef(Vector.data(), Vector.size()),
16557 FirstInsert->getOperand(0),
16558 [](Value *Vec) {
16559 return cast<VectorType>(Vec->getType())
16560 ->getElementCount()
16561 .getKnownMinValue();
16562 },
16563 ResizeToVF,
16564 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16565 ArrayRef<Value *> Vals) {
16566 assert((Vals.size() == 1 || Vals.size() == 2) &&
16567 "Expected exactly 1 or 2 input values.");
16568 if (Vals.size() == 1) {
16569 // Do not create shuffle if the mask is a simple identity
16570 // non-resizing mask.
16571 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16572 ->getNumElements() ||
16573 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16574 return CreateShuffle(Vals.front(), nullptr, Mask);
16575 return Vals.front();
16576 }
16577 return CreateShuffle(Vals.front() ? Vals.front()
16578 : FirstInsert->getOperand(0),
16579 Vals.back(), Mask);
16580 });
16581 auto It = ShuffledInserts[I].InsertElements.rbegin();
16582 // Rebuild buildvector chain.
16583 InsertElementInst *II = nullptr;
16584 if (It != ShuffledInserts[I].InsertElements.rend())
16585 II = *It;
16586 SmallVector<Instruction *> Inserts;
16587 while (It != ShuffledInserts[I].InsertElements.rend()) {
16588 assert(II && "Must be an insertelement instruction.");
16589 if (*It == II)
16590 ++It;
16591 else
16592 Inserts.push_back(cast<Instruction>(II));
16593 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16594 }
16595 for (Instruction *II : reverse(Inserts)) {
16596 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16597 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16598 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16599 II->moveAfter(NewI);
16600 NewInst = II;
16601 }
16602 LastInsert->replaceAllUsesWith(NewInst);
16603 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16604 IE->replaceUsesOfWith(IE->getOperand(0),
16605 PoisonValue::get(IE->getOperand(0)->getType()));
16606 IE->replaceUsesOfWith(IE->getOperand(1),
16607 PoisonValue::get(IE->getOperand(1)->getType()));
16608 eraseInstruction(IE);
16609 }
16610 CSEBlocks.insert(LastInsert->getParent());
16611 }
16612
16613 SmallVector<Instruction *> RemovedInsts;
16614 // For each vectorized value:
16615 for (auto &TEPtr : VectorizableTree) {
16616 TreeEntry *Entry = TEPtr.get();
16617
16618 // No need to handle users of gathered values.
16619 if (Entry->isGather())
16620 continue;
16621
16622 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16623
16624 // For each lane:
16625 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16626 Value *Scalar = Entry->Scalars[Lane];
16627
16628 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16629 !isa<GetElementPtrInst>(Scalar))
16630 continue;
16631 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16632 EE && IgnoredExtracts.contains(EE))
16633 continue;
16634 if (isa<PoisonValue>(Scalar))
16635 continue;
16636#ifndef NDEBUG
16637 Type *Ty = Scalar->getType();
16638 if (!Ty->isVoidTy()) {
16639 for (User *U : Scalar->users()) {
16640 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16641
16642 // It is legal to delete users in the ignorelist.
16643 assert((getTreeEntry(U) ||
16644 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16645 (isa_and_nonnull<Instruction>(U) &&
16646 isDeleted(cast<Instruction>(U)))) &&
16647 "Deleting out-of-tree value");
16648 }
16649 }
16650#endif
16651 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16652 auto *I = cast<Instruction>(Scalar);
16653 RemovedInsts.push_back(I);
16654 }
16655 }
16656
16657 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16658 // new vector instruction.
16659 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16660 V->mergeDIAssignID(RemovedInsts);
16661
16662 // Clear up reduction references, if any.
16663 if (UserIgnoreList) {
16664 for (Instruction *I : RemovedInsts) {
16665 const TreeEntry *IE = getTreeEntry(I);
16666 if (IE->Idx != 0 &&
16667 !(VectorizableTree.front()->isGather() &&
16668 !IE->UserTreeIndices.empty() &&
16669 (ValueToGatherNodes.lookup(I).contains(
16670 VectorizableTree.front().get()) ||
16671 any_of(IE->UserTreeIndices,
16672 [&](const EdgeInfo &EI) {
16673 return EI.UserTE == VectorizableTree.front().get() &&
16674 EI.EdgeIdx == UINT_MAX;
16675 }))) &&
16676 !(GatheredLoadsEntriesFirst.has_value() &&
16677 IE->Idx >= *GatheredLoadsEntriesFirst &&
16678 VectorizableTree.front()->isGather() &&
16679 is_contained(VectorizableTree.front()->Scalars, I)))
16680 continue;
16681 SmallVector<SelectInst *> LogicalOpSelects;
16682 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16683 // Do not replace condition of the logical op in form select <cond>.
16684 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16685 (match(U.getUser(), m_LogicalAnd()) ||
16686 match(U.getUser(), m_LogicalOr())) &&
16687 U.getOperandNo() == 0;
16688 if (IsPoisoningLogicalOp) {
16689 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16690 return false;
16691 }
16692 return UserIgnoreList->contains(U.getUser());
16693 });
16694 // Replace conditions of the poisoning logical ops with the non-poison
16695 // constant value.
16696 for (SelectInst *SI : LogicalOpSelects)
16697 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16698 }
16699 }
16700 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16701 // cache correctness.
16702 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
16703 // - instructions are not deleted until later.
16704 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16705
16706 Builder.ClearInsertionPoint();
16707 InstrElementSize.clear();
16708
16709 const TreeEntry &RootTE = *VectorizableTree.front();
16710 Value *Vec = RootTE.VectorizedValue;
16711 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16712 It != MinBWs.end() &&
16713 ReductionBitWidth != It->second.first) {
16714 IRBuilder<>::InsertPointGuard Guard(Builder);
16715 Builder.SetInsertPoint(ReductionRoot->getParent(),
16716 ReductionRoot->getIterator());
16717 Vec = Builder.CreateIntCast(
16718 Vec,
16719 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16720 cast<VectorType>(Vec->getType())->getElementCount()),
16721 It->second.second);
16722 }
16723 return Vec;
16724}
16725
16726 void BoUpSLP::optimizeGatherSequence() {
16727 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16728 << " gather sequences instructions.\n");
16729 // LICM InsertElementInst sequences.
16730 for (Instruction *I : GatherShuffleExtractSeq) {
16731 if (isDeleted(I))
16732 continue;
16733
16734 // Check if this block is inside a loop.
16735 Loop *L = LI->getLoopFor(I->getParent());
16736 if (!L)
16737 continue;
16738
16739 // Check if it has a preheader.
16740 BasicBlock *PreHeader = L->getLoopPreheader();
16741 if (!PreHeader)
16742 continue;
16743
16744 // If the vector or the element that we insert into it are
16745 // instructions that are defined inside this loop then we can't
16746 // hoist this instruction.
16747 if (any_of(I->operands(), [L](Value *V) {
16748 auto *OpI = dyn_cast<Instruction>(V);
16749 return OpI && L->contains(OpI);
16750 }))
16751 continue;
16752
16753 // We can hoist this instruction. Move it to the pre-header.
16754 I->moveBefore(PreHeader->getTerminator());
16755 CSEBlocks.insert(PreHeader);
16756 }
16757
16758 // Make a list of all reachable blocks in our CSE queue.
16759 SmallVector<const DomTreeNode *, 8> CSEWorkList;
16760 CSEWorkList.reserve(CSEBlocks.size());
16761 for (BasicBlock *BB : CSEBlocks)
16762 if (DomTreeNode *N = DT->getNode(BB)) {
16763 assert(N->getBlock() == BB);
16764 CSEWorkList.push_back(N);
16765 }
16766
16767 // Sort blocks by domination. This ensures we visit a block after all blocks
16768 // dominating it are visited.
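 // Note: because dominating blocks come first, an instruction remembered in
 // the Visited list below tends to dominate the candidates examined later;
 // the explicit DT->dominates() checks still guard every replacement.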
16769 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16770 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16771 "Different nodes should have different DFS numbers");
16772 return A->getDFSNumIn() < B->getDFSNumIn();
16773 });
16774
16775 // Less defined shuffles can be replaced by the more defined copies.
16776 // Between two shuffles one is less defined if it has the same vector operands
16777 // and its mask indices are the same as in the first one or undefs. E.g.
16778 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
16779 // poison, <0, 0, 0, 0>.
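 // For the example above, IsIdenticalOrLessDefined merges the two masks into
 // NewMask = <0, 0, 0, 0>; the less defined shuffle is then replaced by the
 // more defined one, and the survivor keeps the merged mask.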
16780 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
16781 SmallVectorImpl<int> &NewMask) {
16782 if (I1->getType() != I2->getType())
16783 return false;
16784 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
16785 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
16786 if (!SI1 || !SI2)
16787 return I1->isIdenticalTo(I2);
16788 if (SI1->isIdenticalTo(SI2))
16789 return true;
16790 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
16791 if (SI1->getOperand(I) != SI2->getOperand(I))
16792 return false;
16793 // Check if the second instruction is more defined than the first one.
16794 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
16795 ArrayRef<int> SM1 = SI1->getShuffleMask();
16796 // Count trailing undefs in the mask to check the final number of used
16797 // registers.
16798 unsigned LastUndefsCnt = 0;
16799 for (int I = 0, E = NewMask.size(); I < E; ++I) {
16800 if (SM1[I] == PoisonMaskElem)
16801 ++LastUndefsCnt;
16802 else
16803 LastUndefsCnt = 0;
16804 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
16805 NewMask[I] != SM1[I])
16806 return false;
16807 if (NewMask[I] == PoisonMaskElem)
16808 NewMask[I] = SM1[I];
16809 }
16810 // Check if the last undefs actually change the final number of used vector
16811 // registers.
16812 return SM1.size() - LastUndefsCnt > 1 &&
16813 TTI->getNumberOfParts(SI1->getType()) ==
16814 TTI->getNumberOfParts(
16815 getWidenedType(SI1->getType()->getElementType(),
16816 SM1.size() - LastUndefsCnt));
16817 };
16818 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
16819 // instructions. TODO: We can further optimize this scan if we split the
16820 // instructions into different buckets based on the insert lane.
16821 SmallVector<Instruction *, 16> Visited;
16822 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
16823 assert(*I &&
16824 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
16825 "Worklist not sorted properly!");
16826 BasicBlock *BB = (*I)->getBlock();
16827 // For all instructions in blocks containing gather sequences:
16828 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
16829 if (isDeleted(&In))
16830 continue;
16831 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
16832 !GatherShuffleExtractSeq.contains(&In))
16833 continue;
16834
16835 // Check if we can replace this instruction with any of the
16836 // visited instructions.
16837 bool Replaced = false;
16838 for (Instruction *&V : Visited) {
16839 SmallVector<int> NewMask;
16840 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
16841 DT->dominates(V->getParent(), In.getParent())) {
16842 In.replaceAllUsesWith(V);
16843 eraseInstruction(&In);
16844 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
16845 if (!NewMask.empty())
16846 SI->setShuffleMask(NewMask);
16847 Replaced = true;
16848 break;
16849 }
16850 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
16851 GatherShuffleExtractSeq.contains(V) &&
16852 IsIdenticalOrLessDefined(V, &In, NewMask) &&
16853 DT->dominates(In.getParent(), V->getParent())) {
16854 In.moveAfter(V);
16855 V->replaceAllUsesWith(&In);
16856 eraseInstruction(V);
16857 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
16858 if (!NewMask.empty())
16859 SI->setShuffleMask(NewMask);
16860 V = &In;
16861 Replaced = true;
16862 break;
16863 }
16864 }
16865 if (!Replaced) {
16866 assert(!is_contained(Visited, &In));
16867 Visited.push_back(&In);
16868 }
16869 }
16870 }
16871 CSEBlocks.clear();
16872 GatherShuffleExtractSeq.clear();
16873}
16874
16875BoUpSLP::ScheduleData *
16876BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
16877 ScheduleData *Bundle = nullptr;
16878 ScheduleData *PrevInBundle = nullptr;
16879 for (Value *V : VL) {
16880 if (doesNotNeedToBeScheduled(V))
16881 continue;
16882 ScheduleData *BundleMember = getScheduleData(V);
16883 assert(BundleMember &&
16884 "no ScheduleData for bundle member "
16885 "(maybe not in same basic block)");
16886 assert(BundleMember->isSchedulingEntity() &&
16887 "bundle member already part of other bundle");
16888 if (PrevInBundle) {
16889 PrevInBundle->NextInBundle = BundleMember;
16890 } else {
16891 Bundle = BundleMember;
16892 }
16893
16894 // Group the instructions to a bundle.
16895 BundleMember->FirstInBundle = Bundle;
16896 PrevInBundle = BundleMember;
16897 }
16898 assert(Bundle && "Failed to find schedule bundle");
16899 return Bundle;
16900}
16901
16902// Groups the instructions to a bundle (which is then a single scheduling entity)
16903// and schedules instructions until the bundle gets ready.
16904std::optional<BoUpSLP::ScheduleData *>
16905BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
16906 const InstructionsState &S) {
16907 // No need to schedule PHIs, insertelement, extractelement and extractvalue
16908 // instructions.
16909 if (isa<PHINode>(S.getMainOp()) ||
16910 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
16911 return nullptr;
16912
16913 // Initialize the instruction bundle.
16914 Instruction *OldScheduleEnd = ScheduleEnd;
16915 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
16916
16917 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
16918 ScheduleData *Bundle) {
16919 // The scheduling region got new instructions at the lower end (or it is a
16920 // new region for the first bundle). This makes it necessary to
16921 // recalculate all dependencies.
16922 // It is seldom that this needs to be done a second time after adding the
16923 // initial bundle to the region.
16924 if (ScheduleEnd != OldScheduleEnd) {
16925 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
16926 if (ScheduleData *SD = getScheduleData(I))
16927 SD->clearDependencies();
16928 ReSchedule = true;
16929 }
16930 if (Bundle) {
16931 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
16932 << " in block " << BB->getName() << "\n");
16933 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
16934 }
16935
16936 if (ReSchedule) {
16937 resetSchedule();
16938 initialFillReadyList(ReadyInsts);
16939 }
16940
16941 // Now try to schedule the new bundle or (if no bundle) just calculate
16942 // dependencies. As soon as the bundle is "ready" it means that there are no
16943 // cyclic dependencies and we can schedule it. Note that it's important that we
16944 // don't "schedule" the bundle yet (see cancelScheduling).
16945 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
16946 !ReadyInsts.empty()) {
16947 ScheduleData *Picked = ReadyInsts.pop_back_val();
16948 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
16949 "must be ready to schedule");
16950 schedule(Picked, ReadyInsts);
16951 }
16952 };
16953
16954 // Make sure that the scheduling region contains all
16955 // instructions of the bundle.
16956 for (Value *V : VL) {
16957 if (doesNotNeedToBeScheduled(V))
16958 continue;
16959 if (!extendSchedulingRegion(V, S)) {
16960 // If the scheduling region got new instructions at the lower end (or it
16961 // is a new region for the first bundle), it is necessary to
16962 // recalculate all dependencies.
16963 // Otherwise the compiler may crash trying to incorrectly calculate
16964 // dependencies and emit instruction in the wrong order at the actual
16965 // scheduling.
16966 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
16967 return std::nullopt;
16968 }
16969 }
16970
16971 bool ReSchedule = false;
16972 for (Value *V : VL) {
16973 if (doesNotNeedToBeScheduled(V))
16974 continue;
16975 ScheduleData *BundleMember = getScheduleData(V);
16976 assert(BundleMember &&
16977 "no ScheduleData for bundle member (maybe not in same basic block)");
16978
16979 // Make sure we don't leave the pieces of the bundle in the ready list when
16980 // whole bundle might not be ready.
16981 ReadyInsts.remove(BundleMember);
16982
16983 if (!BundleMember->IsScheduled)
16984 continue;
16985 // A bundle member was scheduled as single instruction before and now
16986 // needs to be scheduled as part of the bundle. We just get rid of the
16987 // existing schedule.
16988 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
16989 << " was already scheduled\n");
16990 ReSchedule = true;
16991 }
16992
16993 auto *Bundle = buildBundle(VL);
16994 TryScheduleBundleImpl(ReSchedule, Bundle);
16995 if (!Bundle->isReady()) {
16996 cancelScheduling(VL, S.getMainOp());
16997 return std::nullopt;
16998 }
16999 return Bundle;
17000}
17001
17002void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17003 Value *OpValue) {
17004 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17005 doesNotNeedToBeScheduled(OpValue))
17006 return;
17007
17008 if (doesNotNeedToBeScheduled(OpValue))
17009 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17010 ScheduleData *Bundle = getScheduleData(OpValue);
17011 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17012 assert(!Bundle->IsScheduled &&
17013 "Can't cancel bundle which is already scheduled");
17014 assert(Bundle->isSchedulingEntity() &&
17015 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17016 "tried to unbundle something which is not a bundle");
17017
17018 // Remove the bundle from the ready list.
17019 if (Bundle->isReady())
17020 ReadyInsts.remove(Bundle);
17021
17022 // Un-bundle: make single instructions out of the bundle.
17023 ScheduleData *BundleMember = Bundle;
17024 while (BundleMember) {
17025 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17026 BundleMember->FirstInBundle = BundleMember;
17027 ScheduleData *Next = BundleMember->NextInBundle;
17028 BundleMember->NextInBundle = nullptr;
17029 BundleMember->TE = nullptr;
17030 if (BundleMember->unscheduledDepsInBundle() == 0) {
17031 ReadyInsts.insert(BundleMember);
17032 }
17033 BundleMember = Next;
17034 }
17035}
17036
17037BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17038 // Allocate a new ScheduleData for the instruction.
17039 if (ChunkPos >= ChunkSize) {
17040 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17041 ChunkPos = 0;
17042 }
17043 return &(ScheduleDataChunks.back()[ChunkPos++]);
17044}
17045
17046bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17047 Value *V, const InstructionsState &S) {
17048 Instruction *I = dyn_cast<Instruction>(V);
17049 assert(I && "bundle member must be an instruction");
17050 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17051 !doesNotNeedToBeScheduled(I) &&
17052 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17053 "be scheduled");
17054 if (getScheduleData(I))
17055 return true;
17056 if (!ScheduleStart) {
17057 // It's the first instruction in the new region.
17058 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17059 ScheduleStart = I;
17060 ScheduleEnd = I->getNextNode();
17061 assert(ScheduleEnd && "tried to vectorize a terminator?");
17062 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17063 return true;
17064 }
17065 // Search up and down at the same time, because we don't know if the new
17066 // instruction is above or below the existing scheduling region.
17067 // Ignore debug info (and other "AssumeLike" intrinsics) so they are not counted
17068 // against the budget. Otherwise debug info could affect codegen.
17069 BasicBlock::reverse_iterator UpIter =
17070 ++ScheduleStart->getIterator().getReverse();
17071 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17072 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17073 BasicBlock::iterator LowerEnd = BB->end();
17074 auto IsAssumeLikeIntr = [](const Instruction &I) {
17075 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17076 return II->isAssumeLikeIntrinsic();
17077 return false;
17078 };
17079 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17080 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17081 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17082 &*DownIter != I) {
17083 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17084 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17085 return false;
17086 }
17087
17088 ++UpIter;
17089 ++DownIter;
17090
17091 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17092 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17093 }
17094 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17095 assert(I->getParent() == ScheduleStart->getParent() &&
17096 "Instruction is in wrong basic block.");
17097 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17098 ScheduleStart = I;
17099 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17100 << "\n");
17101 return true;
17102 }
17103 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17104 "Expected to reach top of the basic block or instruction down the "
17105 "lower end.");
17106 assert(I->getParent() == ScheduleEnd->getParent() &&
17107 "Instruction is in wrong basic block.");
17108 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17109 nullptr);
17110 ScheduleEnd = I->getNextNode();
17111 assert(ScheduleEnd && "tried to vectorize a terminator?");
17112 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17113 return true;
17114}
17115
17116void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17117 Instruction *ToI,
17118 ScheduleData *PrevLoadStore,
17119 ScheduleData *NextLoadStore) {
17120 ScheduleData *CurrentLoadStore = PrevLoadStore;
17121 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17122 // No need to allocate data for non-schedulable instructions.
17123 if (doesNotNeedToBeScheduled(I))
17124 continue;
17125 ScheduleData *SD = ScheduleDataMap.lookup(I);
17126 if (!SD) {
17127 SD = allocateScheduleDataChunks();
17128 ScheduleDataMap[I] = SD;
17129 }
17130 assert(!isInSchedulingRegion(SD) &&
17131 "new ScheduleData already in scheduling region");
17132 SD->init(SchedulingRegionID, I);
17133
17134 if (I->mayReadOrWriteMemory() &&
17135 (!isa<IntrinsicInst>(I) ||
17136 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17137 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17138 Intrinsic::pseudoprobe))) {
17139 // Update the linked list of memory accessing instructions.
17140 if (CurrentLoadStore) {
17141 CurrentLoadStore->NextLoadStore = SD;
17142 } else {
17143 FirstLoadStoreInRegion = SD;
17144 }
17145 CurrentLoadStore = SD;
17146 }
17147
17148 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17149 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17150 RegionHasStackSave = true;
17151 }
17152 if (NextLoadStore) {
17153 if (CurrentLoadStore)
17154 CurrentLoadStore->NextLoadStore = NextLoadStore;
17155 } else {
17156 LastLoadStoreInRegion = CurrentLoadStore;
17157 }
17158}
17159
17160void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17161 bool InsertInReadyList,
17162 BoUpSLP *SLP) {
17163 assert(SD->isSchedulingEntity());
17164
17165 SmallVector<ScheduleData *> WorkList;
17166 WorkList.push_back(SD);
17167
17168 while (!WorkList.empty()) {
17169 ScheduleData *SD = WorkList.pop_back_val();
17170 for (ScheduleData *BundleMember = SD; BundleMember;
17171 BundleMember = BundleMember->NextInBundle) {
17172 assert(isInSchedulingRegion(BundleMember));
17173 if (BundleMember->hasValidDependencies())
17174 continue;
17175
17176 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17177 << "\n");
17178 BundleMember->Dependencies = 0;
17179 BundleMember->resetUnscheduledDeps();
17180
17181 // Handle def-use chain dependencies.
17182 for (User *U : BundleMember->Inst->users()) {
17183 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17184 BundleMember->Dependencies++;
17185 ScheduleData *DestBundle = UseSD->FirstInBundle;
17186 if (!DestBundle->IsScheduled)
17187 BundleMember->incrementUnscheduledDeps(1);
17188 if (!DestBundle->hasValidDependencies())
17189 WorkList.push_back(DestBundle);
17190 }
17191 }
17192
17193 auto MakeControlDependent = [&](Instruction *I) {
17194 auto *DepDest = getScheduleData(I);
17195 assert(DepDest && "must be in schedule window");
17196 DepDest->ControlDependencies.push_back(BundleMember);
17197 BundleMember->Dependencies++;
17198 ScheduleData *DestBundle = DepDest->FirstInBundle;
17199 if (!DestBundle->IsScheduled)
17200 BundleMember->incrementUnscheduledDeps(1);
17201 if (!DestBundle->hasValidDependencies())
17202 WorkList.push_back(DestBundle);
17203 };
17204
17205 // Any instruction which isn't safe to speculate at the beginning of the
17206 // block is control dependent on any early exit or non-willreturn call
17207 // which precedes it.
17208 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17209 for (Instruction *I = BundleMember->Inst->getNextNode();
17210 I != ScheduleEnd; I = I->getNextNode()) {
17211 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17212 continue;
17213
17214 // Add the dependency
17215 MakeControlDependent(I);
17216
17217 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17218 // Everything past here must be control dependent on I.
17219 break;
17220 }
17221 }
17222
17223 if (RegionHasStackSave) {
17224 // If we have an inalloca alloca instruction, it needs to be scheduled
17225 // after any preceding stacksave. We also need to prevent any alloca
17226 // from reordering above a preceding stackrestore.
17227 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17228 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17229 for (Instruction *I = BundleMember->Inst->getNextNode();
17230 I != ScheduleEnd; I = I->getNextNode()) {
17231 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17232 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17233 // Any allocas past here must be control dependent on I, and I
17234 // must be memory dependent on BundleMember->Inst.
17235 break;
17236
17237 if (!isa<AllocaInst>(I))
17238 continue;
17239
17240 // Add the dependency
17241 MakeControlDependent(I);
17242 }
17243 }
17244
17245 // In addition to the cases handled just above, we need to prevent
17246 // allocas and loads/stores from moving below a stacksave or a
17247 // stackrestore. Preventing allocas from moving below a stackrestore is
17248 // currently thought to be conservative. Moving loads/stores below a
17249 // stackrestore can lead to incorrect code.
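 // For example, a load or store of stack memory that is released by the
 // stackrestore must not sink below it, since that memory may be reused
 // afterwards.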
17250 if (isa<AllocaInst>(BundleMember->Inst) ||
17251 BundleMember->Inst->mayReadOrWriteMemory()) {
17252 for (Instruction *I = BundleMember->Inst->getNextNode();
17253 I != ScheduleEnd; I = I->getNextNode()) {
17254 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17255 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17256 continue;
17257
17258 // Add the dependency
17259 MakeControlDependent(I);
17260 break;
17261 }
17262 }
17263 }
17264
17265 // Handle the memory dependencies (if any).
17266 ScheduleData *DepDest = BundleMember->NextLoadStore;
17267 if (!DepDest)
17268 continue;
17269 Instruction *SrcInst = BundleMember->Inst;
17270 assert(SrcInst->mayReadOrWriteMemory() &&
17271 "NextLoadStore list for non-memory-affecting bundle?");
17272 MemoryLocation SrcLoc = getLocation(SrcInst);
17273 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17274 unsigned NumAliased = 0;
17275 unsigned DistToSrc = 1;
17276
17277 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17278 assert(isInSchedulingRegion(DepDest));
17279
17280 // We have two limits to reduce the complexity:
17281 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17282 // SLP->isAliased (which is the expensive part in this loop).
17283 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17284 // the whole loop (even if the loop is fast, it's quadratic).
17285 // It's important for the loop break condition (see below) to
17286 // check this limit even between two read-only instructions.
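 // In effect, once AliasedCheckLimit aliased accesses have been recorded
 // for this source instruction, the remaining accesses that may involve a
 // write are made dependent without querying the alias analysis again.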
17287 if (DistToSrc >= MaxMemDepDistance ||
17288 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17289 (NumAliased >= AliasedCheckLimit ||
17290 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17291
17292 // We increment the counter only if the locations are aliased
17293 // (instead of counting all alias checks). This gives a better
17294 // balance between reduced runtime and accurate dependencies.
17295 NumAliased++;
17296
17297 DepDest->MemoryDependencies.push_back(BundleMember);
17298 BundleMember->Dependencies++;
17299 ScheduleData *DestBundle = DepDest->FirstInBundle;
17300 if (!DestBundle->IsScheduled) {
17301 BundleMember->incrementUnscheduledDeps(1);
17302 }
17303 if (!DestBundle->hasValidDependencies()) {
17304 WorkList.push_back(DestBundle);
17305 }
17306 }
17307
17308 // Example, explaining the loop break condition: Let's assume our
17309 // starting instruction is i0 and MaxMemDepDistance = 3.
17310 //
17311 // +--------v--v--v
17312 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17313 // +--------^--^--^
17314 //
17315 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17316 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17317 // Previously we already added dependencies from i3 to i6,i7,i8
17318 // (because of MaxMemDepDistance). As we added a dependency from
17319 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17320 // and we can abort this loop at i6.
17321 if (DistToSrc >= 2 * MaxMemDepDistance)
17322 break;
17323 DistToSrc++;
17324 }
17325 }
17326 if (InsertInReadyList && SD->isReady()) {
17327 ReadyInsts.insert(SD);
17328 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17329 << "\n");
17330 }
17331 }
17332}
17333
17334void BoUpSLP::BlockScheduling::resetSchedule() {
17335 assert(ScheduleStart &&
17336 "tried to reset schedule on block which has not been scheduled");
17337 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17338 if (ScheduleData *SD = getScheduleData(I)) {
17339 assert(isInSchedulingRegion(SD) &&
17340 "ScheduleData not in scheduling region");
17341 SD->IsScheduled = false;
17342 SD->resetUnscheduledDeps();
17343 }
17344 }
17345 ReadyInsts.clear();
17346}
17347
17348void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17349 if (!BS->ScheduleStart)
17350 return;
17351
17352 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17353
17354 // A key point - if we got here, pre-scheduling was able to find a valid
17355 // scheduling of the sub-graph of the scheduling window which consists
17356 // of all vector bundles and their transitive users. As such, we do not
17357 // need to reschedule anything *outside of* that subgraph.
17358
17359 BS->resetSchedule();
17360
17361 // For the real scheduling we use a more sophisticated ready-list: it is
17362 // sorted by the original instruction location. This lets the final schedule
17363 // be as close as possible to the original instruction order.
17364 // WARNING: If changing this order causes a correctness issue, that means
17365 // there is some missing dependence edge in the schedule data graph.
17366 struct ScheduleDataCompare {
17367 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17368 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17369 }
17370 };
17371 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17372
17373 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17374 // and fill the ready-list with initial instructions.
17375 int Idx = 0;
17376 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17377 I = I->getNextNode()) {
17378 if (ScheduleData *SD = BS->getScheduleData(I)) {
17379 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17380 (void)SDTE;
17381 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17382 SD->isPartOfBundle() ==
17383 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17384 "scheduler and vectorizer bundle mismatch");
17385 SD->FirstInBundle->SchedulingPriority = Idx++;
17386
17387 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17388 BS->calculateDependencies(SD, false, this);
17389 }
17390 }
17391 BS->initialFillReadyList(ReadyInsts);
17392
17393 Instruction *LastScheduledInst = BS->ScheduleEnd;
17394
17395 // Do the "real" scheduling.
17396 while (!ReadyInsts.empty()) {
17397 ScheduleData *Picked = *ReadyInsts.begin();
17398 ReadyInsts.erase(ReadyInsts.begin());
17399
17400 // Move the scheduled instruction(s) to their dedicated places, if not
17401 // there yet.
17402 for (ScheduleData *BundleMember = Picked; BundleMember;
17403 BundleMember = BundleMember->NextInBundle) {
17404 Instruction *PickedInst = BundleMember->Inst;
17405 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17406 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17407 LastScheduledInst = PickedInst;
17408 }
17409
17410 BS->schedule(Picked, ReadyInsts);
17411 }
17412
17413 // Check that we didn't break any of our invariants.
17414#ifdef EXPENSIVE_CHECKS
17415 BS->verify();
17416#endif
17417
17418#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17419 // Check that all schedulable entities got scheduled
17420 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17421 ScheduleData *SD = BS->getScheduleData(I);
17422 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17423 assert(SD->IsScheduled && "must be scheduled at this point");
17424 }
17425#endif
17426
17427 // Avoid duplicate scheduling of the block.
17428 BS->ScheduleStart = nullptr;
17429}
17430
17431 unsigned BoUpSLP::getVectorElementSize(Value *V) {
17432 // If V is a store, just return the width of the stored value (or value
17433 // truncated just before storing) without traversing the expression tree.
17434 // This is the common case.
17435 if (auto *Store = dyn_cast<StoreInst>(V))
17436 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17437
17438 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17439 return getVectorElementSize(IEI->getOperand(1));
17440
17441 auto E = InstrElementSize.find(V);
17442 if (E != InstrElementSize.end())
17443 return E->second;
17444
17445 // If V is not a store, we can traverse the expression tree to find loads
17446 // that feed it. The type of the loaded value may indicate a more suitable
17447 // width than V's type. We want to base the vector element size on the width
17448 // of memory operations where possible.
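 // For example, if V is an i32 add fed (through casts) by i8 loads, the
 // element size used for vectorization decisions is 8 bits rather than 32.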
17449 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17450 SmallPtrSet<Instruction *, 16> Visited;
17451 if (auto *I = dyn_cast<Instruction>(V)) {
17452 Worklist.emplace_back(I, I->getParent(), 0);
17453 Visited.insert(I);
17454 }
17455
17456 // Traverse the expression tree in bottom-up order looking for loads. If we
17457 // encounter an instruction we don't yet handle, we give up.
17458 auto Width = 0u;
17459 Value *FirstNonBool = nullptr;
17460 while (!Worklist.empty()) {
17461 auto [I, Parent, Level] = Worklist.pop_back_val();
17462
17463 // We should only be looking at scalar instructions here. If the current
17464 // instruction has a vector type, skip.
17465 auto *Ty = I->getType();
17466 if (isa<VectorType>(Ty))
17467 continue;
17468 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17469 FirstNonBool = I;
17470 if (Level > RecursionMaxDepth)
17471 continue;
17472
17473 // If the current instruction is a load, update Width to reflect the
17474 // width of the loaded value.
17475 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17476 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17477
17478 // Otherwise, we need to visit the operands of the instruction. We only
17479 // handle the interesting cases from buildTree here. If an operand is an
17480 // instruction we haven't yet visited and from the same basic block as the
17481 // user or the use is a PHI node, we add it to the worklist.
17482 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
17483 BinaryOperator, UnaryOperator>(I)) {
17484 for (Use &U : I->operands()) {
17485 if (auto *J = dyn_cast<Instruction>(U.get()))
17486 if (Visited.insert(J).second &&
17487 (isa<PHINode>(I) || J->getParent() == Parent)) {
17488 Worklist.emplace_back(J, J->getParent(), Level + 1);
17489 continue;
17490 }
17491 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17492 FirstNonBool = U.get();
17493 }
17494 } else {
17495 break;
17496 }
17497 }
17498
17499 // If we didn't encounter a memory access in the expression tree, or if we
17500 // gave up for some reason, just return the width of V. Otherwise, return the
17501 // maximum width we found.
17502 if (!Width) {
17503 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17504 V = FirstNonBool;
17505 Width = DL->getTypeSizeInBits(V->getType());
17506 }
17507
17508 for (Instruction *I : Visited)
17509 InstrElementSize[I] = Width;
17510
17511 return Width;
17512}
17513
17514bool BoUpSLP::collectValuesToDemote(
17515 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17516 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
17517 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17518 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17519 // We can always demote constants.
17520 if (all_of(E.Scalars, IsaPred<Constant>))
17521 return true;
17522
17523 unsigned OrigBitWidth =
17524 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17525 if (OrigBitWidth == BitWidth) {
17526 MaxDepthLevel = 1;
17527 return true;
17528 }
17529
17530 // Check if the node was analyzed already and must keep its original bitwidth.
17531 if (NodesToKeepBWs.contains(E.Idx))
17532 return false;
17533
17534 // If the value is not a vectorized instruction in the expression and not used
17535 // by the insertelement instruction and not used in multiple vector nodes, it
17536 // cannot be demoted.
17537 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17538 if (isa<PoisonValue>(R))
17539 return false;
17540 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17541 });
17542 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17543 if (isa<PoisonValue>(V))
17544 return true;
17545 if (MultiNodeScalars.contains(V))
17546 return false;
17547 // For the last shuffle of sext/zext with many uses we need to check the
17548 // extra bit for unsigned values, otherwise we may end up with incorrect
17549 // casts for the reused scalars.
17550 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17551 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17552 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17553 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17554 return true;
17555 }
17556 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17557 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17558 if (IsSignedNode)
17559 ++BitWidth1;
17560 if (auto *I = dyn_cast<Instruction>(V)) {
17561 APInt Mask = DB->getDemandedBits(I);
17562 unsigned BitWidth2 =
17563 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17564 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17565 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17566 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17567 break;
17568 BitWidth2 *= 2;
17569 }
17570 BitWidth1 = std::min(BitWidth1, BitWidth2);
17571 }
17572 BitWidth = std::max(BitWidth, BitWidth1);
17573 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17574 };
17575 using namespace std::placeholders;
17576 auto FinalAnalysis = [&]() {
17577 if (!IsProfitableToDemote)
17578 return false;
17579 bool Res = all_of(
17580 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17581 // Demote gathers.
17582 if (Res && E.isGather()) {
17583 // Check possible extractelement instructions bases and final vector
17584 // length.
17585 SmallPtrSet<Value *, 4> UniqueBases;
17586 for (Value *V : E.Scalars) {
17587 auto *EE = dyn_cast<ExtractElementInst>(V);
17588 if (!EE)
17589 continue;
17590 UniqueBases.insert(EE->getVectorOperand());
17591 }
17592 const unsigned VF = E.Scalars.size();
17593 Type *OrigScalarTy = E.Scalars.front()->getType();
17594 if (UniqueBases.size() <= 2 ||
17595 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17596 TTI->getNumberOfParts(getWidenedType(
17597 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17598 ToDemote.push_back(E.Idx);
17599 }
17600 return Res;
17601 };
17602 if (E.isGather() || !Visited.insert(&E).second ||
17603 any_of(E.Scalars, [&](Value *V) {
17604 return all_of(V->users(), [&](User *U) {
17605 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17606 });
17607 }))
17608 return FinalAnalysis();
17609
17610 if (any_of(E.Scalars, [&](Value *V) {
17611 return !all_of(V->users(), [=](User *U) {
17612 return getTreeEntry(U) ||
17613 (E.Idx == 0 && UserIgnoreList &&
17614 UserIgnoreList->contains(U)) ||
17615 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17616 !U->getType()->isScalableTy() &&
17617 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17618 }) && !IsPotentiallyTruncated(V, BitWidth);
17619 }))
17620 return false;
17621
17622 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17623 bool &NeedToExit) {
17624 NeedToExit = false;
17625 unsigned InitLevel = MaxDepthLevel;
17626 for (const TreeEntry *Op : Operands) {
17627 unsigned Level = InitLevel;
17628 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17629 ToDemote, Visited, NodesToKeepBWs, Level,
17630 IsProfitableToDemote, IsTruncRoot)) {
17631 if (!IsProfitableToDemote)
17632 return false;
17633 NeedToExit = true;
17634 if (!FinalAnalysis())
17635 return false;
17636 continue;
17637 }
17638 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17639 }
17640 return true;
17641 };
17642 auto AttemptCheckBitwidth =
17643 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17644 // Try all bitwidth < OrigBitWidth.
17645 NeedToExit = false;
17646 unsigned BestFailBitwidth = 0;
17647 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17648 if (Checker(BitWidth, OrigBitWidth))
17649 return true;
17650 if (BestFailBitwidth == 0 && FinalAnalysis())
17651 BestFailBitwidth = BitWidth;
17652 }
17653 if (BitWidth >= OrigBitWidth) {
17654 if (BestFailBitwidth == 0) {
17655 BitWidth = OrigBitWidth;
17656 return false;
17657 }
17658 MaxDepthLevel = 1;
17659 BitWidth = BestFailBitwidth;
17660 NeedToExit = true;
17661 return true;
17662 }
17663 return false;
17664 };
17665 auto TryProcessInstruction =
17666 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17667 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17668 if (Operands.empty()) {
17669 if (!IsTruncRoot)
17670 MaxDepthLevel = 1;
17671 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17672 std::ref(BitWidth)));
17673 } else {
17674 // Several vectorized uses? Check if we can truncate it, otherwise -
17675 // exit.
17676 if (E.UserTreeIndices.size() > 1 &&
17677 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17678 std::ref(BitWidth))))
17679 return false;
17680 bool NeedToExit = false;
17681 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17682 return false;
17683 if (NeedToExit)
17684 return true;
17685 if (!ProcessOperands(Operands, NeedToExit))
17686 return false;
17687 if (NeedToExit)
17688 return true;
17689 }
17690
17691 ++MaxDepthLevel;
17692 // Record the entry that we can demote.
17693 ToDemote.push_back(E.Idx);
17694 return IsProfitableToDemote;
17695 };
17696 switch (E.getOpcode()) {
17697
17698 // We can always demote truncations and extensions. Since truncations can
17699 // seed additional demotion, we save the truncated value.
17700 case Instruction::Trunc:
17701 if (IsProfitableToDemoteRoot)
17702 IsProfitableToDemote = true;
17703 return TryProcessInstruction(BitWidth);
17704 case Instruction::ZExt:
17705 case Instruction::SExt:
17706 IsProfitableToDemote = true;
17707 return TryProcessInstruction(BitWidth);
17708
17709 // We can demote certain binary operations if we can demote both of their
17710 // operands.
17711 case Instruction::Add:
17712 case Instruction::Sub:
17713 case Instruction::Mul:
17714 case Instruction::And:
17715 case Instruction::Or:
17716 case Instruction::Xor: {
17717 return TryProcessInstruction(
17718 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17719 }
17720 case Instruction::Freeze:
17721 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17722 case Instruction::Shl: {
17723 // If we are truncating the result of this SHL, and if it's a shift of an
17724 // in-range amount, we can always perform a SHL in a smaller type.
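 // For example, truncating (shl i32 %x, %amt) to i16 is safe whenever %amt
 // is known to be less than 16, which is what the known-bits check below
 // verifies.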
17725 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17726 return all_of(E.Scalars, [&](Value *V) {
17727 if (isa<PoisonValue>(V))
17728 return true;
17729 auto *I = cast<Instruction>(V);
17730 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17731 return AmtKnownBits.getMaxValue().ult(BitWidth);
17732 });
17733 };
17734 return TryProcessInstruction(
17735 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17736 }
17737 case Instruction::LShr: {
17738 // If this is a truncate of a logical shr, we can truncate it to a smaller
17739 // lshr iff we know that the bits we would otherwise be shifting in are
17740 // already zeros.
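 // For example, (lshr i32 %x, %amt) can be performed as i16 if %amt is known
 // to be less than 16 and bits 16..31 of %x are known to be zero, so no set
 // bits are shifted into the truncated result.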
17741 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17742 return all_of(E.Scalars, [&](Value *V) {
17743 if (isa<PoisonValue>(V))
17744 return true;
17745 auto *I = cast<Instruction>(V);
17746 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17747 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17748 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17749 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17750 SimplifyQuery(*DL));
17751 });
17752 };
17753 return TryProcessInstruction(
17754 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17755 LShrChecker);
17756 }
17757 case Instruction::AShr: {
17758 // If this is a truncate of an arithmetic shr, we can truncate it to a
17759 // smaller ashr iff we know that all the bits from the sign bit of the
17760 // original type down to the sign bit of the truncated type are sign-bit copies.
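 // For example, (ashr i32 %x, %amt) can be performed as i16 if %amt is known
 // to be less than 16 and %x has more than 16 known sign bits, so the bits
 // dropped by the truncation are all copies of the sign bit.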
17761 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17762 return all_of(E.Scalars, [&](Value *V) {
17763 if (isa<PoisonValue>(V))
17764 return true;
17765 auto *I = cast<Instruction>(V);
17766 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17767 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17768 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17769 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17770 nullptr, DT);
17771 });
17772 };
17773 return TryProcessInstruction(
17774 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17775 AShrChecker);
17776 }
17777 case Instruction::UDiv:
17778 case Instruction::URem: {
17779 // UDiv and URem can be truncated if all the truncated bits are zero.
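 // For example, a udiv of two i32 values whose upper 16 bits are known to be
 // zero produces the same result when performed as an i16 udiv.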
17780 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17781 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17782 return all_of(E.Scalars, [&](Value *V) {
17783 auto *I = cast<Instruction>(V);
17784 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17785 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
17786 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17787 });
17788 };
17789 return TryProcessInstruction(
17790 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
17791 }
17792
17793 // We can demote selects if we can demote their true and false values.
17794 case Instruction::Select: {
17795 return TryProcessInstruction(
17796 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
17797 }
17798
17799 // We can demote phis if we can demote all their incoming operands. Note that
17800 // we don't need to worry about cycles since we ensure single use above.
17801 case Instruction::PHI: {
17802 const unsigned NumOps = E.getNumOperands();
17803 SmallVector<const TreeEntry *> Ops(NumOps, nullptr);
17804 transform(seq<unsigned>(0, NumOps), Ops.begin(),
17805 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
17806
17807 return TryProcessInstruction(BitWidth, Ops);
17808 }
17809
17810 case Instruction::Call: {
17811 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
17812 if (!IC)
17813 break;
17814 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
17815 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
17816 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
17817 break;
17818 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
17819 function_ref<bool(unsigned, unsigned)> CallChecker;
17820 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17821 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17822 return all_of(E.Scalars, [&](Value *V) {
17823 auto *I = cast<Instruction>(V);
17824 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
17825 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17826 return MaskedValueIsZero(I->getOperand(0), Mask,
17827 SimplifyQuery(*DL)) &&
17828 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17829 }
17830 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
17831 "Expected min/max intrinsics only.");
17832 unsigned SignBits = OrigBitWidth - BitWidth;
17833 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
17834 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17835 nullptr, DT);
17836 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
17837 nullptr, DT);
17838 return SignBits <= Op0SignBits &&
17839 ((SignBits != Op0SignBits &&
17840 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
17841 MaskedValueIsZero(I->getOperand(0), Mask,
17842 SimplifyQuery(*DL))) &&
17843 SignBits <= Op1SignBits &&
17844 ((SignBits != Op1SignBits &&
17845 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
17846 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
17847 });
17848 };
17849 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17850 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17851 return all_of(E.Scalars, [&](Value *V) {
17852 auto *I = cast<Instruction>(V);
17853 unsigned SignBits = OrigBitWidth - BitWidth;
17854 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
17855 unsigned Op0SignBits =
17856 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
17857 return SignBits <= Op0SignBits &&
17858 ((SignBits != Op0SignBits &&
17859 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
17860 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
17861 });
17862 };
17863 if (ID != Intrinsic::abs) {
17864 Operands.push_back(getOperandEntry(&E, 1));
17865 CallChecker = CompChecker;
17866 } else {
17867 CallChecker = AbsChecker;
17868 }
17869 InstructionCost BestCost =
17870 std::numeric_limits<InstructionCost::CostType>::max();
17871 unsigned BestBitWidth = BitWidth;
17872 unsigned VF = E.Scalars.size();
17873 // Choose the best bitwidth based on cost estimations.
17874 auto Checker = [&](unsigned BitWidth, unsigned) {
17875 unsigned MinBW = PowerOf2Ceil(BitWidth);
17876 SmallVector<Type *> ArgTys =
17877 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
17878 auto VecCallCosts = getVectorCallCosts(
17879 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
17880 TTI, TLI, ArgTys);
17881 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
17882 if (Cost < BestCost) {
17883 BestCost = Cost;
17884 BestBitWidth = BitWidth;
17885 }
17886 return false;
17887 };
17888 [[maybe_unused]] bool NeedToExit;
17889 (void)AttemptCheckBitwidth(Checker, NeedToExit);
17890 BitWidth = BestBitWidth;
17891 return TryProcessInstruction(BitWidth, Operands, CallChecker);
17892 }
17893
17894 // Otherwise, conservatively give up.
17895 default:
17896 break;
17897 }
17898 MaxDepthLevel = 1;
17899 return FinalAnalysis();
17900}
17901
17902static RecurKind getRdxKind(Value *V);
17903
17904 void BoUpSLP::computeMinimumValueSizes() {
17905 // We only attempt to truncate integer expressions.
17906 bool IsStoreOrInsertElt =
17907 VectorizableTree.front()->getOpcode() == Instruction::Store ||
17908 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
17909 if ((IsStoreOrInsertElt || UserIgnoreList) &&
17910 ExtraBitWidthNodes.size() <= 1 &&
17911 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
17912 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
17913 return;
17914
17915 unsigned NodeIdx = 0;
17916 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
17917 NodeIdx = 1;
17918
17919 // Ensure the roots of the vectorizable tree don't form a cycle.
17920 if (VectorizableTree[NodeIdx]->isGather() ||
17921 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
17922 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
17923 [NodeIdx](const EdgeInfo &EI) {
17924 return EI.UserTE->Idx > NodeIdx;
17925 })))
17926 return;
17927
17928 // If the first value node for store/insertelement is sext/zext/trunc, skip
17929 // it and resize to the final type.
17930 bool IsTruncRoot = false;
17931 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
17932 SmallVector<unsigned> RootDemotes;
17933 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
17934 if (NodeIdx != 0 &&
17935 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
17936 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
17937 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
17938 IsTruncRoot = true;
17939 RootDemotes.push_back(NodeIdx);
17940 IsProfitableToDemoteRoot = true;
17941 ++NodeIdx;
17942 }
17943
17944 // The reduction was already analyzed and found not profitable - exit.
17945 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
17946 return;
17947
17948 SmallVector<unsigned> ToDemote;
17949 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
17950 bool IsProfitableToDemoteRoot, unsigned Opcode,
17951 unsigned Limit, bool IsTruncRoot,
17952 bool IsSignedCmp) -> unsigned {
17953 ToDemote.clear();
17954 // If the root is trunc and the next node is gather/buildvector, keep the
17955 // trunc in scalars, which is free in most cases.
17956 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
17957 !NodesToKeepBWs.contains(E.Idx) &&
17958 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
17959 all_of(E.Scalars, [&](Value *V) {
17960 return V->hasOneUse() || isa<Constant>(V) ||
17961 (!V->hasNUsesOrMore(UsesLimit) &&
17962 none_of(V->users(), [&](User *U) {
17963 const TreeEntry *TE = getTreeEntry(U);
17964 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
17965 if (TE == UserTE || !TE)
17966 return false;
17967 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
17968 SelectInst>(U) ||
17969 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
17970 SelectInst>(UserTE->getMainOp()))
17971 return true;
17972 unsigned UserTESz = DL->getTypeSizeInBits(
17973 UserTE->Scalars.front()->getType());
17974 auto It = MinBWs.find(TE);
17975 if (It != MinBWs.end() && It->second.first > UserTESz)
17976 return true;
17977 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
17978 }));
17979 })) {
17980 ToDemote.push_back(E.Idx);
17981 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
17982 auto It = MinBWs.find(UserTE);
17983 if (It != MinBWs.end())
17984 return It->second.first;
17985 unsigned MaxBitWidth =
17986 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
17987 MaxBitWidth = bit_ceil(MaxBitWidth);
17988 if (MaxBitWidth < 8 && MaxBitWidth > 1)
17989 MaxBitWidth = 8;
17990 return MaxBitWidth;
17991 }
17992
17993 unsigned VF = E.getVectorFactor();
17994 Type *ScalarTy = E.Scalars.front()->getType();
17995 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
17996 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
17997 if (!TreeRootIT || !Opcode)
17998 return 0u;
17999
18000 if (any_of(E.Scalars,
18001 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18002 return 0u;
18003
18004 unsigned NumParts = TTI->getNumberOfParts(
18005 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18006
18007 // The maximum bit width required to represent all the values that can be
18008 // demoted without loss of precision. It would be safe to truncate the roots
18009 // of the expression to this width.
18010 unsigned MaxBitWidth = 1u;
18011
18012 // True if the roots can be zero-extended back to their original type,
18013 // rather than sign-extended. We know that if the leading bits are not
18014 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18015 // True.
18016 // Determine if the sign bit of all the roots is known to be zero. If not,
18017 // IsKnownPositive is set to False.
18018 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18019 if (isa<PoisonValue>(R))
18020 return true;
18021 KnownBits Known = computeKnownBits(R, *DL);
18022 return Known.isNonNegative();
18023 });
18024
18025 // We first check if all the bits of the roots are demanded. If they're not,
18026 // we can truncate the roots to this narrower type.
18027 for (Value *Root : E.Scalars) {
18028 if (isa<PoisonValue>(Root))
18029 continue;
18030 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18031 TypeSize NumTypeBits =
18032 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18033 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18034 // If we can't prove that the sign bit is zero, we must add one to the
18035 // maximum bit width to account for the unknown sign bit. This preserves
18036 // the existing sign bit so we can safely sign-extend the root back to the
18037 // original type. Otherwise, if we know the sign bit is zero, we will
18038 // zero-extend the root instead.
18039 //
18040 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18041 // one to the maximum bit width will yield a larger-than-necessary
18042 // type. In general, we need to add an extra bit only if we can't
18043 // prove that the upper bit of the original type is equal to the
18044 // upper bit of the proposed smaller type. If these two bits are
18045 // the same (either zero or one) we know that sign-extending from
18046 // the smaller type will result in the same value. Here, since we
18047 // can't yet prove this, we are just making the proposed smaller
18048 // type larger to ensure correctness.
18049 if (!IsKnownPositive)
18050 ++BitWidth1;
18051
18052 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18053 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18054 MaxBitWidth =
18055 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18056 }
18057
18058 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18059 MaxBitWidth = 8;
18060
18061 // If the original type is large, but the reduced type does not improve
18062 // register usage - ignore it.
18063 if (NumParts > 1 &&
18064 NumParts ==
18065 TTI->getNumberOfParts(getWidenedType(
18066 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18067 return 0u;
18068
18069 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18070 Opcode == Instruction::SExt ||
18071 Opcode == Instruction::ZExt || NumParts > 1;
18072 // Conservatively determine if we can actually truncate the roots of the
18073 // expression. Collect the values that can be demoted in ToDemote and
18074 // additional roots that require investigating in Roots.
18075 DenseSet<const TreeEntry *> Visited;
18076 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18077 bool NeedToDemote = IsProfitableToDemote;
18078
18079 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18080 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18081 NeedToDemote, IsTruncRoot) ||
18082 (MaxDepthLevel <= Limit &&
18083 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18084 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18085 DL->getTypeSizeInBits(TreeRootIT) /
18086 DL->getTypeSizeInBits(
18087 E.getMainOp()->getOperand(0)->getType()) >
18088 2)))))
18089 return 0u;
18090 // Round MaxBitWidth up to the next power-of-two.
18091 MaxBitWidth = bit_ceil(MaxBitWidth);
18092
18093 return MaxBitWidth;
18094 };
18095
18096 // If we can truncate the root, we must collect additional values that might
18097 // be demoted as a result. That is, those seeded by truncations we will
18098 // modify.
18099 // Add reduction ops sizes, if any.
18100 if (UserIgnoreList &&
18101 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18102 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18103 // x i1> to iN)).
18104 if (all_of(*UserIgnoreList,
18105 [](Value *V) {
18106 return isa<PoisonValue>(V) ||
18107 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18108 }) &&
18109 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18110 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18111 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18112 Builder.getInt1Ty()) {
18113 ReductionBitWidth = 1;
18114 } else {
18115 for (Value *V : *UserIgnoreList) {
18116 if (isa<PoisonValue>(V))
18117 continue;
18118 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18119 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18120 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18121 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18122 ++BitWidth1;
18123 unsigned BitWidth2 = BitWidth1;
18124 if (!RecurrenceDescriptor::isSignedRecurrenceKind(::getRdxKind(V))) {
18125 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18126 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18127 }
18128 ReductionBitWidth =
18129 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18130 }
18131 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18132 ReductionBitWidth = 8;
18133
18134 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18135 }
18136 }
18137 bool IsTopRoot = NodeIdx == 0;
18138 while (NodeIdx < VectorizableTree.size() &&
18139 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18140 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18141 RootDemotes.push_back(NodeIdx);
18142 ++NodeIdx;
18143 IsTruncRoot = true;
18144 }
18145 bool IsSignedCmp = false;
18146 while (NodeIdx < VectorizableTree.size()) {
18147 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18148 unsigned Limit = 2;
18149 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
18150 if (IsTopRoot &&
18151 ReductionBitWidth ==
18152 DL->getTypeSizeInBits(
18153 VectorizableTree.front()->Scalars.front()->getType()))
18154 Limit = 3;
18155 unsigned MaxBitWidth = ComputeMaxBitWidth(
18156 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
18157 Limit, IsTruncRoot, IsSignedCmp);
18158 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18159 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18160 ReductionBitWidth = bit_ceil(MaxBitWidth);
18161 else if (MaxBitWidth == 0)
18162 ReductionBitWidth = 0;
18163 }
18164
18165 for (unsigned Idx : RootDemotes) {
18166 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18167 uint32_t OrigBitWidth =
18168 DL->getTypeSizeInBits(V->getType()->getScalarType());
18169 if (OrigBitWidth > MaxBitWidth) {
18170 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18171 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18172 }
18173 return false;
18174 }))
18175 ToDemote.push_back(Idx);
18176 }
18177 RootDemotes.clear();
18178 IsTopRoot = false;
18179 IsProfitableToDemoteRoot = true;
18180
18181 if (ExtraBitWidthNodes.empty()) {
18182 NodeIdx = VectorizableTree.size();
18183 } else {
18184 unsigned NewIdx = 0;
18185 do {
18186 NewIdx = *ExtraBitWidthNodes.begin();
18187 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18188 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18189 NodeIdx = NewIdx;
18190 IsTruncRoot =
18191 NodeIdx < VectorizableTree.size() &&
18192 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18193 [](const EdgeInfo &EI) {
18194 return EI.EdgeIdx == 0 &&
18195 EI.UserTE->getOpcode() == Instruction::Trunc &&
18196 !EI.UserTE->isAltShuffle();
18197 });
18198 IsSignedCmp =
18199 NodeIdx < VectorizableTree.size() &&
18200 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18201 [&](const EdgeInfo &EI) {
18202 return EI.UserTE->getOpcode() == Instruction::ICmp &&
18203 any_of(EI.UserTE->Scalars, [&](Value *V) {
18204 auto *IC = dyn_cast<ICmpInst>(V);
18205 return IC &&
18206 (IC->isSigned() ||
18207 !isKnownNonNegative(IC->getOperand(0),
18208 SimplifyQuery(*DL)) ||
18209 !isKnownNonNegative(IC->getOperand(1),
18210 SimplifyQuery(*DL)));
18211 });
18212 });
18213 }
18214
18215 // If the maximum bit width we compute is less than the width of the roots'
18216 // type, we can proceed with the narrowing. Otherwise, do nothing.
18217 if (MaxBitWidth == 0 ||
18218 MaxBitWidth >=
18219 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18220 ->getBitWidth()) {
18221 if (UserIgnoreList)
18222 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18223 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18224 continue;
18225 }
18226
18227 // Finally, map the values we can demote to the maximum bit width we
18228 // computed.
18229 for (unsigned Idx : ToDemote) {
18230 TreeEntry *TE = VectorizableTree[Idx].get();
18231 if (MinBWs.contains(TE))
18232 continue;
18233 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18234 if (isa<PoisonValue>(R))
18235 return false;
18236 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18237 });
18238 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18239 }
18240 }
18241}
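
The loop over E.Scalars above derives MaxBitWidth from sign-bit and demanded-bit queries. A minimal standalone sketch of that arithmetic on plain 32-bit integers (hypothetical helper name, no LLVM types, and no demanded-bits analysis) might look like this:

#include <algorithm>
#include <bit>
#include <cstdint>
#include <vector>

// Pick a narrower common bit width for a group of 32-bit scalars.
unsigned pickDemotedBitWidth(const std::vector<int32_t> &Scalars) {
  const unsigned OrigBits = 32;
  const bool AllNonNegative = std::all_of(
      Scalars.begin(), Scalars.end(), [](int32_t V) { return V >= 0; });
  unsigned MaxBitWidth = 1;
  for (int32_t V : Scalars) {
    // Count leading copies of the sign bit, analogous to ComputeNumSignBits().
    uint32_t U = static_cast<uint32_t>(V < 0 ? ~V : V);
    unsigned NumSignBits = static_cast<unsigned>(std::countl_zero(U));
    unsigned BitWidth = OrigBits - NumSignBits;
    if (!AllNonNegative)
      ++BitWidth; // Keep one extra bit so sign-extension stays correct.
    MaxBitWidth = std::max(MaxBitWidth, BitWidth);
  }
  if (MaxBitWidth < 8 && MaxBitWidth > 1)
    MaxBitWidth = 8; // Sub-byte widths are not worth it.
  return std::bit_ceil(MaxBitWidth); // Round up to the next power of two.
}

For example, {3, 100, 42} yields 8, {-4, 7} also yields 8 because the unknown sign costs one extra bit, and any value needing more than 16 bits falls back to 32.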
18242
18243 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18244 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18245 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18246 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18247 auto *AA = &AM.getResult<AAManager>(F);
18248 auto *LI = &AM.getResult<LoopAnalysis>(F);
18249 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18250 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18251 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18252 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18253
18254 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18255 if (!Changed)
18256 return PreservedAnalyses::all();
18257
18258 PreservedAnalyses PA;
18259 PA.preserveSet<CFGAnalyses>();
18260 return PA;
18261}
18262
18263 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18264 TargetTransformInfo *TTI_,
18265 TargetLibraryInfo *TLI_, AAResults *AA_,
18266 LoopInfo *LI_, DominatorTree *DT_,
18267 AssumptionCache *AC_, DemandedBits *DB_,
18268 OptimizationRemarkEmitter *ORE_) {
18269 if (!RunSLPVectorization)
18270 return false;
18271 SE = SE_;
18272 TTI = TTI_;
18273 TLI = TLI_;
18274 AA = AA_;
18275 LI = LI_;
18276 DT = DT_;
18277 AC = AC_;
18278 DB = DB_;
18279 DL = &F.getDataLayout();
18280
18281 Stores.clear();
18282 GEPs.clear();
18283 bool Changed = false;
18284
18285 // If the target claims to have no vector registers don't attempt
18286 // vectorization.
18287 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18288 LLVM_DEBUG(
18289 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18290 return false;
18291 }
18292
18293 // Don't vectorize when the attribute NoImplicitFloat is used.
18294 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18295 return false;
18296
18297 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18298
18299 // Use the bottom up slp vectorizer to construct chains that start with
18300 // store instructions.
18301 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18302
18303 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18304 // delete instructions.
18305
18306 // Update DFS numbers now so that we can use them for ordering.
18307 DT->updateDFSNumbers();
18308
18309 // Scan the blocks in the function in post order.
18310 for (auto *BB : post_order(&F.getEntryBlock())) {
18311 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18312 continue;
18313
18314 // Start new block - clear the list of reduction roots.
18315 R.clearReductionData();
18316 collectSeedInstructions(BB);
18317
18318 // Vectorize trees that end at stores.
18319 if (!Stores.empty()) {
18320 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18321 << " underlying objects.\n");
18322 Changed |= vectorizeStoreChains(R);
18323 }
18324
18325 // Vectorize trees that end at reductions.
18326 Changed |= vectorizeChainsInBlock(BB, R);
18327
18328 // Vectorize the index computations of getelementptr instructions. This
18329 // is primarily intended to catch gather-like idioms ending at
18330 // non-consecutive loads.
18331 if (!GEPs.empty()) {
18332 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18333 << " underlying objects.\n");
18334 Changed |= vectorizeGEPIndices(BB, R);
18335 }
18336 }
18337
18338 if (Changed) {
18339 R.optimizeGatherSequence();
18340 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18341 }
18342 return Changed;
18343}
18344
18345std::optional<bool>
18346SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18347 unsigned Idx, unsigned MinVF,
18348 unsigned &Size) {
18349 Size = 0;
18350 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18351 << "\n");
18352 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18353 unsigned VF = Chain.size();
18354
18355 if (!has_single_bit(Sz) ||
18356 !hasFullVectorsOrPowerOf2(
18357 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18358 VF) ||
18359 VF < 2 || VF < MinVF) {
18360 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18361 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18362 // all vector lanes are used.
18363 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18364 return false;
18365 }
18366
18367 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18368 << "\n");
18369
18370 SetVector<Value *> ValOps;
18371 for (Value *V : Chain)
18372 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18373 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18374 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18375 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18376 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18377 bool IsAllowedSize =
18378 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18379 ValOps.size()) ||
18380 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18381 if ((!IsAllowedSize && S.getOpcode() &&
18382 S.getOpcode() != Instruction::Load &&
18383 (!S.getMainOp()->isSafeToRemove() ||
18384 any_of(ValOps.getArrayRef(),
18385 [&](Value *V) {
18386 return !isa<ExtractElementInst>(V) &&
18387 (V->getNumUses() > Chain.size() ||
18388 any_of(V->users(), [&](User *U) {
18389 return !Stores.contains(U);
18390 }));
18391 }))) ||
18392 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
18393 Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
18394 return false;
18395 }
18396 }
18397 if (R.isLoadCombineCandidate(Chain))
18398 return true;
18399 R.buildTree(Chain);
18400 // Check if the tree is tiny and if the store itself or its value is not vectorized.
18401 if (R.isTreeTinyAndNotFullyVectorizable()) {
18402 if (R.isGathered(Chain.front()) ||
18403 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18404 return std::nullopt;
18405 Size = R.getCanonicalGraphSize();
18406 return false;
18407 }
18408 R.reorderTopToBottom();
18409 R.reorderBottomToTop();
18410 R.transformNodes();
18411 R.buildExternalUses();
18412
18413 R.computeMinimumValueSizes();
18414
18415 Size = R.getCanonicalGraphSize();
18416 if (S.getOpcode() == Instruction::Load)
18417 Size = 2; // cut off masked gather small trees
18418 InstructionCost Cost = R.getTreeCost();
18419
18420 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18421 if (Cost < -SLPCostThreshold) {
18422 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18423
18424 using namespace ore;
18425
18426 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18427 cast<StoreInst>(Chain[0]))
18428 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18429 << " and with tree size "
18430 << NV("TreeSize", R.getTreeSize()));
18431
18432 R.vectorizeTree();
18433 return true;
18434 }
18435
18436 return false;
18437}
18438
18439/// Checks if the quadratic mean deviation is less than 90% of the mean size.
18440static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18441 bool First) {
18442 unsigned Num = 0;
18443 uint64_t Sum = std::accumulate(
18444 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18445 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18446 unsigned Size = First ? Val.first : Val.second;
18447 if (Size == 1)
18448 return V;
18449 ++Num;
18450 return V + Size;
18451 });
18452 if (Num == 0)
18453 return true;
18454 uint64_t Mean = Sum / Num;
18455 if (Mean == 0)
18456 return true;
18457 uint64_t Dev = std::accumulate(
18458 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18459 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18460 unsigned P = First ? Val.first : Val.second;
18461 if (P == 1)
18462 return V;
18463 return V + (P - Mean) * (P - Mean);
18464 }) /
18465 Num;
18466 return Dev * 81 / (Mean * Mean) == 0;
18467}
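
As a worked illustration of the integer test above, here is the same arithmetic on plain containers (hypothetical function name): sizes equal to 1 are ignored, and the check passes only while Dev * 81 stays smaller than Mean * Mean.

#include <cstdint>
#include <vector>

bool sizesAreUniform(const std::vector<unsigned> &Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (unsigned S : Sizes) {
    if (S == 1)
      continue; // Trees of size 1 are ignored, as in checkTreeSizes().
    Sum += S;
    ++Num;
  }
  if (Num == 0)
    return true;
  const uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes) {
    if (S == 1)
      continue;
    const int64_t D = static_cast<int64_t>(S) - static_cast<int64_t>(Mean);
    Dev += static_cast<uint64_t>(D * D);
  }
  Dev /= Num;
  return Dev * 81 / (Mean * Mean) == 0; // Same integer comparison as above.
}

For example, {4, 4, 4, 1} passes (deviation 0), while {2, 16, 2, 16} fails (mean 9, mean squared deviation 49).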
18468
18469bool SLPVectorizerPass::vectorizeStores(
18470 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18471 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18472 &Visited) {
18473 // We may run into multiple chains that merge into a single chain. We mark the
18474 // stores that we vectorized so that we don't visit the same store twice.
18475 BoUpSLP::ValueSet VectorizedStores;
18476 bool Changed = false;
18477
18478 struct StoreDistCompare {
18479 bool operator()(const std::pair<unsigned, int> &Op1,
18480 const std::pair<unsigned, int> &Op2) const {
18481 return Op1.second < Op2.second;
18482 }
18483 };
18484 // A set of pairs (index of store in Stores array ref, Distance of the store
18485 // address relative to base store address in units).
18486 using StoreIndexToDistSet =
18487 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18488 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18489 int PrevDist = -1;
18490 BoUpSLP::ValueList Operands;
18491 // Collect the chain into a list.
18492 for (auto [Idx, Data] : enumerate(Set)) {
18493 if (Operands.empty() || Data.second - PrevDist == 1) {
18494 Operands.push_back(Stores[Data.first]);
18495 PrevDist = Data.second;
18496 if (Idx != Set.size() - 1)
18497 continue;
18498 }
18499 auto E = make_scope_exit([&, &DataVar = Data]() {
18500 Operands.clear();
18501 Operands.push_back(Stores[DataVar.first]);
18502 PrevDist = DataVar.second;
18503 });
18504
18505 if (Operands.size() <= 1 ||
18506 !Visited
18507 .insert({Operands.front(),
18508 cast<StoreInst>(Operands.front())->getValueOperand(),
18509 Operands.back(),
18510 cast<StoreInst>(Operands.back())->getValueOperand(),
18511 Operands.size()})
18512 .second)
18513 continue;
18514
18515 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18516 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18517 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18518
18519 unsigned MaxVF =
18520 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18521 auto *Store = cast<StoreInst>(Operands[0]);
18522 Type *StoreTy = Store->getValueOperand()->getType();
18523 Type *ValueTy = StoreTy;
18524 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18525 ValueTy = Trunc->getSrcTy();
18526 unsigned MinVF = std::max<unsigned>(
18527 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18528 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18529 ValueTy)));
18530
18531 if (MaxVF < MinVF) {
18532 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18533 << ") < "
18534 << "MinVF (" << MinVF << ")\n");
18535 continue;
18536 }
18537
18538 unsigned NonPowerOf2VF = 0;
18539 if (VectorizeNonPowerOf2) {
18540 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18541 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18542 // lanes are used.
18543 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18544 if (has_single_bit(CandVF + 1)) {
18545 NonPowerOf2VF = CandVF;
18546 assert(NonPowerOf2VF != MaxVF &&
18547 "Non-power-of-2 VF should not be equal to MaxVF");
18548 }
18549 }
18550
18551 unsigned MaxRegVF = MaxVF;
18552 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18553 if (MaxVF < MinVF) {
18554 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18555 << ") < "
18556 << "MinVF (" << MinVF << ")\n");
18557 continue;
18558 }
18559
18560 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18561 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18562 unsigned Size = MinVF;
18563 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18564 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18565 Size *= 2;
18566 });
18567 unsigned End = Operands.size();
18568 unsigned Repeat = 0;
18569 constexpr unsigned MaxAttempts = 4;
18570 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18571 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18572 P.first = P.second = 1;
18573 });
18574 SmallDenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18575 auto IsNotVectorized = [](bool First,
18576 const std::pair<unsigned, unsigned> &P) {
18577 return First ? P.first > 0 : P.second > 0;
18578 };
18579 auto IsVectorized = [](bool First,
18580 const std::pair<unsigned, unsigned> &P) {
18581 return First ? P.first == 0 : P.second == 0;
18582 };
18583 auto VFIsProfitable = [](bool First, unsigned Size,
18584 const std::pair<unsigned, unsigned> &P) {
18585 return First ? Size >= P.first : Size >= P.second;
18586 };
18587 auto FirstSizeSame = [](unsigned Size,
18588 const std::pair<unsigned, unsigned> &P) {
18589 return Size == P.first;
18590 };
18591 while (true) {
18592 ++Repeat;
18593 bool RepeatChanged = false;
18594 bool AnyProfitableGraph = false;
18595 for (unsigned Size : CandidateVFs) {
18596 AnyProfitableGraph = false;
18597 unsigned StartIdx = std::distance(
18598 RangeSizes.begin(),
18599 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18600 std::placeholders::_1)));
18601 while (StartIdx < End) {
18602 unsigned EndIdx =
18603 std::distance(RangeSizes.begin(),
18604 find_if(RangeSizes.drop_front(StartIdx),
18605 std::bind(IsVectorized, Size >= MaxRegVF,
18606 std::placeholders::_1)));
18607 unsigned Sz = EndIdx >= End ? End : EndIdx;
18608 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18609 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18610 Size >= MaxRegVF)) {
18611 ++Cnt;
18612 continue;
18613 }
18614 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18615 assert(all_of(Slice,
18616 [&](Value *V) {
18617 return cast<StoreInst>(V)
18618 ->getValueOperand()
18619 ->getType() ==
18620 cast<StoreInst>(Slice.front())
18621 ->getValueOperand()
18622 ->getType();
18623 }) &&
18624 "Expected all operands of same type.");
18625 if (!NonSchedulable.empty()) {
18626 auto [NonSchedSizeMax, NonSchedSizeMin] =
18627 NonSchedulable.lookup(Slice.front());
18628 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18629 Cnt += NonSchedSizeMax;
18630 continue;
18631 }
18632 }
18633 unsigned TreeSize;
18634 std::optional<bool> Res =
18635 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18636 if (!Res) {
18637 NonSchedulable
18638 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18639 .first->getSecond()
18640 .second = Size;
18641 } else if (*Res) {
18642 // Mark the vectorized stores so that we don't vectorize them
18643 // again.
18644 VectorizedStores.insert(Slice.begin(), Slice.end());
18647 AnyProfitableGraph = RepeatChanged = Changed = true;
18648 // If we vectorized initial block, no need to try to vectorize
18649 // it again.
18650 for_each(RangeSizes.slice(Cnt, Size),
18651 [](std::pair<unsigned, unsigned> &P) {
18652 P.first = P.second = 0;
18653 });
18654 if (Cnt < StartIdx + MinVF) {
18655 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18656 [](std::pair<unsigned, unsigned> &P) {
18657 P.first = P.second = 0;
18658 });
18659 StartIdx = Cnt + Size;
18660 }
18661 if (Cnt > Sz - Size - MinVF) {
18662 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18663 [](std::pair<unsigned, unsigned> &P) {
18664 P.first = P.second = 0;
18665 });
18666 if (Sz == End)
18667 End = Cnt;
18668 Sz = Cnt;
18669 }
18670 Cnt += Size;
18671 continue;
18672 }
18673 if (Size > 2 && Res &&
18674 !all_of(RangeSizes.slice(Cnt, Size),
18675 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18676 std::placeholders::_1))) {
18677 Cnt += Size;
18678 continue;
18679 }
18680 // For very big VFs, check that we are not rebuilding the same
18681 // trees, just with a larger number of elements.
18682 if (Size > MaxRegVF && TreeSize > 1 &&
18683 all_of(RangeSizes.slice(Cnt, Size),
18684 std::bind(FirstSizeSame, TreeSize,
18685 std::placeholders::_1))) {
18686 Cnt += Size;
18687 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18688 ++Cnt;
18689 continue;
18690 }
18691 if (TreeSize > 1)
18692 for_each(RangeSizes.slice(Cnt, Size),
18693 [&](std::pair<unsigned, unsigned> &P) {
18694 if (Size >= MaxRegVF)
18695 P.second = std::max(P.second, TreeSize);
18696 else
18697 P.first = std::max(P.first, TreeSize);
18698 });
18699 ++Cnt;
18700 AnyProfitableGraph = true;
18701 }
18702 if (StartIdx >= End)
18703 break;
18704 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18705 AnyProfitableGraph = true;
18706 StartIdx = std::distance(
18707 RangeSizes.begin(),
18708 find_if(RangeSizes.drop_front(Sz),
18709 std::bind(IsNotVectorized, Size >= MaxRegVF,
18710 std::placeholders::_1)));
18711 }
18712 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18713 break;
18714 }
18715 // All values vectorized - exit.
18716 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18717 return P.first == 0 && P.second == 0;
18718 }))
18719 break;
18720 // Check if we have tried all attempts or if there is no need for further attempts at all.
18721 if (Repeat >= MaxAttempts ||
18722 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18723 break;
18724 constexpr unsigned StoresLimit = 64;
18725 const unsigned MaxTotalNum = std::min<unsigned>(
18726 Operands.size(),
18727 static_cast<unsigned>(
18728 End -
18729 std::distance(
18730 RangeSizes.begin(),
18731 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18732 std::placeholders::_1))) +
18733 1));
18734 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18735 unsigned Limit =
18736 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18737 CandidateVFs.clear();
18738 if (bit_floor(Limit) == VF)
18739 CandidateVFs.push_back(Limit);
18740 if (VF > MaxTotalNum || VF >= StoresLimit)
18741 break;
18742 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18743 if (P.first != 0)
18744 P.first = std::max(P.second, P.first);
18745 });
18746 // Last attempt to vectorize the maximum number of elements, if all previous
18747 // attempts were unsuccessful because of cost issues.
18748 CandidateVFs.push_back(VF);
18749 }
18750 }
18751 };
18752
18753 // Stores a pair (first: index of the store in the Stores array ref whose
18754 // address is taken as the base; second: sorted set of pairs {index, dist},
18755 // which are the indices of stores in the set and their store location
18756 // distances relative to the base address).
18757
18758 // Need to store the index of the very first store separately, since the set
18759 // may be reordered after the insertion and the first store may be moved. This
18760 // container allows us to reduce the number of calls to the getPointersDiff() function.
18761 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
18762 // Inserts the specified store SI with the given index Idx into the set of
18763 // stores. If a store with the same distance is already in the set - stop the
18764 // insertion and try to vectorize the stores found so far. If some stores from
18765 // this sequence were not vectorized - try to vectorize them together with the
18766 // new store later. This logic is applied only to the stores that come before
18767 // the previous store with the same distance.
18768 // Example:
18769 // 1. store x, %p
18770 // 2. store y, %p+1
18771 // 3. store z, %p+2
18772 // 4. store a, %p
18773 // 5. store b, %p+3
18774 // - Scan this from the last to first store. The very first bunch of stores is
18775 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
18776 // vector).
18777 // - The next store in the list - #1 - has the same distance from store #5 as
18778 // the store #4.
18779 // - Try to vectorize sequence of stores 4,2,3,5.
18780 // - If all these stores are vectorized - just drop them.
18781 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
18782 // - Start new stores sequence.
18783 // The new bunch of stores is {1, {1, 0}}.
18784 // - Add the stores from previous sequence, that were not vectorized.
18785 // Here we consider the stores in reverse order relative to how they are used
18786 // in the IR (Stores is already reversed, see the vectorizeStoreChains() function).
18787 // Store #3 can be added - it comes after store #4 and has the same distance as
18788 // store #1.
18789 // Store #5 cannot be added - comes before store #4.
18790 // This logic improves compile time: we assume that the stores after a
18791 // previous store with the same distance most likely have memory dependencies,
18792 // so there is no need to waste compile time trying to vectorize them.
18793 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
18794 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
18795 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
18796 std::optional<int> Diff = getPointersDiff(
18797 Stores[Set.first]->getValueOperand()->getType(),
18798 Stores[Set.first]->getPointerOperand(),
18799 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
18800 /*StrictCheck=*/true);
18801 if (!Diff)
18802 continue;
18803 auto It = Set.second.find(std::make_pair(Idx, *Diff));
18804 if (It == Set.second.end()) {
18805 Set.second.emplace(Idx, *Diff);
18806 return;
18807 }
18808 // Try to vectorize the first found set to avoid duplicate analysis.
18809 TryToVectorize(Set.second);
18810 unsigned ItIdx = It->first;
18811 int ItDist = It->second;
18812 StoreIndexToDistSet PrevSet;
18813 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
18814 [&](const std::pair<unsigned, int> &Pair) {
18815 return Pair.first > ItIdx;
18816 });
18817 Set.second.clear();
18818 Set.first = Idx;
18819 Set.second.emplace(Idx, 0);
18820 // Insert stores that followed previous match to try to vectorize them
18821 // with this store.
18822 unsigned StartIdx = ItIdx + 1;
18823 SmallBitVector UsedStores(Idx - StartIdx);
18824 // Distances to previously found dup store (or this store, since they
18825 // store to the same addresses).
18826 SmallVector<int> Dists(Idx - StartIdx, 0);
18827 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
18828 // Do not try to vectorize sequences we have already tried.
18829 if (VectorizedStores.contains(Stores[Pair.first]))
18830 break;
18831 unsigned BI = Pair.first - StartIdx;
18832 UsedStores.set(BI);
18833 Dists[BI] = Pair.second - ItDist;
18834 }
18835 for (unsigned I = StartIdx; I < Idx; ++I) {
18836 unsigned BI = I - StartIdx;
18837 if (UsedStores.test(BI))
18838 Set.second.emplace(I, Dists[BI]);
18839 }
18840 return;
18841 }
18842 auto &Res = SortedStores.emplace_back();
18843 Res.first = Idx;
18844 Res.second.emplace(Idx, 0);
18845 };
18846 Type *PrevValTy = nullptr;
18847 for (auto [I, SI] : enumerate(Stores)) {
18848 if (R.isDeleted(SI))
18849 continue;
18850 if (!PrevValTy)
18851 PrevValTy = SI->getValueOperand()->getType();
18852 // Check that we do not try to vectorize stores of different types.
18853 if (PrevValTy != SI->getValueOperand()->getType()) {
18854 for (auto &Set : SortedStores)
18855 TryToVectorize(Set.second);
18856 SortedStores.clear();
18857 PrevValTy = SI->getValueOperand()->getType();
18858 }
18859 FillStoresSet(I, SI);
18860 }
18861
18862 // Final vectorization attempt.
18863 for (auto &Set : SortedStores)
18864 TryToVectorize(Set.second);
18865
18866 return Changed;
18867}
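
A self-contained sketch of the distance-set idea used by TryToVectorize above, with simplified stand-in types and made-up indices and distances: (index, distance) pairs are ordered by distance, and a chain is extended while the distance grows by exactly one.

#include <cstdio>
#include <set>
#include <utility>
#include <vector>

struct DistCompare {
  bool operator()(const std::pair<unsigned, int> &A,
                  const std::pair<unsigned, int> &B) const {
    return A.second < B.second; // Order by distance, like StoreDistCompare.
  }
};

int main() {
  std::set<std::pair<unsigned, int>, DistCompare> Set = {
      {3, -3}, {1, -2}, {2, -1}, {0, 0}, {4, 2}};
  std::vector<unsigned> Chain;
  int PrevDist = 0;
  bool First = true;
  for (const auto &[Idx, Dist] : Set) {
    if (First || Dist - PrevDist == 1) {
      Chain.push_back(Idx); // Consecutive store, extend the current chain.
    } else {
      std::printf("chain of %zu stores\n", Chain.size()); // Candidate to vectorize.
      Chain.assign(1, Idx);
    }
    PrevDist = Dist;
    First = false;
  }
  std::printf("chain of %zu stores\n", Chain.size());
  return 0;
}

Because the comparator only looks at the distance, inserting a second pair with an already-present distance fails, which is the signal FillStoresSet above uses to stop and try to vectorize the current set.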
18868
18869void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
18870 // Initialize the collections. We will make a single pass over the block.
18871 Stores.clear();
18872 GEPs.clear();
18873
18874 // Visit the store and getelementptr instructions in BB and organize them in
18875 // Stores and GEPs according to the underlying objects of their pointer
18876 // operands.
18877 for (Instruction &I : *BB) {
18878 // Ignore store instructions that are volatile or have a pointer operand
18879 // that doesn't point to a scalar type.
18880 if (auto *SI = dyn_cast<StoreInst>(&I)) {
18881 if (!SI->isSimple())
18882 continue;
18883 if (!isValidElementType(SI->getValueOperand()->getType()))
18884 continue;
18885 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
18886 }
18887
18888 // Ignore getelementptr instructions that have more than one index, a
18889 // constant index, or a pointer operand that doesn't point to a scalar
18890 // type.
18891 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
18892 if (GEP->getNumIndices() != 1)
18893 continue;
18894 Value *Idx = GEP->idx_begin()->get();
18895 if (isa<Constant>(Idx))
18896 continue;
18897 if (!isValidElementType(Idx->getType()))
18898 continue;
18899 if (GEP->getType()->isVectorTy())
18900 continue;
18901 GEPs[GEP->getPointerOperand()].push_back(GEP);
18902 }
18903 }
18904}
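
A simplified sketch of the seed bucketing above, using hypothetical stand-in types instead of LLVM classes: simple stores are grouped by the base object of their pointer operand, so later stages only compare stores that could ever form a consecutive chain.

#include <map>
#include <string>
#include <vector>

struct FakeStore {
  std::string Base;   // Stand-in for getUnderlyingObject() of the pointer operand.
  bool Simple = true; // Mirrors StoreInst::isSimple().
};

std::map<std::string, std::vector<const FakeStore *>>
collectStoreSeeds(const std::vector<FakeStore> &Block) {
  std::map<std::string, std::vector<const FakeStore *>> Buckets;
  for (const FakeStore &S : Block) {
    if (!S.Simple)
      continue; // Volatile/atomic stores are skipped, as in the pass.
    Buckets[S.Base].push_back(&S);
  }
  return Buckets;
}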
18905
18906bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
18907 bool MaxVFOnly) {
18908 if (VL.size() < 2)
18909 return false;
18910
18911 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
18912 << VL.size() << ".\n");
18913
18914 // Check that all of the parts are instructions of the same type,
18915 // we permit an alternate opcode via InstructionsState.
18916 InstructionsState S = getSameOpcode(VL, *TLI);
18917 if (!S.getOpcode())
18918 return false;
18919
18920 Instruction *I0 = S.getMainOp();
18921 // Make sure invalid types (including vector type) are rejected before
18922 // determining vectorization factor for scalar instructions.
18923 for (Value *V : VL) {
18924 Type *Ty = V->getType();
18925 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
18926 // NOTE: the following will give the user an internal LLVM type name, which
18927 // may not be useful.
18928 R.getORE()->emit([&]() {
18929 std::string TypeStr;
18930 llvm::raw_string_ostream rso(TypeStr);
18931 Ty->print(rso);
18932 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
18933 << "Cannot SLP vectorize list: type "
18934 << TypeStr + " is unsupported by vectorizer";
18935 });
18936 return false;
18937 }
18938 }
18939
18940 unsigned Sz = R.getVectorElementSize(I0);
18941 unsigned MinVF = R.getMinVF(Sz);
18942 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
18943 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
18944 if (MaxVF < 2) {
18945 R.getORE()->emit([&]() {
18946 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
18947 << "Cannot SLP vectorize list: vectorization factor "
18948 << "less than 2 is not supported";
18949 });
18950 return false;
18951 }
18952
18953 bool Changed = false;
18954 bool CandidateFound = false;
18955 InstructionCost MinCost = SLPCostThreshold.getValue();
18956 Type *ScalarTy = getValueType(VL[0]);
18957
18958 unsigned NextInst = 0, MaxInst = VL.size();
18959 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
18960 // No actual vectorization should happen if the number of parts is the same
18961 // as the provided vectorization factor (i.e. the scalar type is used for
18962 // vector code during codegen).
18963 auto *VecTy = getWidenedType(ScalarTy, VF);
18964 if (TTI->getNumberOfParts(VecTy) == VF)
18965 continue;
18966 for (unsigned I = NextInst; I < MaxInst; ++I) {
18967 unsigned ActualVF = std::min(MaxInst - I, VF);
18968
18969 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
18970 continue;
18971
18972 if (MaxVFOnly && ActualVF < MaxVF)
18973 break;
18974 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
18975 break;
18976
18977 SmallVector<Value *> Ops(ActualVF, nullptr);
18978 unsigned Idx = 0;
18979 for (Value *V : VL.drop_front(I)) {
18980 // Check that a previous iteration of this loop did not delete the
18981 // Value.
18982 if (auto *Inst = dyn_cast<Instruction>(V);
18983 !Inst || !R.isDeleted(Inst)) {
18984 Ops[Idx] = V;
18985 ++Idx;
18986 if (Idx == ActualVF)
18987 break;
18988 }
18989 }
18990 // Not enough vectorizable instructions - exit.
18991 if (Idx != ActualVF)
18992 break;
18993
18994 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
18995 << "\n");
18996
18997 R.buildTree(Ops);
18998 if (R.isTreeTinyAndNotFullyVectorizable())
18999 continue;
19000 R.reorderTopToBottom();
19001 R.reorderBottomToTop(
19002 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19003 !R.doesRootHaveInTreeUses());
19004 R.transformNodes();
19005 R.buildExternalUses();
19006
19007 R.computeMinimumValueSizes();
19008 InstructionCost Cost = R.getTreeCost();
19009 CandidateFound = true;
19010 MinCost = std::min(MinCost, Cost);
19011
19012 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19013 << " for VF=" << ActualVF << "\n");
19014 if (Cost < -SLPCostThreshold) {
19015 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19016 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19017 cast<Instruction>(Ops[0]))
19018 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19019 << " and with tree size "
19020 << ore::NV("TreeSize", R.getTreeSize()));
19021
19022 R.vectorizeTree();
19023 // Move to the next bundle.
19024 I += VF - 1;
19025 NextInst = I + 1;
19026 Changed = true;
19027 }
19028 }
19029 }
19030
19031 if (!Changed && CandidateFound) {
19032 R.getORE()->emit([&]() {
19033 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19034 << "List vectorization was possible but not beneficial with cost "
19035 << ore::NV("Cost", MinCost) << " >= "
19036 << ore::NV("Treshold", -SLPCostThreshold);
19037 });
19038 } else if (!Changed) {
19039 R.getORE()->emit([&]() {
19040 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19041 << "Cannot SLP vectorize list: vectorization was impossible"
19042 << " with available vectorization factors";
19043 });
19044 }
19045 return Changed;
19046}
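
A rough sketch of the search order in tryToVectorizeList, under the assumptions that there is no cost model and every bundle is acceptable (names are hypothetical): the vectorization factor starts at MaxVF, is halved each round, and a window of that width slides over the candidate list.

#include <algorithm>
#include <cstdio>

void enumerateBundles(unsigned NumValues, unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MaxVF; VF >= MinVF && VF >= 2; VF /= 2) {
    for (unsigned I = 0; I + 1 < NumValues; ++I) {
      const unsigned ActualVF = std::min(NumValues - I, VF);
      if (ActualVF < 2)
        break;
      std::printf("candidate bundle [%u, %u) at VF=%u\n", I, I + ActualVF, VF);
      // The real code builds an SLP tree for such a bundle, compares its cost
      // against SLPCostThreshold and, on success, advances past the bundle.
    }
  }
}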
19047
19048bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19049 if (!I)
19050 return false;
19051
19052 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19053 return false;
19054
19055 Value *P = I->getParent();
19056
19057 // Vectorize in current basic block only.
19058 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19059 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19060 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19061 R.isDeleted(Op0) || R.isDeleted(Op1))
19062 return false;
19063
19064 // First collect all possible candidates
19065 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19066 Candidates.emplace_back(Op0, Op1);
19067
19068 auto *A = dyn_cast<BinaryOperator>(Op0);
19069 auto *B = dyn_cast<BinaryOperator>(Op1);
19070 // Try to skip B.
19071 if (A && B && B->hasOneUse()) {
19072 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19073 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19074 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19075 Candidates.emplace_back(A, B0);
19076 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19077 Candidates.emplace_back(A, B1);
19078 }
19079 // Try to skip A.
19080 if (B && A && A->hasOneUse()) {
19081 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19082 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19083 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19084 Candidates.emplace_back(A0, B);
19085 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19086 Candidates.emplace_back(A1, B);
19087 }
19088
19089 if (Candidates.size() == 1)
19090 return tryToVectorizeList({Op0, Op1}, R);
19091
19092 // We have multiple options. Try to pick the single best.
19093 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19094 if (!BestCandidate)
19095 return false;
19096 return tryToVectorizeList(
19097 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19098}
19099
19100namespace {
19101
19102/// Model horizontal reductions.
19103///
19104/// A horizontal reduction is a tree of reduction instructions that has values
19105/// that can be put into a vector as its leaves. For example:
19106///
19107/// mul mul mul mul
19108/// \ / \ /
19109/// + +
19110/// \ /
19111/// +
19112/// This tree has "mul" as its leaf values and "+" as its reduction
19113/// instructions. A reduction can feed into a store or a binary operation
19114/// feeding a phi.
19115/// ...
19116/// \ /
19117/// +
19118/// |
19119/// phi +=
19120///
19121/// Or:
19122/// ...
19123/// \ /
19124/// +
19125/// |
19126/// *p =
19127///
19128class HorizontalReduction {
19129 using ReductionOpsType = SmallVector<Value *, 16>;
19130 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19131 ReductionOpsListType ReductionOps;
19132 /// List of possibly reduced values.
19133 SmallVector<SmallVector<Value *>> ReducedVals;
19134 /// Maps reduced value to the corresponding reduction operation.
19135 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
19136 WeakTrackingVH ReductionRoot;
19137 /// The type of reduction operation.
19138 RecurKind RdxKind;
19139 /// Checks if the optimization of original scalar identity operations on
19140 /// matched horizontal reductions is enabled and allowed.
19141 bool IsSupportedHorRdxIdentityOp = false;
19142
19143 static bool isCmpSelMinMax(Instruction *I) {
19144 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19145 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
19146 }
19147
19148 // And/or are potentially poison-safe logical patterns like:
19149 // select x, y, false
19150 // select x, true, y
19151 static bool isBoolLogicOp(Instruction *I) {
19152 return isa<SelectInst>(I) &&
19153 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19154 }
19155
19156 /// Checks if instruction is associative and can be vectorized.
19157 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19158 if (Kind == RecurKind::None)
19159 return false;
19160
19161 // Integer ops that map to select instructions or intrinsics are fine.
19162 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19163 isBoolLogicOp(I))
19164 return true;
19165
19166 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19167 // FP min/max are associative except for NaN and -0.0. We do not
19168 // have to rule out -0.0 here because the intrinsic semantics do not
19169 // specify a fixed result for it.
19170 return I->getFastMathFlags().noNaNs();
19171 }
19172
19173 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19174 return true;
19175
19176 return I->isAssociative();
19177 }
19178
19179 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19180 // Poison-safe 'or' takes the form: select X, true, Y
19181 // To make that work with the normal operand processing, we skip the
19182 // true value operand.
19183 // TODO: Change the code and data structures to handle this without a hack.
19184 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19185 return I->getOperand(2);
19186 return I->getOperand(Index);
19187 }
19188
19189 /// Creates reduction operation with the current opcode.
19190 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19191 Value *RHS, const Twine &Name, bool UseSelect) {
19192 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19193 switch (Kind) {
19194 case RecurKind::Or:
19195 if (UseSelect &&
19196 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19197 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19198 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19199 Name);
19200 case RecurKind::And:
19201 if (UseSelect &&
19202 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19203 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19204 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19205 Name);
19206 case RecurKind::Add:
19207 case RecurKind::Mul:
19208 case RecurKind::Xor:
19209 case RecurKind::FAdd:
19210 case RecurKind::FMul:
19211 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19212 Name);
19213 case RecurKind::FMax:
19214 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
19215 case RecurKind::FMin:
19216 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
19217 case RecurKind::FMaximum:
19218 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
19219 case RecurKind::FMinimum:
19220 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
19221 case RecurKind::SMax:
19222 if (UseSelect) {
19223 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
19224 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19225 }
19226 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
19227 case RecurKind::SMin:
19228 if (UseSelect) {
19229 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
19230 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19231 }
19232 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
19233 case RecurKind::UMax:
19234 if (UseSelect) {
19235 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
19236 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19237 }
19238 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
19239 case RecurKind::UMin:
19240 if (UseSelect) {
19241 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
19242 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19243 }
19244 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
19245 default:
19246 llvm_unreachable("Unknown reduction operation.");
19247 }
19248 }
19249
19250 /// Creates reduction operation with the current opcode with the IR flags
19251 /// from \p ReductionOps, dropping nuw/nsw flags.
19252 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19253 Value *RHS, const Twine &Name,
19254 const ReductionOpsListType &ReductionOps) {
19255 bool UseSelect = ReductionOps.size() == 2 ||
19256 // Logical or/and.
19257 (ReductionOps.size() == 1 &&
19258 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19259 assert((!UseSelect || ReductionOps.size() != 2 ||
19260 isa<SelectInst>(ReductionOps[1][0])) &&
19261 "Expected cmp + select pairs for reduction");
19262 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19263 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19264 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19265 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19266 /*IncludeWrapFlags=*/false);
19267 propagateIRFlags(Op, ReductionOps[1], nullptr,
19268 /*IncludeWrapFlags=*/false);
19269 return Op;
19270 }
19271 }
19272 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19273 return Op;
19274 }
19275
19276public:
19277 static RecurKind getRdxKind(Value *V) {
19278 auto *I = dyn_cast<Instruction>(V);
19279 if (!I)
19280 return RecurKind::None;
19281 if (match(I, m_Add(m_Value(), m_Value())))
19282 return RecurKind::Add;
19283 if (match(I, m_Mul(m_Value(), m_Value())))
19284 return RecurKind::Mul;
19285 if (match(I, m_And(m_Value(), m_Value())) ||
19286 match(I, m_LogicalAnd(m_Value(), m_Value())))
19287 return RecurKind::And;
19288 if (match(I, m_Or(m_Value(), m_Value())) ||
19289 match(I, m_LogicalOr(m_Value(), m_Value())))
19290 return RecurKind::Or;
19291 if (match(I, m_Xor(m_Value(), m_Value())))
19292 return RecurKind::Xor;
19293 if (match(I, m_FAdd(m_Value(), m_Value())))
19294 return RecurKind::FAdd;
19295 if (match(I, m_FMul(m_Value(), m_Value())))
19296 return RecurKind::FMul;
19297
19298 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19299 return RecurKind::FMax;
19300 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19301 return RecurKind::FMin;
19302
19303 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19304 return RecurKind::FMaximum;
19305 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19306 return RecurKind::FMinimum;
19307 // This matches either cmp+select or intrinsics. SLP is expected to handle
19308 // either form.
19309 // TODO: If we are canonicalizing to intrinsics, we can remove several
19310 // special-case paths that deal with selects.
19311 if (match(I, m_SMax(m_Value(), m_Value())))
19312 return RecurKind::SMax;
19313 if (match(I, m_SMin(m_Value(), m_Value())))
19314 return RecurKind::SMin;
19315 if (match(I, m_UMax(m_Value(), m_Value())))
19316 return RecurKind::UMax;
19317 if (match(I, m_UMin(m_Value(), m_Value())))
19318 return RecurKind::UMin;
19319
19320 if (auto *Select = dyn_cast<SelectInst>(I)) {
19321 // Try harder: look for min/max pattern based on instructions producing
19322 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19323 // During the intermediate stages of SLP, it's very common to have
19324 // pattern like this (since optimizeGatherSequence is run only once
19325 // at the end):
19326 // %1 = extractelement <2 x i32> %a, i32 0
19327 // %2 = extractelement <2 x i32> %a, i32 1
19328 // %cond = icmp sgt i32 %1, %2
19329 // %3 = extractelement <2 x i32> %a, i32 0
19330 // %4 = extractelement <2 x i32> %a, i32 1
19331 // %select = select i1 %cond, i32 %3, i32 %4
19332 CmpPredicate Pred;
19333 Instruction *L1;
19334 Instruction *L2;
19335
19336 Value *LHS = Select->getTrueValue();
19337 Value *RHS = Select->getFalseValue();
19338 Value *Cond = Select->getCondition();
19339
19340 // TODO: Support inverse predicates.
19341 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19342 if (!isa<ExtractElementInst>(RHS) ||
19343 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19344 return RecurKind::None;
19345 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19346 if (!isa<ExtractElementInst>(LHS) ||
19347 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19348 return RecurKind::None;
19349 } else {
19350 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19351 return RecurKind::None;
19352 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19353 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19354 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19355 return RecurKind::None;
19356 }
19357
19358 switch (Pred) {
19359 default:
19360 return RecurKind::None;
19361 case CmpInst::ICMP_SGT:
19362 case CmpInst::ICMP_SGE:
19363 return RecurKind::SMax;
19364 case CmpInst::ICMP_SLT:
19365 case CmpInst::ICMP_SLE:
19366 return RecurKind::SMin;
19367 case CmpInst::ICMP_UGT:
19368 case CmpInst::ICMP_UGE:
19369 return RecurKind::UMax;
19370 case CmpInst::ICMP_ULT:
19371 case CmpInst::ICMP_ULE:
19372 return RecurKind::UMin;
19373 }
19374 }
19375 return RecurKind::None;
19376 }
19377
19378 /// Get the index of the first operand.
19379 static unsigned getFirstOperandIndex(Instruction *I) {
19380 return isCmpSelMinMax(I) ? 1 : 0;
19381 }
19382
19383private:
19384 /// Total number of operands in the reduction operation.
19385 static unsigned getNumberOfOperands(Instruction *I) {
19386 return isCmpSelMinMax(I) ? 3 : 2;
19387 }
19388
19389 /// Checks if the instruction is in basic block \p BB.
19390 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19391 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19392 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19393 auto *Sel = cast<SelectInst>(I);
19394 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19395 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19396 }
19397 return I->getParent() == BB;
19398 }
19399
19400 /// Expected number of uses for reduction operations/reduced values.
19401 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19402 if (IsCmpSelMinMax) {
19403 // SelectInst must be used twice while the condition op must have single
19404 // use only.
19405 if (auto *Sel = dyn_cast<SelectInst>(I))
19406 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19407 return I->hasNUses(2);
19408 }
19409
19410 // Arithmetic reduction operation must be used once only.
19411 return I->hasOneUse();
19412 }
19413
19414 /// Initializes the list of reduction operations.
19415 void initReductionOps(Instruction *I) {
19416 if (isCmpSelMinMax(I))
19417 ReductionOps.assign(2, ReductionOpsType());
19418 else
19419 ReductionOps.assign(1, ReductionOpsType());
19420 }
19421
19422 /// Add all reduction operations for the reduction instruction \p I.
19423 void addReductionOps(Instruction *I) {
19424 if (isCmpSelMinMax(I)) {
19425 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19426 ReductionOps[1].emplace_back(I);
19427 } else {
19428 ReductionOps[0].emplace_back(I);
19429 }
19430 }
19431
19432 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19433 int Sz = Data.size();
19434 auto *I = dyn_cast<Instruction>(Data.front());
19435 return Sz > 1 || isConstant(Data.front()) ||
19436 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19437 }
19438
19439public:
19440 HorizontalReduction() = default;
19441
19442 /// Try to find a reduction tree.
19443 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19444 ScalarEvolution &SE, const DataLayout &DL,
19445 const TargetLibraryInfo &TLI) {
19446 RdxKind = HorizontalReduction::getRdxKind(Root);
19447 if (!isVectorizable(RdxKind, Root))
19448 return false;
19449
19450 // Analyze "regular" integer/FP types for reductions - no target-specific
19451 // types or pointers.
19452 Type *Ty = Root->getType();
19453 if (!isValidElementType(Ty) || Ty->isPointerTy())
19454 return false;
19455
 19456 // Though the ultimate reduction may have multiple uses, its condition must
 19457 // have only a single use.
19458 if (auto *Sel = dyn_cast<SelectInst>(Root))
19459 if (!Sel->getCondition()->hasOneUse())
19460 return false;
19461
19462 ReductionRoot = Root;
19463
19464 // Iterate through all the operands of the possible reduction tree and
19465 // gather all the reduced values, sorting them by their value id.
19466 BasicBlock *BB = Root->getParent();
19467 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
 19468 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
 19469 1, std::make_pair(Root, 0));
19470 // Checks if the operands of the \p TreeN instruction are also reduction
19471 // operations or should be treated as reduced values or an extra argument,
19472 // which is not part of the reduction.
19473 auto CheckOperands = [&](Instruction *TreeN,
19474 SmallVectorImpl<Value *> &PossibleReducedVals,
19475 SmallVectorImpl<Instruction *> &ReductionOps,
19476 unsigned Level) {
19477 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19478 getNumberOfOperands(TreeN)))) {
19479 Value *EdgeVal = getRdxOperand(TreeN, I);
19480 ReducedValsToOps[EdgeVal].push_back(TreeN);
19481 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
 19482 // If the edge is not an instruction, or it does not match the main
 19483 // reduction opcode or has too many uses, treat it as a possible reduced
 19484 // value. Also, do not try to reduce constant values if the operation is
 19485 // not foldable.
19486 if (!EdgeInst || Level > RecursionMaxDepth ||
19487 getRdxKind(EdgeInst) != RdxKind ||
19488 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19489 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19490 !isVectorizable(RdxKind, EdgeInst) ||
19491 (R.isAnalyzedReductionRoot(EdgeInst) &&
19492 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19493 PossibleReducedVals.push_back(EdgeVal);
19494 continue;
19495 }
19496 ReductionOps.push_back(EdgeInst);
19497 }
19498 };
19499 // Try to regroup reduced values so that it gets more profitable to try to
19500 // reduce them. Values are grouped by their value ids, instructions - by
19501 // instruction op id and/or alternate op id, plus do extra analysis for
 19502 // loads (grouping them by the distance between pointers) and cmp
19503 // instructions (grouping them by the predicate).
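// For example, loads from the same base object whose pointers differ by a
// known constant distance receive a common subkey, so a chain such as
// a[0] + a[1] + a[2] + a[3] is grouped together even if the adds are
// interleaved with unrelated reduced values.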
 19504 SmallMapVector<
 19505 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
 19506 8>
 19507 PossibleReducedVals;
19508 initReductionOps(Root);
 19509 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
 19510 SmallSet<size_t, 2> LoadKeyUsed;
19511
19512 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19513 Key = hash_combine(hash_value(LI->getParent()), Key);
 19514 Value *Ptr =
 19515 getUnderlyingObject(LI->getPointerOperand());
19516 if (!LoadKeyUsed.insert(Key).second) {
19517 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19518 if (LIt != LoadsMap.end()) {
19519 for (LoadInst *RLI : LIt->second) {
19520 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19521 LI->getType(), LI->getPointerOperand(), DL, SE,
19522 /*StrictCheck=*/true))
19523 return hash_value(RLI->getPointerOperand());
19524 }
19525 for (LoadInst *RLI : LIt->second) {
 19526 if (arePointersCompatible(RLI->getPointerOperand(),
 19527 LI->getPointerOperand(), TLI)) {
19528 hash_code SubKey = hash_value(RLI->getPointerOperand());
19529 return SubKey;
19530 }
19531 }
19532 if (LIt->second.size() > 2) {
19533 hash_code SubKey =
19534 hash_value(LIt->second.back()->getPointerOperand());
19535 return SubKey;
19536 }
19537 }
19538 }
19539 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19540 .first->second.push_back(LI);
19541 return hash_value(LI->getPointerOperand());
19542 };
19543
19544 while (!Worklist.empty()) {
19545 auto [TreeN, Level] = Worklist.pop_back_val();
19546 SmallVector<Value *> PossibleRedVals;
19547 SmallVector<Instruction *> PossibleReductionOps;
19548 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19549 addReductionOps(TreeN);
19550 // Add reduction values. The values are sorted for better vectorization
19551 // results.
19552 for (Value *V : PossibleRedVals) {
19553 size_t Key, Idx;
19554 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19555 /*AllowAlternate=*/false);
19556 ++PossibleReducedVals[Key][Idx]
19557 .insert(std::make_pair(V, 0))
19558 .first->second;
19559 }
19560 for (Instruction *I : reverse(PossibleReductionOps))
19561 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19562 }
19563 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
 19564 // Sort values by the total number of value kinds to start the reduction
 19565 // from the longest possible reduced values sequences.
19566 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19567 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19568 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19569 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19570 It != E; ++It) {
19571 PossibleRedValsVect.emplace_back();
19572 auto RedValsVect = It->second.takeVector();
19573 stable_sort(RedValsVect, llvm::less_second());
19574 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19575 PossibleRedValsVect.back().append(Data.second, Data.first);
19576 }
19577 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19578 return P1.size() > P2.size();
19579 });
19580 int NewIdx = -1;
19581 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19582 if (NewIdx < 0 ||
19583 (!isGoodForReduction(Data) &&
19584 (!isa<LoadInst>(Data.front()) ||
19585 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
 19586 getUnderlyingObject(
 19587 cast<LoadInst>(Data.front())->getPointerOperand()) !=
 19588 getUnderlyingObject(
 19589 cast<LoadInst>(ReducedVals[NewIdx].front())
 19590 ->getPointerOperand())))) {
19591 NewIdx = ReducedVals.size();
19592 ReducedVals.emplace_back();
19593 }
19594 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19595 }
19596 }
19597 // Sort the reduced values by number of same/alternate opcode and/or pointer
19598 // operand.
19599 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19600 return P1.size() > P2.size();
19601 });
19602 return true;
19603 }
19604
19605 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19606 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19607 const TargetLibraryInfo &TLI) {
19608 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19609 constexpr unsigned RegMaxNumber = 4;
19610 constexpr unsigned RedValsMaxNumber = 128;
19611 // If there are a sufficient number of reduction values, reduce
19612 // to a nearby power-of-2. We can safely generate oversized
19613 // vectors and rely on the backend to split them to legal sizes.
19614 if (unsigned NumReducedVals = std::accumulate(
19615 ReducedVals.begin(), ReducedVals.end(), 0,
19616 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19617 if (!isGoodForReduction(Vals))
19618 return Num;
19619 return Num + Vals.size();
19620 });
19621 NumReducedVals < ReductionLimit &&
19622 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19623 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19624 })) {
19625 for (ReductionOpsType &RdxOps : ReductionOps)
19626 for (Value *RdxOp : RdxOps)
19627 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19628 return nullptr;
19629 }
19630
19631 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19632 TargetFolder(DL));
19633 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19634
 19635 // Track the reduced values in case they are replaced by extractelement
 19636 // instructions because of the vectorization.
19637 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19638 ReducedVals.front().size());
19639
19640 // The compare instruction of a min/max is the insertion point for new
19641 // instructions and may be replaced with a new compare instruction.
19642 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19643 assert(isa<SelectInst>(RdxRootInst) &&
19644 "Expected min/max reduction to have select root instruction");
19645 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19646 assert(isa<Instruction>(ScalarCond) &&
19647 "Expected min/max reduction to have compare condition");
19648 return cast<Instruction>(ScalarCond);
19649 };
19650
19651 // Return new VectorizedTree, based on previous value.
19652 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19653 if (VectorizedTree) {
19654 // Update the final value in the reduction.
 19655 Builder.SetCurrentDebugLocation(
 19656 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19657 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
 19658 (isGuaranteedNotToBePoison(Res) &&
 19659 !isGuaranteedNotToBePoison(VectorizedTree))) {
19660 auto It = ReducedValsToOps.find(Res);
19661 if (It != ReducedValsToOps.end() &&
19662 any_of(It->getSecond(),
19663 [](Instruction *I) { return isBoolLogicOp(I); }))
19664 std::swap(VectorizedTree, Res);
19665 }
19666
19667 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19668 ReductionOps);
19669 }
19670 // Initialize the final value in the reduction.
19671 return Res;
19672 };
19673 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19674 return isBoolLogicOp(cast<Instruction>(V));
19675 });
19676 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19677 ReductionOps.front().size());
19678 for (ReductionOpsType &RdxOps : ReductionOps)
19679 for (Value *RdxOp : RdxOps) {
19680 if (!RdxOp)
19681 continue;
19682 IgnoreList.insert(RdxOp);
19683 }
19684 // Intersect the fast-math-flags from all reduction operations.
19685 FastMathFlags RdxFMF;
19686 RdxFMF.set();
19687 for (Value *U : IgnoreList)
19688 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19689 RdxFMF &= FPMO->getFastMathFlags();
19690 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19691
19692 // Need to track reduced vals, they may be changed during vectorization of
19693 // subvectors.
19694 for (ArrayRef<Value *> Candidates : ReducedVals)
19695 for (Value *V : Candidates)
19696 TrackedVals.try_emplace(V, V);
19697
 19698 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
 19699 Value *V) -> unsigned & {
19700 auto *It = MV.find(V);
19701 assert(It != MV.end() && "Unable to find given key.");
19702 return It->second;
19703 };
19704
19705 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19706 // List of the values that were reduced in other trees as part of gather
19707 // nodes and thus requiring extract if fully vectorized in other trees.
19708 SmallPtrSet<Value *, 4> RequiredExtract;
19709 WeakTrackingVH VectorizedTree = nullptr;
19710 bool CheckForReusedReductionOps = false;
19711 // Try to vectorize elements based on their type.
 19712 SmallVector<InstructionsState> States;
 19713 for (ArrayRef<Value *> RV : ReducedVals)
19714 States.push_back(getSameOpcode(RV, TLI));
19715 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19716 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19717 InstructionsState S = States[I];
19718 SmallVector<Value *> Candidates;
19719 Candidates.reserve(2 * OrigReducedVals.size());
19720 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19721 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19722 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
 19723 // Check if the reduction value was not overridden by the extractelement
19724 // instruction because of the vectorization and exclude it, if it is not
19725 // compatible with other values.
19726 // Also check if the instruction was folded to constant/other value.
19727 auto *Inst = dyn_cast<Instruction>(RdxVal);
19728 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19729 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
19730 (S.getOpcode() && !Inst))
19731 continue;
19732 Candidates.push_back(RdxVal);
19733 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19734 }
19735 bool ShuffledExtracts = false;
19736 // Try to handle shuffled extractelements.
19737 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
19738 I + 1 < E) {
19739 SmallVector<Value *> CommonCandidates(Candidates);
19740 for (Value *RV : ReducedVals[I + 1]) {
19741 Value *RdxVal = TrackedVals.at(RV);
 19742 // Check if the reduction value was not overridden by the
19743 // extractelement instruction because of the vectorization and
19744 // exclude it, if it is not compatible with other values.
19745 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19746 if (!Inst)
19747 continue;
19748 CommonCandidates.push_back(RdxVal);
19749 TrackedToOrig.try_emplace(RdxVal, RV);
19750 }
 19751 SmallVector<int> Mask;
 19752 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
19753 ++I;
19754 Candidates.swap(CommonCandidates);
19755 ShuffledExtracts = true;
19756 }
19757 }
19758
19759 // Emit code for constant values.
19760 if (Candidates.size() > 1 && allConstant(Candidates)) {
19761 Value *Res = Candidates.front();
19762 Value *OrigV = TrackedToOrig.at(Candidates.front());
19763 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19764 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19765 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19766 Value *OrigV = TrackedToOrig.at(VC);
19767 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19768 if (auto *ResI = dyn_cast<Instruction>(Res))
19769 V.analyzedReductionRoot(ResI);
19770 }
19771 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19772 continue;
19773 }
19774
19775 unsigned NumReducedVals = Candidates.size();
19776 if (NumReducedVals < ReductionLimit &&
19777 (NumReducedVals < 2 || !isSplat(Candidates)))
19778 continue;
19779
19780 // Check if we support repeated scalar values processing (optimization of
19781 // original scalar identity operations on matched horizontal reductions).
19782 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
19783 RdxKind != RecurKind::FMul &&
19784 RdxKind != RecurKind::FMulAdd;
19785 // Gather same values.
19786 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
19787 if (IsSupportedHorRdxIdentityOp)
19788 for (Value *V : Candidates) {
19789 Value *OrigV = TrackedToOrig.at(V);
19790 ++SameValuesCounter.try_emplace(OrigV).first->second;
19791 }
 19792 // Used to check if the reduced values are used the same number of times.
 19793 // In this case the compiler may produce better code. E.g. if the reduced
 19794 // values are aabbccdd (8 x values), then the first node of the tree will
 19795 // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
 19796 // Plus, the final reduction will be performed on <8 x aabbccdd>.
 19797 // Instead, the compiler may build the <4 x abcd> tree immediately and
 19798 // multiply the reduction of (4 x abcd) by 2.
19799 // Currently it only handles add/fadd/xor. and/or/min/max do not require
19800 // this analysis, other operations may require an extra estimation of
19801 // the profitability.
19802 bool SameScaleFactor = false;
19803 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
19804 SameValuesCounter.size() != Candidates.size();
19805 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
19806 if (OptReusedScalars) {
19807 SameScaleFactor =
19808 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
19809 RdxKind == RecurKind::Xor) &&
19810 all_of(drop_begin(SameValuesCounter),
19811 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
19812 return P.second == SameValuesCounter.front().second;
19813 });
19814 Candidates.resize(SameValuesCounter.size());
19815 transform(SameValuesCounter, Candidates.begin(),
19816 [&](const auto &P) { return TrackedVals.at(P.first); });
19817 NumReducedVals = Candidates.size();
19818 // Have a reduction of the same element.
19819 if (NumReducedVals == 1) {
19820 Value *OrigV = TrackedToOrig.at(Candidates.front());
19821 unsigned Cnt = At(SameValuesCounter, OrigV);
19822 Value *RedVal =
19823 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
19824 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
19825 VectorizedVals.try_emplace(OrigV, Cnt);
19826 ExternallyUsedValues.insert(OrigV);
19827 continue;
19828 }
19829 }
19830
19831 unsigned MaxVecRegSize = V.getMaxVecRegSize();
19832 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
19833 const unsigned MaxElts = std::clamp<unsigned>(
19834 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
19835 RegMaxNumber * RedValsMaxNumber);
19836
19837 unsigned ReduxWidth = NumReducedVals;
19838 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
19839 unsigned NumParts, NumRegs;
19840 Type *ScalarTy = Candidates.front()->getType();
19841 ReduxWidth =
19842 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
19843 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
19844 NumParts = TTI.getNumberOfParts(Tp);
 19845 NumRegs =
 19846 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
 19847 while (NumParts > NumRegs) {
19848 ReduxWidth = bit_floor(ReduxWidth - 1);
19849 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
19850 NumParts = TTI.getNumberOfParts(Tp);
 19851 NumRegs =
 19852 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
 19853 }
19854 if (NumParts > NumRegs / 2)
19855 ReduxWidth = bit_floor(ReduxWidth);
19856 return ReduxWidth;
19857 };
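// Roughly: start from the number of reduced values, round it to a full
// vector element count for the target, and repeatedly round it down to a
// smaller power of two while the widened type would split into more register
// parts than the target provides for its vector register class.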
19858 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
19859 ReduxWidth = GetVectorFactor(ReduxWidth);
19860 ReduxWidth = std::min(ReduxWidth, MaxElts);
19861
19862 unsigned Start = 0;
19863 unsigned Pos = Start;
19864 // Restarts vectorization attempt with lower vector factor.
19865 unsigned PrevReduxWidth = ReduxWidth;
19866 bool CheckForReusedReductionOpsLocal = false;
19867 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
19868 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
19869 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
 19870 // Check if any of the reduction ops are gathered. If so, it is worth
 19871 // trying again with a smaller number of reduction ops.
19872 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
19873 }
19874 ++Pos;
19875 if (Pos < NumReducedVals - ReduxWidth + 1)
19876 return IsAnyRedOpGathered;
19877 Pos = Start;
19878 --ReduxWidth;
19879 if (ReduxWidth > 1)
19880 ReduxWidth = GetVectorFactor(ReduxWidth);
19881 return IsAnyRedOpGathered;
19882 };
19883 bool AnyVectorized = false;
19884 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
19885 while (Pos < NumReducedVals - ReduxWidth + 1 &&
19886 ReduxWidth >= ReductionLimit) {
19887 // Dependency in tree of the reduction ops - drop this attempt, try
19888 // later.
19889 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
19890 Start == 0) {
19891 CheckForReusedReductionOps = true;
19892 break;
19893 }
19894 PrevReduxWidth = ReduxWidth;
19895 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
19896 // Been analyzed already - skip.
19897 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
19898 (!has_single_bit(ReduxWidth) &&
19899 (IgnoredCandidates.contains(
19900 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
19901 IgnoredCandidates.contains(
19902 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
19903 bit_floor(ReduxWidth))))) ||
19904 V.areAnalyzedReductionVals(VL)) {
19905 (void)AdjustReducedVals(/*IgnoreVL=*/true);
19906 continue;
19907 }
19908 // Early exit if any of the reduction values were deleted during
19909 // previous vectorization attempts.
19910 if (any_of(VL, [&V](Value *RedVal) {
19911 auto *RedValI = dyn_cast<Instruction>(RedVal);
19912 if (!RedValI)
19913 return false;
19914 return V.isDeleted(RedValI);
19915 }))
19916 break;
19917 V.buildTree(VL, IgnoreList);
19918 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
19919 if (!AdjustReducedVals())
19920 V.analyzedReductionVals(VL);
19921 continue;
19922 }
19923 if (V.isLoadCombineReductionCandidate(RdxKind)) {
19924 if (!AdjustReducedVals())
19925 V.analyzedReductionVals(VL);
19926 continue;
19927 }
19928 V.reorderTopToBottom();
19929 // No need to reorder the root node at all.
19930 V.reorderBottomToTop(/*IgnoreReorder=*/true);
19931 // Keep extracted other reduction values, if they are used in the
19932 // vectorization trees.
19933 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
19934 ExternallyUsedValues);
19935 // The reduction root is used as the insertion point for new
19936 // instructions, so set it as externally used to prevent it from being
19937 // deleted.
19938 LocalExternallyUsedValues.insert(ReductionRoot);
19939 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
19940 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
19941 continue;
19942 for (Value *V : ReducedVals[Cnt])
19943 if (isa<Instruction>(V))
19944 LocalExternallyUsedValues.insert(TrackedVals[V]);
19945 }
19946 if (!IsSupportedHorRdxIdentityOp) {
19947 // Number of uses of the candidates in the vector of values.
19948 assert(SameValuesCounter.empty() &&
19949 "Reused values counter map is not empty");
19950 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
19951 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
19952 continue;
19953 Value *V = Candidates[Cnt];
19954 Value *OrigV = TrackedToOrig.at(V);
19955 ++SameValuesCounter.try_emplace(OrigV).first->second;
19956 }
19957 }
19958 V.transformNodes();
19959 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
19960 // Gather externally used values.
 19961 SmallPtrSet<Value *, 4> Visited;
 19962 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
19963 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
19964 continue;
19965 Value *RdxVal = Candidates[Cnt];
19966 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
19967 RdxVal = It->second;
19968 if (!Visited.insert(RdxVal).second)
19969 continue;
19970 // Check if the scalar was vectorized as part of the vectorization
19971 // tree but not the top node.
19972 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
19973 LocalExternallyUsedValues.insert(RdxVal);
19974 continue;
19975 }
19976 Value *OrigV = TrackedToOrig.at(RdxVal);
19977 unsigned NumOps =
19978 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
19979 if (NumOps != ReducedValsToOps.at(OrigV).size())
19980 LocalExternallyUsedValues.insert(RdxVal);
19981 }
19982 // Do not need the list of reused scalars in regular mode anymore.
19983 if (!IsSupportedHorRdxIdentityOp)
19984 SameValuesCounter.clear();
19985 for (Value *RdxVal : VL)
19986 if (RequiredExtract.contains(RdxVal))
19987 LocalExternallyUsedValues.insert(RdxVal);
19988 V.buildExternalUses(LocalExternallyUsedValues);
19989
19990 V.computeMinimumValueSizes();
19991
19992 // Estimate cost.
19993 InstructionCost TreeCost = V.getTreeCost(VL);
19994 InstructionCost ReductionCost =
19995 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
19996 InstructionCost Cost = TreeCost + ReductionCost;
19997 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19998 << " for reduction\n");
19999 if (!Cost.isValid())
20000 break;
20001 if (Cost >= -SLPCostThreshold) {
20002 V.getORE()->emit([&]() {
20003 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20004 ReducedValsToOps.at(VL[0]).front())
20005 << "Vectorizing horizontal reduction is possible "
20006 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20007 << " and threshold "
20008 << ore::NV("Threshold", -SLPCostThreshold);
20009 });
20010 if (!AdjustReducedVals()) {
20011 V.analyzedReductionVals(VL);
20012 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20013 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20014 // Add subvectors of VL to the list of the analyzed values.
20015 for (unsigned VF = getFloorFullVectorNumberOfElements(
20016 *TTI, VL.front()->getType(), ReduxWidth - 1);
20017 VF >= ReductionLimit;
 20018 VF = getFloorFullVectorNumberOfElements(
 20019 *TTI, VL.front()->getType(), VF - 1)) {
20020 if (has_single_bit(VF) &&
20021 V.getCanonicalGraphSize() != V.getTreeSize())
20022 continue;
20023 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20024 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20025 }
20026 }
20027 }
20028 continue;
20029 }
20030
20031 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20032 << Cost << ". (HorRdx)\n");
20033 V.getORE()->emit([&]() {
20034 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20035 ReducedValsToOps.at(VL[0]).front())
20036 << "Vectorized horizontal reduction with cost "
20037 << ore::NV("Cost", Cost) << " and with tree size "
20038 << ore::NV("TreeSize", V.getTreeSize());
20039 });
20040
20041 Builder.setFastMathFlags(RdxFMF);
20042
20043 // Emit a reduction. If the root is a select (min/max idiom), the insert
20044 // point is the compare condition of that select.
20045 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20046 Instruction *InsertPt = RdxRootInst;
20047 if (IsCmpSelMinMax)
20048 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20049
20050 // Vectorize a tree.
20051 Value *VectorizedRoot =
20052 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20053 // Update TrackedToOrig mapping, since the tracked values might be
20054 // updated.
20055 for (Value *RdxVal : Candidates) {
20056 Value *OrigVal = TrackedToOrig.at(RdxVal);
20057 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20058 if (TransformedRdxVal != RdxVal)
20059 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20060 }
20061
20062 Builder.SetInsertPoint(InsertPt);
20063
20064 // To prevent poison from leaking across what used to be sequential,
20065 // safe, scalar boolean logic operations, the reduction operand must be
20066 // frozen.
20067 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot))
20068 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20069
20070 // Emit code to correctly handle reused reduced values, if required.
20071 if (OptReusedScalars && !SameScaleFactor) {
20072 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20073 SameValuesCounter, TrackedToOrig);
20074 }
20075
20076 Value *ReducedSubTree;
20077 Type *ScalarTy = VL.front()->getType();
20078 if (isa<FixedVectorType>(ScalarTy)) {
20079 assert(SLPReVec && "FixedVectorType is not expected.");
20080 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20081 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20082 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20083 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20084 // Do reduction for each lane.
20085 // e.g., do reduce add for
20086 // VL[0] = <4 x Ty> <a, b, c, d>
20087 // VL[1] = <4 x Ty> <e, f, g, h>
20088 // Lane[0] = <2 x Ty> <a, e>
20089 // Lane[1] = <2 x Ty> <b, f>
20090 // Lane[2] = <2 x Ty> <c, g>
20091 // Lane[3] = <2 x Ty> <d, h>
20092 // result[0] = reduce add Lane[0]
20093 // result[1] = reduce add Lane[1]
20094 // result[2] = reduce add Lane[2]
20095 // result[3] = reduce add Lane[3]
 20096 SmallVector<int, 16> Mask =
 20097 createStrideMask(I, ScalarTyNumElements, VL.size());
20098 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20099 ReducedSubTree = Builder.CreateInsertElement(
20100 ReducedSubTree,
20101 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20102 }
20103 } else {
20104 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20105 RdxRootInst->getType());
20106 }
20107 if (ReducedSubTree->getType() != VL.front()->getType()) {
20108 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20109 "Expected different reduction type.");
20110 ReducedSubTree =
20111 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20112 V.isSignedMinBitwidthRootNode());
20113 }
20114
20115 // Improved analysis for add/fadd/xor reductions with same scale factor
20116 // for all operands of reductions. We can emit scalar ops for them
20117 // instead.
20118 if (OptReusedScalars && SameScaleFactor)
20119 ReducedSubTree = emitScaleForReusedOps(
20120 ReducedSubTree, Builder, SameValuesCounter.front().second);
20121
20122 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20123 // Count vectorized reduced values to exclude them from final reduction.
20124 for (Value *RdxVal : VL) {
20125 Value *OrigV = TrackedToOrig.at(RdxVal);
20126 if (IsSupportedHorRdxIdentityOp) {
20127 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20128 continue;
20129 }
20130 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20131 if (!V.isVectorized(RdxVal))
20132 RequiredExtract.insert(RdxVal);
20133 }
20134 Pos += ReduxWidth;
20135 Start = Pos;
20136 ReduxWidth = NumReducedVals - Pos;
20137 if (ReduxWidth > 1)
20138 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20139 AnyVectorized = true;
20140 }
20141 if (OptReusedScalars && !AnyVectorized) {
20142 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20143 Value *RdxVal = TrackedVals.at(P.first);
20144 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20145 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20146 VectorizedVals.try_emplace(P.first, P.second);
20147 }
20148 continue;
20149 }
20150 }
20151 if (VectorizedTree) {
20152 // Reorder operands of bool logical op in the natural order to avoid
20153 // possible problem with poison propagation. If not possible to reorder
20154 // (both operands are originally RHS), emit an extra freeze instruction
20155 // for the LHS operand.
20156 // I.e., if we have original code like this:
20157 // RedOp1 = select i1 ?, i1 LHS, i1 false
20158 // RedOp2 = select i1 RHS, i1 ?, i1 false
20159
20160 // Then, we swap LHS/RHS to create a new op that matches the poison
20161 // semantics of the original code.
20162
20163 // If we have original code like this and both values could be poison:
20164 // RedOp1 = select i1 ?, i1 LHS, i1 false
20165 // RedOp2 = select i1 ?, i1 RHS, i1 false
20166
20167 // Then, we must freeze LHS in the new op.
20168 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20169 Instruction *RedOp1,
20170 Instruction *RedOp2,
20171 bool InitStep) {
20172 if (!AnyBoolLogicOp)
20173 return;
20174 if (isBoolLogicOp(RedOp1) &&
20175 ((!InitStep && LHS == VectorizedTree) ||
20176 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
20177 return;
20178 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20179 getRdxOperand(RedOp2, 0) == RHS ||
 20180 isGuaranteedNotToBePoison(RHS))) {
 20181 std::swap(LHS, RHS);
20182 return;
20183 }
20184 if (LHS != VectorizedTree)
20185 LHS = Builder.CreateFreeze(LHS);
20186 };
20187 // Finish the reduction.
20188 // Need to add extra arguments and not vectorized possible reduction
20189 // values.
20190 // Try to avoid dependencies between the scalar remainders after
20191 // reductions.
20192 auto FinalGen =
 20193 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
 20194 bool InitStep) {
 20195 unsigned Sz = InstVals.size();
 20196 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
 20197 Sz % 2);
20198 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20199 Instruction *RedOp = InstVals[I + 1].first;
20200 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20201 Value *RdxVal1 = InstVals[I].second;
20202 Value *StableRdxVal1 = RdxVal1;
20203 auto It1 = TrackedVals.find(RdxVal1);
20204 if (It1 != TrackedVals.end())
20205 StableRdxVal1 = It1->second;
20206 Value *RdxVal2 = InstVals[I + 1].second;
20207 Value *StableRdxVal2 = RdxVal2;
20208 auto It2 = TrackedVals.find(RdxVal2);
20209 if (It2 != TrackedVals.end())
20210 StableRdxVal2 = It2->second;
20211 // To prevent poison from leaking across what used to be
20212 // sequential, safe, scalar boolean logic operations, the
20213 // reduction operand must be frozen.
20214 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20215 RedOp, InitStep);
20216 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20217 StableRdxVal2, "op.rdx", ReductionOps);
20218 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20219 }
20220 if (Sz % 2 == 1)
20221 ExtraReds[Sz / 2] = InstVals.back();
20222 return ExtraReds;
20223 };
 20224 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
 20225 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20226 VectorizedTree);
 20227 SmallPtrSet<Value *, 8> Visited;
 20228 for (ArrayRef<Value *> Candidates : ReducedVals) {
20229 for (Value *RdxVal : Candidates) {
20230 if (!Visited.insert(RdxVal).second)
20231 continue;
20232 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20233 for (Instruction *RedOp :
20234 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20235 ExtraReductions.emplace_back(RedOp, RdxVal);
20236 }
20237 }
20238 // Iterate through all not-vectorized reduction values/extra arguments.
20239 bool InitStep = true;
20240 while (ExtraReductions.size() > 1) {
 20241 SmallVector<std::pair<Instruction *, Value *>> NewReds =
 20242 FinalGen(ExtraReductions, InitStep);
20243 ExtraReductions.swap(NewReds);
20244 InitStep = false;
20245 }
20246 VectorizedTree = ExtraReductions.front().second;
20247
20248 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20249
20250 // The original scalar reduction is expected to have no remaining
20251 // uses outside the reduction tree itself. Assert that we got this
20252 // correct, replace internal uses with undef, and mark for eventual
20253 // deletion.
20254#ifndef NDEBUG
20255 SmallSet<Value *, 4> IgnoreSet;
20256 for (ArrayRef<Value *> RdxOps : ReductionOps)
20257 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20258#endif
20259 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20260 for (Value *Ignore : RdxOps) {
20261 if (!Ignore)
20262 continue;
20263#ifndef NDEBUG
20264 for (auto *U : Ignore->users()) {
20265 assert(IgnoreSet.count(U) &&
20266 "All users must be either in the reduction ops list.");
20267 }
20268#endif
20269 if (!Ignore->use_empty()) {
20270 Value *P = PoisonValue::get(Ignore->getType());
20271 Ignore->replaceAllUsesWith(P);
20272 }
20273 }
20274 V.removeInstructionsAndOperands(RdxOps);
20275 }
20276 } else if (!CheckForReusedReductionOps) {
20277 for (ReductionOpsType &RdxOps : ReductionOps)
20278 for (Value *RdxOp : RdxOps)
20279 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20280 }
20281 return VectorizedTree;
20282 }
20283
20284private:
20285 /// Calculate the cost of a reduction.
20286 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20287 ArrayRef<Value *> ReducedVals,
20288 bool IsCmpSelMinMax, FastMathFlags FMF,
20289 const BoUpSLP &R) {
 20290 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 20291 Type *ScalarTy = ReducedVals.front()->getType();
20292 unsigned ReduxWidth = ReducedVals.size();
20293 FixedVectorType *VectorTy = R.getReductionType();
20294 InstructionCost VectorCost = 0, ScalarCost;
20295 // If all of the reduced values are constant, the vector cost is 0, since
20296 // the reduction value can be calculated at the compile time.
20297 bool AllConsts = allConstant(ReducedVals);
20298 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
 20299 InstructionCost Cost = 0;
 20300 // Scalar cost is repeated for N-1 elements.
20301 int Cnt = ReducedVals.size();
20302 for (Value *RdxVal : ReducedVals) {
20303 if (Cnt == 1)
20304 break;
20305 --Cnt;
20306 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20307 Cost += GenCostFn();
20308 continue;
20309 }
20310 InstructionCost ScalarCost = 0;
20311 for (User *U : RdxVal->users()) {
20312 auto *RdxOp = cast<Instruction>(U);
20313 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20314 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20315 continue;
20316 }
20317 ScalarCost = InstructionCost::getInvalid();
20318 break;
20319 }
20320 if (ScalarCost.isValid())
20321 Cost += ScalarCost;
20322 else
20323 Cost += GenCostFn();
20324 }
20325 return Cost;
20326 };
20327 switch (RdxKind) {
20328 case RecurKind::Add:
20329 case RecurKind::Mul:
20330 case RecurKind::Or:
20331 case RecurKind::And:
20332 case RecurKind::Xor:
20333 case RecurKind::FAdd:
20334 case RecurKind::FMul: {
20335 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20336 if (!AllConsts) {
20337 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20338 assert(SLPReVec && "FixedVectorType is not expected.");
20339 unsigned ScalarTyNumElements = VecTy->getNumElements();
20340 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20341 VectorCost += TTI->getShuffleCost(
20342 TTI::SK_PermuteSingleSrc, VectorTy,
20343 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20344 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20345 CostKind);
20346 }
20347 VectorCost += TTI->getScalarizationOverhead(
20348 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20349 /*Extract*/ false, TTI::TCK_RecipThroughput);
20350 } else {
20351 Type *RedTy = VectorTy->getElementType();
20352 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20353 std::make_pair(RedTy, true));
20354 if (RType == RedTy) {
20355 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20356 FMF, CostKind);
20357 } else {
20358 VectorCost = TTI->getExtendedReductionCost(
20359 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20360 FMF, CostKind);
20361 }
20362 }
20363 }
20364 ScalarCost = EvaluateScalarCost([&]() {
20365 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20366 });
20367 break;
20368 }
20369 case RecurKind::FMax:
20370 case RecurKind::FMin:
20371 case RecurKind::FMaximum:
20372 case RecurKind::FMinimum:
20373 case RecurKind::SMax:
20374 case RecurKind::SMin:
20375 case RecurKind::UMax:
20376 case RecurKind::UMin: {
 20377 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
 20378 if (!AllConsts)
20379 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20380 ScalarCost = EvaluateScalarCost([&]() {
20381 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20382 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20383 });
20384 break;
20385 }
20386 default:
20387 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20388 }
20389
20390 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20391 << " for reduction of " << shortBundleName(ReducedVals)
20392 << " (It is a splitting reduction)\n");
20393 return VectorCost - ScalarCost;
20394 }
20395
20396 /// Emit a horizontal reduction of the vectorized value.
20397 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20398 const TargetTransformInfo *TTI, Type *DestTy) {
20399 assert(VectorizedValue && "Need to have a vectorized tree node");
20400 assert(RdxKind != RecurKind::FMulAdd &&
20401 "A call to the llvm.fmuladd intrinsic is not handled yet");
20402
20403 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20404 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20405 RdxKind == RecurKind::Add &&
20406 DestTy->getScalarType() != FTy->getScalarType()) {
 20407 // Convert vector_reduce_add(ZExt(<n x i1>)) to
 20408 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
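// Illustrative IR for an <8 x i1> input:
//   %int = bitcast <8 x i1> %mask to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
//   %res = zext i8 %cnt to i32   ; or trunc, depending on the destination type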
20409 Value *V = Builder.CreateBitCast(
20410 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20411 ++NumVectorInstructions;
20412 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20413 }
20414 ++NumVectorInstructions;
20415 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20416 }
20417
20418 /// Emits optimized code for unique scalar value reused \p Cnt times.
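 /// E.g., if a value is reduced Cnt = 3 times with an integer add, the scaled
 /// result is emitted as a single "mul %v, 3" instead of repeated adds; for
 /// xor the result is either the value itself or zero, depending on the
 /// parity of Cnt.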
20419 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20420 unsigned Cnt) {
20421 assert(IsSupportedHorRdxIdentityOp &&
20422 "The optimization of matched scalar identity horizontal reductions "
20423 "must be supported.");
20424 if (Cnt == 1)
20425 return VectorizedValue;
20426 switch (RdxKind) {
20427 case RecurKind::Add: {
20428 // res = mul vv, n
20429 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20430 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20431 << VectorizedValue << ". (HorRdx)\n");
20432 return Builder.CreateMul(VectorizedValue, Scale);
20433 }
20434 case RecurKind::Xor: {
20435 // res = n % 2 ? 0 : vv
20436 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20437 << ". (HorRdx)\n");
20438 if (Cnt % 2 == 0)
20439 return Constant::getNullValue(VectorizedValue->getType());
20440 return VectorizedValue;
20441 }
20442 case RecurKind::FAdd: {
20443 // res = fmul v, n
20444 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20445 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20446 << VectorizedValue << ". (HorRdx)\n");
20447 return Builder.CreateFMul(VectorizedValue, Scale);
20448 }
20449 case RecurKind::And:
20450 case RecurKind::Or:
20451 case RecurKind::SMax:
20452 case RecurKind::SMin:
20453 case RecurKind::UMax:
20454 case RecurKind::UMin:
20455 case RecurKind::FMax:
20456 case RecurKind::FMin:
20457 case RecurKind::FMaximum:
20458 case RecurKind::FMinimum:
20459 // res = vv
20460 return VectorizedValue;
20461 case RecurKind::Mul:
20462 case RecurKind::FMul:
20463 case RecurKind::FMulAdd:
20464 case RecurKind::IAnyOf:
20465 case RecurKind::FAnyOf:
20466 case RecurKind::IFindLastIV:
20467 case RecurKind::FFindLastIV:
20468 case RecurKind::None:
20469 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20470 }
20471 return nullptr;
20472 }
20473
20474 /// Emits actual operation for the scalar identity values, found during
20475 /// horizontal reduction analysis.
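 /// E.g., for an add reduction whose root-node scalars are {a, b, c} used
 /// {2, 3, 1} times respectively, the vectorized root is multiplied
 /// element-wise by the constant vector <2, 3, 1>.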
20476 Value *
20477 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20478 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20479 const DenseMap<Value *, Value *> &TrackedToOrig) {
20480 assert(IsSupportedHorRdxIdentityOp &&
20481 "The optimization of matched scalar identity horizontal reductions "
20482 "must be supported.");
20483 ArrayRef<Value *> VL = R.getRootNodeScalars();
20484 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20485 if (VTy->getElementType() != VL.front()->getType()) {
20486 VectorizedValue = Builder.CreateIntCast(
20487 VectorizedValue,
20488 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20489 R.isSignedMinBitwidthRootNode());
20490 }
20491 switch (RdxKind) {
20492 case RecurKind::Add: {
20493 // root = mul prev_root, <1, 1, n, 1>
 20494 SmallVector<Constant *> Vals;
 20495 for (Value *V : VL) {
20496 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20497 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20498 }
20499 auto *Scale = ConstantVector::get(Vals);
20500 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20501 << VectorizedValue << ". (HorRdx)\n");
20502 return Builder.CreateMul(VectorizedValue, Scale);
20503 }
20504 case RecurKind::And:
20505 case RecurKind::Or:
20506 // No need for multiple or/and(s).
20507 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20508 << ". (HorRdx)\n");
20509 return VectorizedValue;
20510 case RecurKind::SMax:
20511 case RecurKind::SMin:
20512 case RecurKind::UMax:
20513 case RecurKind::UMin:
20514 case RecurKind::FMax:
20515 case RecurKind::FMin:
20516 case RecurKind::FMaximum:
20517 case RecurKind::FMinimum:
20518 // No need for multiple min/max(s) of the same value.
20519 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20520 << ". (HorRdx)\n");
20521 return VectorizedValue;
20522 case RecurKind::Xor: {
20523 // Replace values with even number of repeats with 0, since
20524 // x xor x = 0.
 20525 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
 20526 // 7>, if the 4th and 6th elements have an even number of repeats.
 20527 SmallVector<int> Mask(
 20528 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
 20529 PoisonMaskElem);
 20530 std::iota(Mask.begin(), Mask.end(), 0);
20531 bool NeedShuffle = false;
20532 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20533 Value *V = VL[I];
20534 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20535 if (Cnt % 2 == 0) {
20536 Mask[I] = VF;
20537 NeedShuffle = true;
20538 }
20539 }
20540 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20541 : Mask) dbgs()
20542 << I << " ";
20543 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20544 if (NeedShuffle)
20545 VectorizedValue = Builder.CreateShuffleVector(
20546 VectorizedValue,
20547 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20548 return VectorizedValue;
20549 }
20550 case RecurKind::FAdd: {
20551 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
 20552 SmallVector<Constant *> Vals;
 20553 for (Value *V : VL) {
20554 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20555 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20556 }
20557 auto *Scale = ConstantVector::get(Vals);
20558 return Builder.CreateFMul(VectorizedValue, Scale);
20559 }
20560 case RecurKind::Mul:
20561 case RecurKind::FMul:
20562 case RecurKind::FMulAdd:
20563 case RecurKind::IAnyOf:
20564 case RecurKind::FAnyOf:
20565 case RecurKind::IFindLastIV:
20566 case RecurKind::FFindLastIV:
20567 case RecurKind::None:
20568 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20569 }
20570 return nullptr;
20571 }
20572};
20573} // end anonymous namespace
20574
20575/// Gets recurrence kind from the specified value.
 20576 static RecurKind getRdxKind(Value *V) {
 20577 return HorizontalReduction::getRdxKind(V);
20578}
20579static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20580 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20581 return cast<FixedVectorType>(IE->getType())->getNumElements();
20582
20583 unsigned AggregateSize = 1;
20584 auto *IV = cast<InsertValueInst>(InsertInst);
20585 Type *CurrentType = IV->getType();
20586 do {
20587 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20588 for (auto *Elt : ST->elements())
20589 if (Elt != ST->getElementType(0)) // check homogeneity
20590 return std::nullopt;
20591 AggregateSize *= ST->getNumElements();
20592 CurrentType = ST->getElementType(0);
20593 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20594 AggregateSize *= AT->getNumElements();
20595 CurrentType = AT->getElementType();
20596 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20597 AggregateSize *= VT->getNumElements();
20598 return AggregateSize;
20599 } else if (CurrentType->isSingleValueType()) {
20600 return AggregateSize;
20601 } else {
20602 return std::nullopt;
20603 }
20604 } while (true);
20605}
20606
20607static void findBuildAggregate_rec(Instruction *LastInsertInst,
 20608 TargetTransformInfo *TTI,
 20609 SmallVectorImpl<Value *> &BuildVectorOpds,
20610 SmallVectorImpl<Value *> &InsertElts,
20611 unsigned OperandOffset, const BoUpSLP &R) {
20612 do {
20613 Value *InsertedOperand = LastInsertInst->getOperand(1);
20614 std::optional<unsigned> OperandIndex =
20615 getElementIndex(LastInsertInst, OperandOffset);
20616 if (!OperandIndex || R.isDeleted(LastInsertInst))
20617 return;
20618 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20619 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20620 BuildVectorOpds, InsertElts, *OperandIndex, R);
20621
20622 } else {
20623 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20624 InsertElts[*OperandIndex] = LastInsertInst;
20625 }
20626 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20627 } while (LastInsertInst != nullptr &&
20628 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20629 LastInsertInst->hasOneUse());
20630}
20631
20632/// Recognize construction of vectors like
20633/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20634/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20635/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20636/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20637/// starting from the last insertelement or insertvalue instruction.
20638///
20639/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20640/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20641/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20642///
20643/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20644///
20645/// \return true if it matches.
20646static bool findBuildAggregate(Instruction *LastInsertInst,
 20647 TargetTransformInfo *TTI,
 20648 SmallVectorImpl<Value *> &BuildVectorOpds,
20649 SmallVectorImpl<Value *> &InsertElts,
20650 const BoUpSLP &R) {
20651
20652 assert((isa<InsertElementInst>(LastInsertInst) ||
20653 isa<InsertValueInst>(LastInsertInst)) &&
20654 "Expected insertelement or insertvalue instruction!");
20655
20656 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20657 "Expected empty result vectors!");
20658
20659 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20660 if (!AggregateSize)
20661 return false;
20662 BuildVectorOpds.resize(*AggregateSize);
20663 InsertElts.resize(*AggregateSize);
20664
20665 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20666 R);
20667 llvm::erase(BuildVectorOpds, nullptr);
20668 llvm::erase(InsertElts, nullptr);
20669 if (BuildVectorOpds.size() >= 2)
20670 return true;
20671
20672 return false;
20673}
20674
20675/// Try and get a reduction instruction from a phi node.
20676///
20677/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20678/// if they come from either \p ParentBB or a containing loop latch.
20679///
20680/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20681/// if not possible.
 20682 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
 20683 BasicBlock *ParentBB, LoopInfo *LI) {
20684 // There are situations where the reduction value is not dominated by the
20685 // reduction phi. Vectorizing such cases has been reported to cause
20686 // miscompiles. See PR25787.
20687 auto DominatedReduxValue = [&](Value *R) {
20688 return isa<Instruction>(R) &&
20689 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20690 };
20691
20692 Instruction *Rdx = nullptr;
20693
20694 // Return the incoming value if it comes from the same BB as the phi node.
20695 if (P->getIncomingBlock(0) == ParentBB) {
20696 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20697 } else if (P->getIncomingBlock(1) == ParentBB) {
20698 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20699 }
20700
20701 if (Rdx && DominatedReduxValue(Rdx))
20702 return Rdx;
20703
20704 // Otherwise, check whether we have a loop latch to look at.
20705 Loop *BBL = LI->getLoopFor(ParentBB);
20706 if (!BBL)
20707 return nullptr;
20708 BasicBlock *BBLatch = BBL->getLoopLatch();
20709 if (!BBLatch)
20710 return nullptr;
20711
20712 // There is a loop latch, return the incoming value if it comes from
20713 // that. This reduction pattern occasionally turns up.
20714 if (P->getIncomingBlock(0) == BBLatch) {
20715 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20716 } else if (P->getIncomingBlock(1) == BBLatch) {
20717 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20718 }
20719
20720 if (Rdx && DominatedReduxValue(Rdx))
20721 return Rdx;
20722
20723 return nullptr;
20724}
20725
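/// Matches a reduction operation that looks like a binary operator: either a
/// plain BinOp or one of the maxnum/minnum/maximum/minimum/smax/smin/umax/umin
/// intrinsics, extracting its two operands into \p V0 and \p V1.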
20726static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20727 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20728 return true;
20729 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20730 return true;
20731 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20732 return true;
20733 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20734 return true;
20735 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20736 return true;
20737 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20738 return true;
20739 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20740 return true;
20741 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20742 return true;
20743 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20744 return true;
20745 return false;
20746}
20747
20748/// We could have an initial reduction that is not an add.
20749/// r *= v1 + v2 + v3 + v4
20750/// In such a case start looking for a tree rooted in the first '+'.
 20751 /// \returns the new root if found, which may be nullptr if not an instruction.
 20752 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
 20753 Instruction *Root) {
20754 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20755 isa<IntrinsicInst>(Root)) &&
20756 "Expected binop, select, or intrinsic for reduction matching");
20757 Value *LHS =
20758 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20759 Value *RHS =
20760 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20761 if (LHS == Phi)
20762 return dyn_cast<Instruction>(RHS);
20763 if (RHS == Phi)
20764 return dyn_cast<Instruction>(LHS);
20765 return nullptr;
20766}
20767
 20768 /// \returns the first operand of \p I that does not match \p Phi. If the
 20769 /// operand is not an instruction it returns nullptr.
 20770 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
 20771 Value *Op0 = nullptr;
20772 Value *Op1 = nullptr;
20773 if (!matchRdxBop(I, Op0, Op1))
20774 return nullptr;
20775 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
20776}
20777
 20778 /// \returns true if \p I is a candidate instruction for reduction vectorization.
 20779 static bool isReductionCandidate(Instruction *I) {
 20780 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
20781 Value *B0 = nullptr, *B1 = nullptr;
20782 bool IsBinop = matchRdxBop(I, B0, B1);
20783 return IsBinop || IsSelect;
20784}
20785
20786bool SLPVectorizerPass::vectorizeHorReduction(
20787 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
20788 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
20789 if (!ShouldVectorizeHor)
20790 return false;
20791 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
20792
20793 if (Root->getParent() != BB || isa<PHINode>(Root))
20794 return false;
20795
20796 // If we can find a secondary reduction root, use that instead.
20797 auto SelectRoot = [&]() {
20798 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
20799 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
20800 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
20801 return NewRoot;
20802 return Root;
20803 };
20804
20805 // Start analysis starting from Root instruction. If horizontal reduction is
20806 // found, try to vectorize it. If it is not a horizontal reduction or
20807 // vectorization is not possible or not effective, and currently analyzed
20808 // instruction is a binary operation, try to vectorize the operands, using
20809 // pre-order DFS traversal order. If the operands were not vectorized, repeat
20810 // the same procedure considering each operand as a possible root of the
20811 // horizontal reduction.
20812 // Interrupt the process if the Root instruction itself was vectorized or all
 20813 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
 20814 // If a horizontal reduction was not matched or vectorized, we collect
 20815 // instructions for possible later attempts at vectorization.
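// For example, for
//   %a1 = fadd fast float %x0, %x1
//   %a2 = fadd fast float %a1, %x2
//   %a3 = fadd fast float %a2, %x3
// the whole chain rooted at %a3 is matched as a single horizontal fadd
// reduction over {%x0, %x1, %x2, %x3}; if it cannot be vectorized, the
// operands of %a3 are pushed as new candidate roots.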
20816 std::queue<std::pair<Instruction *, unsigned>> Stack;
20817 Stack.emplace(SelectRoot(), 0);
20818 SmallPtrSet<Value *, 8> VisitedInstrs;
20819 bool Res = false;
20820 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
20821 if (R.isAnalyzedReductionRoot(Inst))
20822 return nullptr;
20823 if (!isReductionCandidate(Inst))
20824 return nullptr;
20825 HorizontalReduction HorRdx;
20826 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
20827 return nullptr;
20828 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
20829 };
20830 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
20831 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
20832 FutureSeed = getNonPhiOperand(Root, P);
20833 if (!FutureSeed)
20834 return false;
20835 }
20836 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
20837 // analysis is done separately.
20838 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
20839 PostponedInsts.push_back(FutureSeed);
20840 return true;
20841 };
20842
20843 while (!Stack.empty()) {
20844 Instruction *Inst;
20845 unsigned Level;
20846 std::tie(Inst, Level) = Stack.front();
20847 Stack.pop();
20848 // Do not try to analyze instruction that has already been vectorized.
20849 // This may happen when we vectorize instruction operands on a previous
20850 // iteration while stack was populated before that happened.
20851 if (R.isDeleted(Inst))
20852 continue;
20853 if (Value *VectorizedV = TryToReduce(Inst)) {
20854 Res = true;
20855 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
20856 // Try to find another reduction.
20857 Stack.emplace(I, Level);
20858 continue;
20859 }
20860 if (R.isDeleted(Inst))
20861 continue;
20862 } else {
20863 // We could not vectorize `Inst` so try to use it as a future seed.
20864 if (!TryAppendToPostponedInsts(Inst)) {
20865 assert(Stack.empty() && "Expected empty stack");
20866 break;
20867 }
20868 }
20869
20870 // Try to vectorize operands.
20871 // Continue analysis for the instruction from the same basic block only to
20872 // save compile time.
20873 if (++Level < RecursionMaxDepth)
20874 for (auto *Op : Inst->operand_values())
20875 if (VisitedInstrs.insert(Op).second)
20876 if (auto *I = dyn_cast<Instruction>(Op))
20877 // Do not try to vectorize CmpInst operands, this is done
20878 // separately.
20879 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
20880 !R.isDeleted(I) && I->getParent() == BB)
20881 Stack.emplace(I, Level);
20882 }
20883 return Res;
20884}
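// [Editorial sketch; not part of SLPVectorizer.cpp.] The loop above is a
// depth-capped worklist walk over operand trees: pop a (node, level) pair,
// try to reduce it, and otherwise enqueue its unvisited operands at level + 1.
// A minimal standalone illustration of the same traversal shape, in plain C++
// with hypothetical Node/TryReduce names and no LLVM APIs:
#include <queue>
#include <unordered_set>
#include <utility>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
};

static bool walkWithDepthCap(Node *Root, unsigned MaxDepth,
                             bool (*TryReduce)(Node *)) {
  std::queue<std::pair<Node *, unsigned>> Work;
  std::unordered_set<Node *> Visited;
  Work.emplace(Root, 0u);
  bool Changed = false;
  while (!Work.empty()) {
    auto [N, Level] = Work.front();
    Work.pop();
    Changed |= TryReduce(N); // analogous to TryToReduce(Inst) above
    if (++Level >= MaxDepth) // analogous to RecursionMaxDepth
      continue;
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second) // analyze each operand only once
        Work.emplace(Op, Level);
  }
  return Changed;
}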
20885
20886bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
20887 BasicBlock *BB, BoUpSLP &R) {
20888 SmallVector<WeakTrackingVH> PostponedInsts;
20889 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
20890 Res |= tryToVectorize(PostponedInsts, R);
20891 return Res;
20892}
20893
20894bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
20895 BoUpSLP &R) {
20896 bool Res = false;
20897 for (Value *V : Insts)
20898 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
20899 Res |= tryToVectorize(Inst, R);
20900 return Res;
20901}
20902
20903bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
20904 BasicBlock *BB, BoUpSLP &R,
20905 bool MaxVFOnly) {
20906 if (!R.canMapToVector(IVI->getType()))
20907 return false;
20908
20909 SmallVector<Value *, 16> BuildVectorOpds;
20910 SmallVector<Value *, 16> BuildVectorInsts;
20911 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
20912 return false;
20913
20914 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
20915 R.getORE()->emit([&]() {
20916 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
20917 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
20918 "trying reduction first.";
20919 });
20920 return false;
20921 }
20922 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
20923 // Aggregate value is unlikely to be processed in vector register.
20924 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
20925}
20926
20927bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
20928 BasicBlock *BB, BoUpSLP &R,
20929 bool MaxVFOnly) {
20930 SmallVector<Value *, 16> BuildVectorInsts;
20931 SmallVector<Value *, 16> BuildVectorOpds;
20932 SmallVector<int> Mask;
20933 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
20934 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
20935 isFixedVectorShuffle(BuildVectorOpds, Mask)))
20936 return false;
20937
20938 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
20939 R.getORE()->emit([&]() {
20940 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
20941 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
20942 "trying reduction first.";
20943 });
20944 return false;
20945 }
20946 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
20947 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
20948}
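// [Editorial note; not part of SLPVectorizer.cpp.] As a concrete illustration
// of what the two routines above work on: a buildvector sequence such as
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i32 1
//   %v2 = insertelement <4 x float> %v1, float %s2, i32 2
//   %v3 = insertelement <4 x float> %v2, float %s3, i32 3
// is recognized by findBuildAggregate, which collects the scalar operands
// {%s0, %s1, %s2, %s3} into BuildVectorOpds and the insert instructions into
// BuildVectorInsts. vectorizeInsertValueInst then hands the collected operands
// to the list vectorizer, while vectorizeInsertElementInst hands it the
// insert chain itself, unless the operands already form a fixed-vector shuffle.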
20949
20950template <typename T>
20951 static bool tryToVectorizeSequence(
20952 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
20953 function_ref<bool(T *, T *)> AreCompatible,
20954 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
20955 bool MaxVFOnly, BoUpSLP &R) {
20956 bool Changed = false;
20957 // Sort by type, parent, operands.
20958 stable_sort(Incoming, Comparator);
20959
20960 // Try to vectorize elements based on their type.
20961 SmallVector<T *> Candidates;
20962 SmallVector<T *> VL;
20963 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
20964 VL.clear()) {
20965 // Look for the next elements with the same type, parent and operand
20966 // kinds.
20967 auto *I = dyn_cast<Instruction>(*IncIt);
20968 if (!I || R.isDeleted(I)) {
20969 ++IncIt;
20970 continue;
20971 }
20972 auto *SameTypeIt = IncIt;
20973 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
20974 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
20975 AreCompatible(*SameTypeIt, *IncIt))) {
20976 auto *I = dyn_cast<Instruction>(*SameTypeIt);
20977 ++SameTypeIt;
20978 if (I && !R.isDeleted(I))
20979 VL.push_back(cast<T>(I));
20980 }
20981
20982 // Try to vectorize them.
20983 unsigned NumElts = VL.size();
20984 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
20985 << NumElts << ")\n");
20986 // The vectorization is a 3-stage attempt:
20987 // 1. Try to vectorize instructions with the same/alternate opcodes at the
20988 // size of the maximal register first.
20989 // 2. Try to vectorize the remaining instructions with the same type, if
20990 // possible. This may give better vectorization results than trying to
20991 // vectorize only instructions with the same/alternate opcodes.
20992 // 3. As a final attempt, try to vectorize all instructions with the
20993 // same/alternate ops only; this may result in some extra final
20994 // vectorization.
20995 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
20996 // Success; start over because instructions might have been changed.
20997 Changed = true;
20998 VL.swap(Candidates);
20999 Candidates.clear();
21000 for (T *V : VL) {
21001 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21002 Candidates.push_back(V);
21003 }
21004 } else {
21005 /// \Returns the minimum number of elements that we will attempt to
21006 /// vectorize.
21007 auto GetMinNumElements = [&R](Value *V) {
21008 unsigned EltSize = R.getVectorElementSize(V);
21009 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21010 };
21011 if (NumElts < GetMinNumElements(*IncIt) &&
21012 (Candidates.empty() ||
21013 Candidates.front()->getType() == (*IncIt)->getType())) {
21014 for (T *V : VL) {
21015 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21016 Candidates.push_back(V);
21017 }
21018 }
21019 }
21020 // Final attempt to vectorize instructions with the same types.
21021 if (Candidates.size() > 1 &&
21022 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21023 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21024 // Success; start over because instructions might have been changed.
21025 Changed = true;
21026 } else if (MaxVFOnly) {
21027 // Try to vectorize using small vectors.
21028 SmallVector<T *> VL;
21029 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21030 VL.clear()) {
21031 auto *I = dyn_cast<Instruction>(*It);
21032 if (!I || R.isDeleted(I)) {
21033 ++It;
21034 continue;
21035 }
21036 auto *SameTypeIt = It;
21037 while (SameTypeIt != End &&
21038 (!isa<Instruction>(*SameTypeIt) ||
21039 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21040 AreCompatible(*SameTypeIt, *It))) {
21041 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21042 ++SameTypeIt;
21043 if (I && !R.isDeleted(I))
21044 VL.push_back(cast<T>(I));
21045 }
21046 unsigned NumElts = VL.size();
21047 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21048 /*MaxVFOnly=*/false))
21049 Changed = true;
21050 It = SameTypeIt;
21051 }
21052 }
21053 Candidates.clear();
21054 }
21055
21056 // Start over at the next instruction of a different type (or the end).
21057 IncIt = SameTypeIt;
21058 }
21059 return Changed;
21060}
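// [Editorial sketch; not part of SLPVectorizer.cpp.] Stripped of the MaxVF
// retry logic and the deleted-instruction bookkeeping, the skeleton of
// tryToVectorizeSequence above is "sort, then hand maximal runs of compatible
// neighbours to a callback". A standalone illustration in plain C++ with
// hypothetical functor names:
#include <algorithm>
#include <iterator>
#include <vector>

template <typename T, typename LessFn, typename CompatFn, typename TryFn>
static bool groupAndTry(std::vector<T> &Items, LessFn Less, CompatFn Compatible,
                        TryFn TryVectorize) {
  // Sort so that compatible elements end up adjacent (cf. stable_sort above).
  std::stable_sort(Items.begin(), Items.end(), Less);
  bool Changed = false;
  for (auto It = Items.begin(), End = Items.end(); It != End;) {
    auto RunEnd = std::next(It);
    while (RunEnd != End && Compatible(*RunEnd, *It))
      ++RunEnd;
    if (std::distance(It, RunEnd) > 1) // only runs of >= 2 are worth trying
      Changed |= TryVectorize(std::vector<T>(It, RunEnd));
    It = RunEnd; // restart at the first incompatible element
  }
  return Changed;
}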
21061
21062 /// Compare two cmp instructions. If IsCompatibility is true, the function
21063 /// returns true if the 2 cmps have the same/swapped predicates and compatible
21064 /// corresponding operands. If IsCompatibility is false, the function implements
21065 /// a strict weak ordering relation between the two cmp instructions, returning
21066 /// true if the first instruction is "less" than the second, i.e. its predicate
21067 /// is less than the predicate of the second, or its operand IDs are less than
21068 /// the operand IDs of the second cmp instruction.
21069template <bool IsCompatibility>
21070static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21071 const DominatorTree &DT) {
21072 assert(isValidElementType(V->getType()) &&
21073 isValidElementType(V2->getType()) &&
21074 "Expected valid element types only.");
21075 if (V == V2)
21076 return IsCompatibility;
21077 auto *CI1 = cast<CmpInst>(V);
21078 auto *CI2 = cast<CmpInst>(V2);
21079 if (CI1->getOperand(0)->getType()->getTypeID() <
21080 CI2->getOperand(0)->getType()->getTypeID())
21081 return !IsCompatibility;
21082 if (CI1->getOperand(0)->getType()->getTypeID() >
21083 CI2->getOperand(0)->getType()->getTypeID())
21084 return false;
21085 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21086 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21087 return !IsCompatibility;
21088 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21089 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21090 return false;
21091 CmpInst::Predicate Pred1 = CI1->getPredicate();
21092 CmpInst::Predicate Pred2 = CI2->getPredicate();
21093 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21094 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21095 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21096 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21097 if (BasePred1 < BasePred2)
21098 return !IsCompatibility;
21099 if (BasePred1 > BasePred2)
21100 return false;
21101 // Compare operands.
21102 bool CI1Preds = Pred1 == BasePred1;
21103 bool CI2Preds = Pred2 == BasePred1;
21104 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21105 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21106 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21107 if (Op1 == Op2)
21108 continue;
21109 if (Op1->getValueID() < Op2->getValueID())
21110 return !IsCompatibility;
21111 if (Op1->getValueID() > Op2->getValueID())
21112 return false;
21113 if (auto *I1 = dyn_cast<Instruction>(Op1))
21114 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21115 if (IsCompatibility) {
21116 if (I1->getParent() != I2->getParent())
21117 return false;
21118 } else {
21119 // Try to compare nodes with same parent.
21120 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21121 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21122 if (!NodeI1)
21123 return NodeI2 != nullptr;
21124 if (!NodeI2)
21125 return false;
21126 assert((NodeI1 == NodeI2) ==
21127 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21128 "Different nodes should have different DFS numbers");
21129 if (NodeI1 != NodeI2)
21130 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21131 }
21132 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21133 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
21134 continue;
21135 if (IsCompatibility)
21136 return false;
21137 if (I1->getOpcode() != I2->getOpcode())
21138 return I1->getOpcode() < I2->getOpcode();
21139 }
21140 }
21141 return IsCompatibility;
21142}
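// [Editorial sketch; not part of SLPVectorizer.cpp.] compareCmp is used in two
// roles: with IsCompatibility == false it is a strict weak ordering suitable
// for sorting, and with IsCompatibility == true it is an equivalence test for
// grouping. A minimal standalone example of the same return-value convention,
// using a hypothetical Item type:
struct Item {
  int TypeID;
  int Opcode;
};

template <bool IsCompatibility>
static bool compareItems(const Item &A, const Item &B) {
  if (A.TypeID < B.TypeID)
    return !IsCompatibility; // strictly "less" for ordering, incompatible otherwise
  if (A.TypeID > B.TypeID)
    return false;
  if (A.Opcode != B.Opcode)
    return IsCompatibility ? false : A.Opcode < B.Opcode;
  return IsCompatibility; // equal keys: compatible, but not strictly less
}
// E.g. sort with compareItems<false> and then merge adjacent elements for which
// compareItems<true> holds, exactly as vectorizeCmpInsts does below with
// compareCmp<false>/compareCmp<true>.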
21143
21144template <typename ItT>
21145bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21146 BasicBlock *BB, BoUpSLP &R) {
21147 bool Changed = false;
21148 // Try to find reductions first.
21149 for (CmpInst *I : CmpInsts) {
21150 if (R.isDeleted(I))
21151 continue;
21152 for (Value *Op : I->operands())
21153 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21154 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21155 if (R.isDeleted(I))
21156 break;
21157 }
21158 }
21159 // Try to vectorize operands as vector bundles.
21160 for (CmpInst *I : CmpInsts) {
21161 if (R.isDeleted(I))
21162 continue;
21163 Changed |= tryToVectorize(I, R);
21164 }
21165 // Try to vectorize list of compares.
21166 // Sort by type, compare predicate, etc.
21167 auto CompareSorter = [&](Value *V, Value *V2) {
21168 if (V == V2)
21169 return false;
21170 return compareCmp<false>(V, V2, *TLI, *DT);
21171 };
21172
21173 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21174 if (V1 == V2)
21175 return true;
21176 return compareCmp<true>(V1, V2, *TLI, *DT);
21177 };
21178
21179 SmallVector<Value *> Vals;
21180 for (Instruction *V : CmpInsts)
21181 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21182 Vals.push_back(V);
21183 if (Vals.size() <= 1)
21184 return Changed;
21185 Changed |= tryToVectorizeSequence<Value>(
21186 Vals, CompareSorter, AreCompatibleCompares,
21187 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21188 // Exclude possible reductions from other blocks.
21189 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21190 return any_of(V->users(), [V](User *U) {
21191 auto *Select = dyn_cast<SelectInst>(U);
21192 return Select &&
21193 Select->getParent() != cast<Instruction>(V)->getParent();
21194 });
21195 });
21196 if (ArePossiblyReducedInOtherBlock)
21197 return false;
21198 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21199 },
21200 /*MaxVFOnly=*/true, R);
21201 return Changed;
21202}
21203
21204bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21205 BasicBlock *BB, BoUpSLP &R) {
21206 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21207 "This function only accepts Insert instructions");
21208 bool OpsChanged = false;
21209 SmallVector<WeakTrackingVH> PostponedInsts;
21210 for (auto *I : reverse(Instructions)) {
21211 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21212 if (R.isDeleted(I) || isa<CmpInst>(I))
21213 continue;
21214 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21215 OpsChanged |=
21216 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21217 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21218 OpsChanged |=
21219 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21220 }
21221 // pass2 - try to vectorize reductions only
21222 if (R.isDeleted(I))
21223 continue;
21224 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21225 if (R.isDeleted(I) || isa<CmpInst>(I))
21226 continue;
21227 // pass3 - try to match and vectorize a buildvector sequence.
21228 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21229 OpsChanged |=
21230 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21231 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21232 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21233 /*MaxVFOnly=*/false);
21234 }
21235 }
21236 // Now try to vectorize postponed instructions.
21237 OpsChanged |= tryToVectorize(PostponedInsts, R);
21238
21239 Instructions.clear();
21240 return OpsChanged;
21241}
21242
21243bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21244 bool Changed = false;
21245 SmallVector<Value *, 4> Incoming;
21246 SmallPtrSet<Value *, 16> VisitedInstrs;
21247 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21248 // node. This makes it easier to identify the chains that can be vectorized
21249 // in the best way.
21250 DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;
21251 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21252 assert(isValidElementType(V1->getType()) &&
21253 isValidElementType(V2->getType()) &&
21254 "Expected vectorizable types only.");
21255 // It is fine to compare type IDs here, since we expect only vectorizable
21256 // types, like ints, floats and pointers; we don't care about other types.
21257 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21258 return true;
21259 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21260 return false;
21261 if (V1->getType()->getScalarSizeInBits() <
21262 V2->getType()->getScalarSizeInBits())
21263 return true;
21264 if (V1->getType()->getScalarSizeInBits() >
21265 V2->getType()->getScalarSizeInBits())
21266 return false;
21267 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21268 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21269 if (Opcodes1.size() < Opcodes2.size())
21270 return true;
21271 if (Opcodes1.size() > Opcodes2.size())
21272 return false;
21273 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21274 {
21275 // Instructions come first.
21276 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21277 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21278 if (I1 && I2) {
21279 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21280 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21281 if (!NodeI1)
21282 return NodeI2 != nullptr;
21283 if (!NodeI2)
21284 return false;
21285 assert((NodeI1 == NodeI2) ==
21286 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21287 "Different nodes should have different DFS numbers");
21288 if (NodeI1 != NodeI2)
21289 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21290 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21291 if (S.getOpcode() && !S.isAltShuffle())
21292 continue;
21293 return I1->getOpcode() < I2->getOpcode();
21294 }
21295 if (I1)
21296 return true;
21297 if (I2)
21298 return false;
21299 }
21300 {
21301 // Non-undef constants come next.
21302 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21303 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21304 if (C1 && C2)
21305 continue;
21306 if (C1)
21307 return true;
21308 if (C2)
21309 return false;
21310 }
21311 bool U1 = isa<UndefValue>(Opcodes1[I]);
21312 bool U2 = isa<UndefValue>(Opcodes2[I]);
21313 {
21314 // Non-constant non-instructions come next.
21315 if (!U1 && !U2) {
21316 auto ValID1 = Opcodes1[I]->getValueID();
21317 auto ValID2 = Opcodes2[I]->getValueID();
21318 if (ValID1 == ValID2)
21319 continue;
21320 if (ValID1 < ValID2)
21321 return true;
21322 if (ValID1 > ValID2)
21323 return false;
21324 }
21325 if (!U1)
21326 return true;
21327 if (!U2)
21328 return false;
21329 }
21330 // Undefs come last.
21331 assert(U1 && U2 && "The only thing left should be undef & undef.");
21332 }
21333 return false;
21334 };
21335 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21336 if (V1 == V2)
21337 return true;
21338 if (V1->getType() != V2->getType())
21339 return false;
21340 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21341 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21342 if (Opcodes1.size() != Opcodes2.size())
21343 return false;
21344 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21345 // Undefs are compatible with any other value.
21346 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21347 continue;
21348 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21349 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21350 if (R.isDeleted(I1) || R.isDeleted(I2))
21351 return false;
21352 if (I1->getParent() != I2->getParent())
21353 return false;
21354 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21355 if (S.getOpcode())
21356 continue;
21357 return false;
21358 }
21359 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21360 continue;
21361 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21362 return false;
21363 }
21364 return true;
21365 };
21366
21367 bool HaveVectorizedPhiNodes = false;
21368 do {
21369 // Collect the incoming values from the PHIs.
21370 Incoming.clear();
21371 for (Instruction &I : *BB) {
21372 auto *P = dyn_cast<PHINode>(&I);
21373 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21374 break;
21375
21376 // No need to analyze deleted, vectorized and non-vectorizable
21377 // instructions.
21378 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21379 isValidElementType(P->getType()))
21380 Incoming.push_back(P);
21381 }
21382
21383 if (Incoming.size() <= 1)
21384 break;
21385
21386 // Find the corresponding non-phi nodes for better matching when trying to
21387 // build the tree.
21388 for (Value *V : Incoming) {
21389 SmallVectorImpl<Value *> &Opcodes =
21390 PHIToOpcodes.try_emplace(V).first->getSecond();
21391 if (!Opcodes.empty())
21392 continue;
21393 SmallVector<Value *, 4> Nodes(1, V);
21394 SmallPtrSet<Value *, 4> Visited;
21395 while (!Nodes.empty()) {
21396 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21397 if (!Visited.insert(PHI).second)
21398 continue;
21399 for (Value *V : PHI->incoming_values()) {
21400 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21401 Nodes.push_back(PHI1);
21402 continue;
21403 }
21404 Opcodes.emplace_back(V);
21405 }
21406 }
21407 }
21408
21409 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21410 Incoming, PHICompare, AreCompatiblePHIs,
21411 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21412 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21413 },
21414 /*MaxVFOnly=*/true, R);
21415 Changed |= HaveVectorizedPhiNodes;
21416 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21417 auto *PHI = dyn_cast<PHINode>(P.first);
21418 return !PHI || R.isDeleted(PHI);
21419 }))
21420 PHIToOpcodes.clear();
21421 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21422 } while (HaveVectorizedPhiNodes);
21423
21424 VisitedInstrs.clear();
21425
21426 InstSetVector PostProcessInserts;
21427 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21428 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
21429 // also vectorizes `PostProcessCmps`.
21430 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21431 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21432 if (VectorizeCmps) {
21433 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21434 PostProcessCmps.clear();
21435 }
21436 PostProcessInserts.clear();
21437 return Changed;
21438 };
21439 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21440 auto IsInPostProcessInstrs = [&](Instruction *I) {
21441 if (auto *Cmp = dyn_cast<CmpInst>(I))
21442 return PostProcessCmps.contains(Cmp);
21443 return isa<InsertElementInst, InsertValueInst>(I) &&
21444 PostProcessInserts.contains(I);
21445 };
21446 // Returns true if `I` is an instruction without users, such as a terminator
21447 // or a store, or a call whose return value is ignored. Unused instructions
21448 // are detected by type, except for CallInst and InvokeInst.
21449 auto HasNoUsers = [](Instruction *I) {
21450 return I->use_empty() &&
21451 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21452 };
21453 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21454 // Skip instructions with a scalable type. The number of elements is unknown
21455 // at compile time for scalable types.
21456 if (isa<ScalableVectorType>(It->getType()))
21457 continue;
21458
21459 // Skip instructions marked for deletion.
21460 if (R.isDeleted(&*It))
21461 continue;
21463 // We may go through BB multiple times, so skip already-checked instructions.
21463 if (!VisitedInstrs.insert(&*It).second) {
21464 if (HasNoUsers(&*It) &&
21465 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21466 // We would like to start over since some instructions are deleted
21467 // and the iterator may become invalid.
21468 Changed = true;
21469 It = BB->begin();
21470 E = BB->end();
21471 }
21472 continue;
21473 }
21474
21475 if (isa<DbgInfoIntrinsic>(It))
21476 continue;
21477
21478 // Try to vectorize reductions that use PHINodes.
21479 if (PHINode *P = dyn_cast<PHINode>(It)) {
21480 // Check that the PHI is a reduction PHI.
21481 if (P->getNumIncomingValues() == 2) {
21482 // Try to match and vectorize a horizontal reduction.
21483 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21484 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21485 Changed = true;
21486 It = BB->begin();
21487 E = BB->end();
21488 continue;
21489 }
21490 }
21491 // Try to vectorize the incoming values of the PHI, to catch reductions
21492 // that feed into PHIs.
21493 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21494 // Skip if the incoming block is the current BB for now. Also, bypass
21495 // unreachable IR for efficiency and to avoid crashing.
21496 // TODO: Collect the skipped incoming values and try to vectorize them
21497 // after processing BB.
21498 if (BB == P->getIncomingBlock(I) ||
21499 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21500 continue;
21501
21502 // Postponed instructions should not be vectorized here, delay their
21503 // vectorization.
21504 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21505 PI && !IsInPostProcessInstrs(PI)) {
21506 bool Res =
21507 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21508 Changed |= Res;
21509 if (Res && R.isDeleted(P)) {
21510 It = BB->begin();
21511 E = BB->end();
21512 break;
21513 }
21514 }
21515 }
21516 continue;
21517 }
21518
21519 if (HasNoUsers(&*It)) {
21520 bool OpsChanged = false;
21521 auto *SI = dyn_cast<StoreInst>(It);
21522 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21523 if (SI) {
21524 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21525 // Try to vectorize the chain rooted at the store, if this is the only
21526 // store to that address in the block.
21527 // TODO: This is just a temporary solution to save compile time. Need to
21528 // investigate if we can safely turn on slp-vectorize-hor-store instead, to
21529 // allow lookup of reduction chains in all non-vectorized stores (need to
21530 // check side effects and compile time).
21531 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21532 SI->getValueOperand()->hasOneUse();
21533 }
21534 if (TryToVectorizeRoot) {
21535 for (auto *V : It->operand_values()) {
21536 // Postponed instructions should not be vectorized here, delay their
21537 // vectorization.
21538 if (auto *VI = dyn_cast<Instruction>(V);
21539 VI && !IsInPostProcessInstrs(VI))
21540 // Try to match and vectorize a horizontal reduction.
21541 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21542 }
21543 }
21544 // Start vectorization of post-process list of instructions from the
21545 // top-tree instructions to try to vectorize as many instructions as
21546 // possible.
21547 OpsChanged |=
21548 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21549 if (OpsChanged) {
21550 // We would like to start over since some instructions are deleted
21551 // and the iterator may become invalid.
21552 Changed = true;
21553 It = BB->begin();
21554 E = BB->end();
21555 continue;
21556 }
21557 }
21558
21559 if (isa<InsertElementInst, InsertValueInst>(It))
21560 PostProcessInserts.insert(&*It);
21561 else if (isa<CmpInst>(It))
21562 PostProcessCmps.insert(cast<CmpInst>(&*It));
21563 }
21564
21565 return Changed;
21566}
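// [Editorial note; not part of SLPVectorizer.cpp.] Whenever a vectorization
// attempt in the block scan above deletes instructions, the iterator is reset
// to BB->begin() because it may now be dangling. The rescan does not repeat
// the full analysis: VisitedInstrs remembers every instruction that has
// already been analyzed, so on a rescan previously visited instructions only
// trigger the insert/cmp post-processing path.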
21567
21568bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21569 auto Changed = false;
21570 for (auto &Entry : GEPs) {
21571 // If the getelementptr list has fewer than two elements, there's nothing
21572 // to do.
21573 if (Entry.second.size() < 2)
21574 continue;
21575
21576 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21577 << Entry.second.size() << ".\n");
21578
21579 // Process the GEP list in chunks suitable for the target's supported
21580 // vector size. If a vector register can't hold 1 element, we are done. We
21581 // are trying to vectorize the index computations, so the maximum number of
21582 // elements is based on the size of the index expression, rather than the
21583 // size of the GEP itself (the target's pointer size).
21584 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21585 return !R.isDeleted(GEP);
21586 });
21587 if (It == Entry.second.end())
21588 continue;
21589 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21590 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21591 if (MaxVecRegSize < EltSize)
21592 continue;
21593
21594 unsigned MaxElts = MaxVecRegSize / EltSize;
21595 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21596 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21597 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21598
21599 // Initialize a set of candidate getelementptrs. Note that we use a
21600 // SetVector here to preserve program order. If the index computations
21601 // are vectorizable and begin with loads, we want to minimize the chance
21602 // of having to reorder them later.
21603 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21604
21605 // Some of the candidates may have already been vectorized after we
21606 // initially collected them (in which case they are marked as deleted), or
21607 // their index may have been optimized to a constant value. Remove such
21608 // getelementptrs from the set of candidates.
21609 Candidates.remove_if([&R](Value *I) {
21610 return R.isDeleted(cast<Instruction>(I)) ||
21611 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21612 });
21613
21614 // Remove from the set of candidates all pairs of getelementptrs with
21615 // constant differences. Such getelementptrs are likely not good
21616 // candidates for vectorization in a bottom-up phase since one can be
21617 // computed from the other. We also ensure all candidate getelementptr
21618 // indices are unique.
21619 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21620 auto *GEPI = GEPList[I];
21621 if (!Candidates.count(GEPI))
21622 continue;
21623 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21624 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21625 auto *GEPJ = GEPList[J];
21626 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21627 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21628 Candidates.remove(GEPI);
21629 Candidates.remove(GEPJ);
21630 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21631 Candidates.remove(GEPJ);
21632 }
21633 }
21634 }
21635
21636 // We break out of the above computation as soon as we know there are
21637 // fewer than two candidates remaining.
21638 if (Candidates.size() < 2)
21639 continue;
21640
21641 // Add the single, non-constant index of each candidate to the bundle. We
21642 // ensured the indices met these constraints when we originally collected
21643 // the getelementptrs.
21644 SmallVector<Value *, 16> Bundle(Candidates.size());
21645 auto BundleIndex = 0u;
21646 for (auto *V : Candidates) {
21647 auto *GEP = cast<GetElementPtrInst>(V);
21648 auto *GEPIdx = GEP->idx_begin()->get();
21649 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21650 Bundle[BundleIndex++] = GEPIdx;
21651 }
21652
21653 // Try and vectorize the indices. We are currently only interested in
21654 // gather-like cases of the form:
21655 //
21656 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21657 //
21658 // where the loads of "a", the loads of "b", and the subtractions can be
21659 // performed in parallel. It's likely that detecting this pattern in a
21660 // bottom-up phase will be simpler and less costly than building a
21661 // full-blown top-down phase beginning at the consecutive loads.
21662 Changed |= tryToVectorizeList(Bundle, R);
21663 }
21664 }
21665 return Changed;
21666}
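// [Editorial note; not part of SLPVectorizer.cpp.] Worked example for the
// chunking above: assuming a 256-bit maximum vector register and 64-bit GEP
// index expressions, MaxElts = 256 / 64 = 4, so a list of ten candidate
// getelementptrs is processed as index bundles of at most four values
// (4 + 4 + 2); the trailing chunk of two still clears the "fewer than two
// candidates" early exit and is offered to tryToVectorizeList.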
21667
21668bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21669 bool Changed = false;
21670 // Sort by type, base pointer and value operand. Value operands must be
21671 // compatible (have the same opcode, same parent), otherwise it is
21672 // definitely not profitable to try to vectorize them.
21673 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21674 if (V->getValueOperand()->getType()->getTypeID() <
21675 V2->getValueOperand()->getType()->getTypeID())
21676 return true;
21677 if (V->getValueOperand()->getType()->getTypeID() >
21678 V2->getValueOperand()->getType()->getTypeID())
21679 return false;
21680 if (V->getPointerOperandType()->getTypeID() <
21681 V2->getPointerOperandType()->getTypeID())
21682 return true;
21683 if (V->getPointerOperandType()->getTypeID() >
21684 V2->getPointerOperandType()->getTypeID())
21685 return false;
21686 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21687 V2->getValueOperand()->getType()->getScalarSizeInBits())
21688 return true;
21689 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21690 V2->getValueOperand()->getType()->getScalarSizeInBits())
21691 return false;
21692 // UndefValues are compatible with all other values.
21693 if (isa<UndefValue>(V->getValueOperand()) ||
21694 isa<UndefValue>(V2->getValueOperand()))
21695 return false;
21696 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21697 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21698 DomTreeNodeBase<BasicBlock> *NodeI1 =
21699 DT->getNode(I1->getParent());
21700 DomTreeNodeBase<BasicBlock> *NodeI2 =
21701 DT->getNode(I2->getParent());
21702 assert(NodeI1 && "Should only process reachable instructions");
21703 assert(NodeI2 && "Should only process reachable instructions");
21704 assert((NodeI1 == NodeI2) ==
21705 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21706 "Different nodes should have different DFS numbers");
21707 if (NodeI1 != NodeI2)
21708 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21709 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21710 if (S.getOpcode())
21711 return false;
21712 return I1->getOpcode() < I2->getOpcode();
21713 }
21714 if (isa<Constant>(V->getValueOperand()) &&
21715 isa<Constant>(V2->getValueOperand()))
21716 return false;
21717 return V->getValueOperand()->getValueID() <
21718 V2->getValueOperand()->getValueID();
21719 };
21720
21721 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21722 if (V1 == V2)
21723 return true;
21724 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21725 return false;
21726 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21727 return false;
21728 // Undefs are compatible with any other value.
21729 if (isa<UndefValue>(V1->getValueOperand()) ||
21730 isa<UndefValue>(V2->getValueOperand()))
21731 return true;
21732 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21733 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21734 if (I1->getParent() != I2->getParent())
21735 return false;
21736 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21737 return S.getOpcode() > 0;
21738 }
21739 if (isa<Constant>(V1->getValueOperand()) &&
21740 isa<Constant>(V2->getValueOperand()))
21741 return true;
21742 return V1->getValueOperand()->getValueID() ==
21743 V2->getValueOperand()->getValueID();
21744 };
21745
21746 // Attempt to sort and vectorize each of the store-groups.
21747 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
21748 for (auto &Pair : Stores) {
21749 if (Pair.second.size() < 2)
21750 continue;
21751
21752 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21753 << Pair.second.size() << ".\n");
21754
21755 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21756 continue;
21757
21758 // Reverse the stores to do bottom-to-top analysis. This is important if
21759 // there are several stores to the same address: in that case we need to
21760 // follow the store order (reversed, to respect the memory dependencies).
21761 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21762 Pair.second.rend());
21763 Changed |= tryToVectorizeSequence<StoreInst>(
21764 ReversedStores, StoreSorter, AreCompatibleStores,
21765 [&](ArrayRef<StoreInst *> Candidates, bool) {
21766 return vectorizeStores(Candidates, R, Attempted);
21767 },
21768 /*MaxVFOnly=*/false, R);
21769 }
21770 return Changed;
21771}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
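A minimal sketch, assuming a hypothetical helper, of how the ArrayRef slicing operations above split a mask into a head and a tail (Part must not exceed Mask.size()):

#include "llvm/ADT/ArrayRef.h"
#include <utility>
using namespace llvm;

// Split Mask into its first Part elements and the remainder (illustrative).
std::pair<ArrayRef<int>, ArrayRef<int>> splitMask(ArrayRef<int> Mask,
                                                  size_t Part) {
  ArrayRef<int> Head = Mask.take_front(Part); // first Part elements
  ArrayRef<int> Tail = Mask.drop_front(Part); // everything after them
  return {Head, Tail};
}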
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1978
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1873
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2115
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1972
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1207
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1285
unsigned arg_size() const
Definition: InstrTypes.h:1292
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1502
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1969
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:988
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
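A tiny, assumed sketch of the predicate helpers above, using the static CmpInst overloads that take a Predicate directly; the values are illustrative:

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

void predicateHelpers() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;                 // signed less than
  CmpInst::Predicate Sw = CmpInst::getSwappedPredicate(P);  // ICMP_SGT
  CmpInst::Predicate Inv = CmpInst::getInversePredicate(P); // ICMP_SGE
  (void)Sw;
  (void)Inv;
}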
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
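A small, assumed usage sketch of the DenseMap operations listed above; the map, key type, and helper name are illustrative:

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

// Count how many times each key has been seen (illustrative only).
unsigned bumpCount(DenseMap<int, unsigned> &Counts, int Key) {
  auto [It, Inserted] = Counts.try_emplace(Key, 0u); // inserts 0 if Key is new
  ++It->second;
  return It->second;
}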
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:871
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2289
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1052
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2503
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:508
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1060
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2491
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2297
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1830
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1048
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2566
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1889
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:844
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1776
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2398
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2281
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2525
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2189
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2444
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1689
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2305
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2227
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1849
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1610
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1384
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:283
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:759
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:471
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
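A hedged sketch (assumed, not from this file) of an aliasing query that combines BatchAAResults and MemoryLocation as documented above:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if I may modify the memory read by LI (illustrative only).
bool mayWriteToLoadedLocation(BatchAAResults &BatchAA, Instruction *I,
                              LoadInst *LI) {
  ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(LI));
  return isModSet(MR);
}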
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
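For orientation, a minimal assumed illustration of the static mask classifiers above applied to literal masks:

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void maskClassifiers() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[] = {3, 2, 1, 0};
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
}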
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided loads and stores.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
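A hedged sketch of how the cost queries above are typically combined; the opcode, type, and helper name are illustrative rather than taken from this pass:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

InstructionCost addThenReverseCost(const TargetTransformInfo &TTI,
                                   FixedVectorType *VecTy) {
  constexpr auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost Cost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy,
                             /*Mask=*/{}, CostKind);
  return Cost;
}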
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
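A small, assumed example of the Type queries listed above, checking whether a scalar or vector type has narrow integer elements:

#include "llvm/IR/Type.h"
using namespace llvm;

// True if Ty is an integer, or a vector of integers, with at most MaxBits
// bits per element (illustrative only).
bool hasNarrowIntElements(Type *Ty, unsigned MaxBits) {
  Type *Elt = Ty->getScalarType();
  return Elt->isIntegerTy() && Elt->getScalarSizeInBits() <= MaxBits;
}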
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was already checked for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minimum-bitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
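Since BoUpSLP is defined inside SLPVectorizer.cpp rather than in a public header, the following is only an assumed, simplified sketch of the call order implied by the interface above, written as if it lived in this file; the profitability threshold is a hypothetical parameter, and reordering, external-use building, and error handling are omitted:

// Sketch only: not the pass's exact driver logic.
static bool tryToVectorizeBundle(slpvectorizer::BoUpSLP &R,
                                 ArrayRef<Value *> Roots,
                                 InstructionCost Threshold) {
  SmallDenseSet<Value *> UserIgnoreList;    // hypothetical: nothing ignored
  R.buildTree(Roots, UserIgnoreList);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= Threshold)                    // vectorize only when profitable
    return false;
  R.vectorizeTree();
  return true;
}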
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
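A minimal, assumed sketch of the PatternMatch helpers above, checking for a zero-extended load and capturing its pointer operand:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Matches zext(load Ptr) and binds Ptr on success (illustrative only).
bool isZExtOfLoad(Value *V, Value *&Ptr) {
  return match(V, m_ZExt(m_Load(m_Value(Ptr))));
}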
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
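As a hedged illustration (simplified relative to how this pass uses it), getPointersDiff can be used to test whether two loads are adjacent in memory; the helper name is illustrative:

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

bool areAdjacentLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                      ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // B starts exactly one element after A
}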
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
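A short, assumed sketch of createSimpleReduction as listed above, emitting an integer add reduction of a vector value; the helper name is illustrative:

#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

// Emits code that sums all lanes of Vec and returns the scalar result.
Value *emitAddReduction(IRBuilderBase &Builder, Value *Vec) {
  return createSimpleReduction(Builder, Vec, RecurKind::Add);
}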
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
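These range wrappers all follow the same shape; an illustrative fragment, assuming llvm/ADT/SmallVector.h and llvm/ADT/STLExtras.h are included:
SmallVector<int, 8> Widths = {8, 16, 32};
bool AllPositive = all_of(Widths, [](int W) { return W > 0; });   // true
bool Found16 = find(Widths, 16) != Widths.end();                  // true
int SumTail = 0;
for_each(drop_begin(Widths), [&](int W) { SumTail += W; });       // 16 + 32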
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
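An illustrative loop over enumerate and zip (the values are arbitrary), assuming the structured-binding support present in current STLExtras.h:
SmallVector<int, 4> A = {10, 20, 30};
SmallVector<int, 4> B = {1, 2, 3};
int Sum = 0;
for (auto [Idx, V] : enumerate(A))
  Sum += Idx * V;            // Idx runs 0, 1, 2; V is 10, 20, 30
for (auto [AV, BV] : zip(A, B))
  Sum += AV * BV;            // pairs (10,1), (20,2), (30,3)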
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7293
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1656
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
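Assuming the documented semantics of the two mask builders, two small worked examples:
// createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4)   -> {1, 3, 5, 7}
// createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3) -> {0, 0, 1, 1, 2, 2}
SmallVector<int, 16> Strided = createStrideMask(1, 2, 4);
SmallVector<int, 16> Replicated = createReplicatedMask(2, 3);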
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:250
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
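Hand-checkable values for the arithmetic helpers referenced above; the asserts are illustrative only and assume the corresponding Support headers:
assert(bit_ceil(5u) == 8u && PowerOf2Ceil(5) == 8u);
assert(Log2_32(16) == 4u && divideCeil(10, 4) == 3u);
assert(alignTo(10, Align(8)) == 16u && alignDown(10, 8) == 8u);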
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
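For example, seq is half-open, so the sketch below visits 0 through 3; the loop variable name is illustrative:
unsigned Count = 0;
for (int Lane : seq<int>(0, 4))
  Count += Lane;             // visits 0, 1, 2, 3; Count ends up 6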
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
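A small illustrative combination of the two hashing helpers, assuming llvm/ADT/Hashing.h and <array> are included:
std::array<int, 3> Parts = {1, 2, 3};
hash_code H = hash_combine(hash_combine_range(Parts.begin(), Parts.end()),
                           /*ExtraField=*/42u);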
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2136
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
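A hedged sketch of driving this pass through the new pass manager; F is a Function and FAM is assumed to be a FunctionAnalysisManager already populated via PassBuilder:
FunctionPassManager FPM;
FPM.addPass(SLPVectorizerPass());
PreservedAnalyses PA = FPM.run(F, FAM);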
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.