1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107
108#define SV_NAME "slp-vectorizer"
109#define DEBUG_TYPE "SLP"
110
111STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
112
113DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
114 "Controls which SLP graphs should be vectorized.");
115
116static cl::opt<bool>
117 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
118 cl::desc("Run the SLP vectorization passes"));
119
120static cl::opt<bool>
121 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
122 cl::desc("Enable vectorization for wider vector utilization"));
123
124static cl::opt<int>
126 cl::desc("Only vectorize if you gain more than this "
127 "number "));
128
130 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
131 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
132 "heuristics and makes vectorization decision via cost modeling."));
133
134static cl::opt<bool>
135ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
136 cl::desc("Attempt to vectorize horizontal reductions"));
137
139 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
140 cl::desc(
141 "Attempt to vectorize horizontal reductions feeding into a store"));
142
143static cl::opt<int>
144 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
145 cl::desc("Attempt to vectorize for this register size in bits"));
146
147static cl::opt<unsigned>
148 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
149 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
150
151/// Limits the size of scheduling regions in a block.
152/// It avoids long compile times for _very_ large blocks where vector
153/// instructions are spread over a wide range.
154/// This limit is way higher than needed by real-world functions.
155static cl::opt<int>
156ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
157 cl::desc("Limit the size of the SLP scheduling region per block"));
158
160 "slp-min-reg-size", cl::init(128), cl::Hidden,
161 cl::desc("Attempt to vectorize for this register size in bits"));
162
164 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
165 cl::desc("Limit the recursion depth when building a vectorizable tree"));
166
168 "slp-min-tree-size", cl::init(3), cl::Hidden,
169 cl::desc("Only vectorize small trees if they are fully vectorizable"));
170
171// The maximum depth that the look-ahead score heuristic will explore.
172// The higher this value, the higher the compilation time overhead.
174 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
175 cl::desc("The maximum look-ahead depth for operand reordering scores"));
176
177// The maximum depth that the look-ahead score heuristic will explore
178// when probing among candidates for vectorization tree roots.
179// The higher this value, the higher the compilation time overhead, but unlike
180// the similar limit for operand reordering this path is taken less frequently,
181// so the impact of a higher value is less noticeable.
182static cl::opt<int> RootLookAheadMaxDepth(
183 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
184 cl::desc("The maximum look-ahead depth for searching best rooting option"));
185
187 "slp-min-strided-loads", cl::init(2), cl::Hidden,
188 cl::desc("The minimum number of loads, which should be considered strided, "
189 "if the stride is > 1 or is runtime value"));
190
192 "slp-max-stride", cl::init(8), cl::Hidden,
193 cl::desc("The maximum stride, considered to be profitable."));
194
195static cl::opt<bool>
196 ViewSLPTree("view-slp-tree", cl::Hidden,
197 cl::desc("Display the SLP trees with Graphviz"));
198
200 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
201 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
202
203// Limit the number of alias checks. The limit is chosen so that
204// it has no negative effect on the llvm benchmarks.
205static const unsigned AliasedCheckLimit = 10;
206
207// Limit of the number of uses for potentially transformed instructions/values,
208// used in checks to avoid compile-time explosion.
209static constexpr int UsesLimit = 64;
210
211// Another limit for the alias checks: The maximum distance between load/store
212// instructions where alias checks are done.
213// This limit is useful for very large basic blocks.
214static const unsigned MaxMemDepDistance = 160;
215
216/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
217/// regions to be handled.
218static const int MinScheduleRegionSize = 16;
219
220/// Maximum allowed number of operands in the PHI nodes.
221static const unsigned MaxPHINumOperands = 128;
222
223/// Predicate for the element types that the SLP vectorizer supports.
224///
225/// The most important thing to filter here are types which are invalid in LLVM
226/// vectors. We also filter target specific types which have absolutely no
227/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
228/// avoids spending time checking the cost model and realizing that they will
229/// be inevitably scalarized.
230static bool isValidElementType(Type *Ty) {
231 // TODO: Support ScalableVectorType.
232 if (SLPReVec && isa<FixedVectorType>(Ty))
233 Ty = Ty->getScalarType();
234 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
235 !Ty->isPPC_FP128Ty();
236}
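// Illustrative examples (added for clarity, not part of the original source):
// i32, i64, float, double and pointer types pass this check, while x86_fp80
// and ppc_fp128 are rejected. With -slp-revec enabled, a <4 x i32> "scalar"
// is reduced to its i32 element type before the check.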
237
238/// Returns the type of the given value/instruction \p V. If it is a store,
239/// returns the type of its value operand; for Cmp - the type of the compare
240/// operands; and for insertelement - the type of the inserted operand.
241/// Otherwise, just the type of the value is returned.
242static Type *getValueType(Value *V) {
243 if (auto *SI = dyn_cast<StoreInst>(V))
244 return SI->getValueOperand()->getType();
245 if (auto *CI = dyn_cast<CmpInst>(V))
246 return CI->getOperand(0)->getType();
247 if (auto *IE = dyn_cast<InsertElementInst>(V))
248 return IE->getOperand(1)->getType();
249 return V->getType();
250}
251
252/// \returns the number of elements for Ty.
253static unsigned getNumElements(Type *Ty) {
254 assert(!isa<ScalableVectorType>(Ty) &&
255 "ScalableVectorType is not supported.");
256 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
257 return VecTy->getNumElements();
258 return 1;
259}
260
261/// \returns the vector type of ScalarTy based on vectorization factor.
262static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
263 return FixedVectorType::get(ScalarTy->getScalarType(),
264 VF * getNumElements(ScalarTy));
265}
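// Illustrative example (added for clarity, not from the original source): with
// ScalarTy = i32 and VF = 4 this returns <4 x i32>; with REVEC, a "scalar" of
// type <2 x float> and VF = 4 yields <8 x float>, since the element count of
// the vector "scalar" is multiplied by the vectorization factor.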
266
267/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
268/// which forms a type that \p TTI splits into whole vector types during
269/// legalization.
270static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
271 Type *Ty, unsigned Sz) {
272 if (!isValidElementType(Ty))
273 return bit_ceil(Sz);
274 // Find the number of elements, which forms full vectors.
275 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
276 if (NumParts == 0 || NumParts >= Sz)
277 return bit_ceil(Sz);
278 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279}
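// Worked example (illustrative, assuming a target with 128-bit vector
// registers): for Ty = i32 and Sz = 6, <6 x i32> legalizes into NumParts = 2
// registers, so the result is bit_ceil(ceil(6 / 2)) * 2 = 4 * 2 = 8 elements.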
280
281/// Returns the number of elements of the given type \p Ty, not greater than \p
282/// Sz, which forms a type that \p TTI splits into whole vector types during
283/// legalization.
284static unsigned
285getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
286 unsigned Sz) {
287 if (!isValidElementType(Ty))
288 return bit_floor(Sz);
289 // Find the number of elements, which forms full vectors.
290 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291 if (NumParts == 0 || NumParts >= Sz)
292 return bit_floor(Sz);
293 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294 if (RegVF > Sz)
295 return bit_floor(Sz);
296 return (Sz / RegVF) * RegVF;
297}
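// Worked example (illustrative, assuming 128-bit vector registers): for
// Ty = i32 and Sz = 6, NumParts = 2 and RegVF = bit_ceil(ceil(6 / 2)) = 4, so
// the result is (6 / 4) * 4 = 4, i.e. the largest whole-register multiple not
// exceeding Sz.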
298
299static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
300 SmallVectorImpl<int> &Mask) {
301 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
302 // But the element has a different meaning for SLP (scalar) and REVEC
303 // (vector). We need to expand Mask into masks which shufflevector can use
304 // directly.
305 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
306 for (unsigned I : seq<unsigned>(Mask.size()))
307 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
308 I * VecTyNumElements, VecTyNumElements)))
309 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
310 : Mask[I] * VecTyNumElements + J;
311 Mask.swap(NewMask);
312}
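// Illustrative example (not from the original source): with
// VecTyNumElements = 2 and Mask = {1, 0}, the expanded mask becomes
// {2, 3, 0, 1}; each scalar index is widened into a run of per-element
// indices into the underlying vectors.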
313
314/// \returns the number of groups of shufflevector
315/// A group has the following features
316/// 1. All values in a group are shufflevectors.
317/// 2. The mask of each shufflevector is isExtractSubvectorMask.
318/// 3. The masks of the shufflevectors together use all of the elements of the source.
319/// e.g., it is 1 group (%0)
320/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
321/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
322/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
323/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
324/// it is 2 groups (%3 and %4)
325/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
326/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
327/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
328/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
329/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
330/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
331/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
332/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
333/// it is 0 group
334/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
335/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
336/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
337/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
338static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
339 if (VL.empty())
340 return 0;
341 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
342 return 0;
343 auto *SV = cast<ShuffleVectorInst>(VL.front());
344 unsigned SVNumElements =
345 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
346 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
347 if (SVNumElements % ShuffleMaskSize != 0)
348 return 0;
349 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
350 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
351 return 0;
352 unsigned NumGroup = 0;
353 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
354 auto *SV = cast<ShuffleVectorInst>(VL[I]);
355 Value *Src = SV->getOperand(0);
356 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
357 SmallBitVector ExpectedIndex(GroupSize);
358 if (!all_of(Group, [&](Value *V) {
359 auto *SV = cast<ShuffleVectorInst>(V);
360 // From the same source.
361 if (SV->getOperand(0) != Src)
362 return false;
363 int Index;
364 if (!SV->isExtractSubvectorMask(Index))
365 return false;
366 ExpectedIndex.set(Index / ShuffleMaskSize);
367 return true;
368 }))
369 return 0;
370 if (!ExpectedIndex.all())
371 return 0;
372 ++NumGroup;
373 }
374 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
375 return NumGroup;
376}
377
378/// \returns a shufflevector mask which is used to vectorize shufflevectors
379/// e.g.,
380/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
381/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
382/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
383/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
384/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
385/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
386/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
387/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
388/// the result is
389/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
390static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
391 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
392 auto *SV = cast<ShuffleVectorInst>(VL.front());
393 unsigned SVNumElements =
394 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
395 SmallVector<int> Mask;
396 unsigned AccumulateLength = 0;
397 for (Value *V : VL) {
398 auto *SV = cast<ShuffleVectorInst>(V);
399 for (int M : SV->getShuffleMask())
400 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
401 : AccumulateLength + M);
402 AccumulateLength += SVNumElements;
403 }
404 return Mask;
405}
406
407/// \returns True if the value is a constant (but not globals/constant
408/// expressions).
409static bool isConstant(Value *V) {
410 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
411}
412
413/// Checks if \p V is one of vector-like instructions, i.e. undef,
414/// insertelement/extractelement with constant indices for fixed vector type or
415/// extractvalue instruction.
416static bool isVectorLikeInstWithConstOps(Value *V) {
417 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
418 !isa<ExtractValueInst, UndefValue>(V))
419 return false;
420 auto *I = dyn_cast<Instruction>(V);
421 if (!I || isa<ExtractValueInst>(I))
422 return true;
423 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
424 return false;
425 if (isa<ExtractElementInst>(I))
426 return isConstant(I->getOperand(1));
427 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
428 return isConstant(I->getOperand(2));
429}
430
431/// Returns power-of-2 number of elements in a single register (part), given the
432/// total number of elements \p Size and number of registers (parts) \p
433/// NumParts.
434static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
435 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
436}
437
438/// Returns correct remaining number of elements, considering total amount \p
439/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
440/// and current register (part) \p Part.
441static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
442 unsigned Part) {
443 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
444}
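// Worked example (illustrative): for Size = 6 and NumParts = 2,
// getPartNumElems returns min(6, bit_ceil(ceil(6 / 2))) = 4, and getNumElems
// yields 4 elements for part 0 and the remaining 2 elements for part 1.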
445
446#if !defined(NDEBUG)
447/// Print a short descriptor of the instruction bundle suitable for debug output.
448static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
449 std::string Result;
450 raw_string_ostream OS(Result);
451 if (Idx >= 0)
452 OS << "Idx: " << Idx << ", ";
453 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
454 return Result;
455}
456#endif
457
458/// \returns true if all of the instructions in \p VL are in the same block or
459/// false otherwise.
460static bool allSameBlock(ArrayRef<Value *> VL) {
461 auto *It = find_if(VL, IsaPred<Instruction>);
462 if (It == VL.end())
463 return false;
464 Instruction *I0 = cast<Instruction>(*It);
465 if (all_of(VL, isVectorLikeInstWithConstOps))
466 return true;
467
468 BasicBlock *BB = I0->getParent();
469 for (Value *V : iterator_range(It, VL.end())) {
470 if (isa<PoisonValue>(V))
471 continue;
472 auto *II = dyn_cast<Instruction>(V);
473 if (!II)
474 return false;
475
476 if (BB != II->getParent())
477 return false;
478 }
479 return true;
480}
481
482/// \returns True if all of the values in \p VL are constants (but not
483/// globals/constant expressions).
484static bool allConstant(ArrayRef<Value *> VL) {
485 // Constant expressions and globals can't be vectorized like normal integer/FP
486 // constants.
487 return all_of(VL, isConstant);
488}
489
490/// \returns True if all of the values in \p VL are identical or some of them
491/// are UndefValue.
492static bool isSplat(ArrayRef<Value *> VL) {
493 Value *FirstNonUndef = nullptr;
494 for (Value *V : VL) {
495 if (isa<UndefValue>(V))
496 continue;
497 if (!FirstNonUndef) {
498 FirstNonUndef = V;
499 continue;
500 }
501 if (V != FirstNonUndef)
502 return false;
503 }
504 return FirstNonUndef != nullptr;
505}
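// Illustrative example (not from the original source): {%x, undef, %x} is a
// splat, {undef, undef} is not (no non-undef value), and {%x, %y} is not.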
506
507/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
508static bool isCommutative(Instruction *I) {
509 if (auto *Cmp = dyn_cast<CmpInst>(I))
510 return Cmp->isCommutative();
511 if (auto *BO = dyn_cast<BinaryOperator>(I))
512 return BO->isCommutative() ||
513 (BO->getOpcode() == Instruction::Sub &&
514 !BO->hasNUsesOrMore(UsesLimit) &&
515 all_of(
516 BO->uses(),
517 [](const Use &U) {
518 // Commutative, if icmp eq/ne sub, 0
519 CmpPredicate Pred;
520 if (match(U.getUser(),
521 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
522 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
523 return true;
524 // Commutative, if abs(sub nsw, true) or abs(sub, false).
525 ConstantInt *Flag;
526 return match(U.getUser(),
527 m_Intrinsic<Intrinsic::abs>(
528 m_Specific(U.get()), m_ConstantInt(Flag))) &&
529 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
530 Flag->isOne());
531 })) ||
532 (BO->getOpcode() == Instruction::FSub &&
533 !BO->hasNUsesOrMore(UsesLimit) &&
534 all_of(BO->uses(), [](const Use &U) {
535 return match(U.getUser(),
536 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
537 }));
538 return I->isCommutative();
539}
540
541template <typename T>
542static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
543 unsigned Offset) {
544 static_assert(std::is_same_v<T, InsertElementInst> ||
545 std::is_same_v<T, ExtractElementInst>,
546 "unsupported T");
547 int Index = Offset;
548 if (const auto *IE = dyn_cast<T>(Inst)) {
549 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
550 if (!VT)
551 return std::nullopt;
552 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
553 if (!CI)
554 return std::nullopt;
555 if (CI->getValue().uge(VT->getNumElements()))
556 return std::nullopt;
557 Index *= VT->getNumElements();
558 Index += CI->getZExtValue();
559 return Index;
560 }
561 return std::nullopt;
562}
563
564/// \returns inserting or extracting index of InsertElement, ExtractElement or
565/// InsertValue instruction, using Offset as base offset for index.
566/// \returns std::nullopt if the index is not an immediate.
567static std::optional<unsigned> getElementIndex(const Value *Inst,
568 unsigned Offset = 0) {
569 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
570 return Index;
571 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
572 return Index;
573
574 int Index = Offset;
575
576 const auto *IV = dyn_cast<InsertValueInst>(Inst);
577 if (!IV)
578 return std::nullopt;
579
580 Type *CurrentType = IV->getType();
581 for (unsigned I : IV->indices()) {
582 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
583 Index *= ST->getNumElements();
584 CurrentType = ST->getElementType(I);
585 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
586 Index *= AT->getNumElements();
587 CurrentType = AT->getElementType();
588 } else {
589 return std::nullopt;
590 }
591 Index += I;
592 }
593 return Index;
594}
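// Worked example (illustrative): for an insertvalue into
// {[2 x i32], [2 x i32]} at indices {1, 0}, the flattened element index is
// (0 * 2 + 1) * 2 + 0 = 2.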
595
596namespace {
597/// Specifies the way the mask should be analyzed for undefs/poisonous elements
598/// in the shuffle mask.
599enum class UseMask {
600 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
601 ///< check for the mask elements for the first argument (mask
602 ///< indices are in range [0:VF)).
603 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
604 ///< for the mask elements for the second argument (mask indices
605 ///< are in range [VF:2*VF))
606 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
607 ///< future shuffle elements and mark them as ones as being used
608 ///< in future. Non-undef elements are considered as unused since
609 ///< they're already marked as used in the mask.
610};
611} // namespace
612
613/// Prepares a use bitset for the given mask either for the first argument or
614/// for the second.
615static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
616 UseMask MaskArg) {
617 SmallBitVector UseMask(VF, true);
618 for (auto [Idx, Value] : enumerate(Mask)) {
619 if (Value == PoisonMaskElem) {
620 if (MaskArg == UseMask::UndefsAsMask)
621 UseMask.reset(Idx);
622 continue;
623 }
624 if (MaskArg == UseMask::FirstArg && Value < VF)
625 UseMask.reset(Value);
626 else if (MaskArg == UseMask::SecondArg && Value >= VF)
627 UseMask.reset(Value - VF);
628 }
629 return UseMask;
630}
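// Illustrative example (not from the original source): for VF = 4 and
// Mask = {0, 5, -1, 2} with UseMask::FirstArg, bits 0 and 2 are cleared
// (those first-argument elements are consumed by the mask) while bits 1 and 3
// stay set.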
631
632/// Checks if the given value is actually an undefined constant vector.
633/// Also, if the \p UseMask is not empty, tries to check if the non-masked
634/// elements actually mask the insertelement buildvector, if any.
635template <bool IsPoisonOnly = false>
636static SmallBitVector isUndefVector(const Value *V,
637 const SmallBitVector &UseMask = {}) {
638 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
639 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
640 if (isa<T>(V))
641 return Res;
642 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
643 if (!VecTy)
644 return Res.reset();
645 auto *C = dyn_cast<Constant>(V);
646 if (!C) {
647 if (!UseMask.empty()) {
648 const Value *Base = V;
649 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
650 Base = II->getOperand(0);
651 if (isa<T>(II->getOperand(1)))
652 continue;
653 std::optional<unsigned> Idx = getElementIndex(II);
654 if (!Idx) {
655 Res.reset();
656 return Res;
657 }
658 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
659 Res.reset(*Idx);
660 }
661 // TODO: Add analysis for shuffles here too.
662 if (V == Base) {
663 Res.reset();
664 } else {
665 SmallBitVector SubMask(UseMask.size(), false);
666 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
667 }
668 } else {
669 Res.reset();
670 }
671 return Res;
672 }
673 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
674 if (Constant *Elem = C->getAggregateElement(I))
675 if (!isa<T>(Elem) &&
676 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
677 Res.reset(I);
678 }
679 return Res;
680}
681
682/// Checks if the vector of instructions can be represented as a shuffle, like:
683/// %x0 = extractelement <4 x i8> %x, i32 0
684/// %x3 = extractelement <4 x i8> %x, i32 3
685/// %y1 = extractelement <4 x i8> %y, i32 1
686/// %y2 = extractelement <4 x i8> %y, i32 2
687/// %x0x0 = mul i8 %x0, %x0
688/// %x3x3 = mul i8 %x3, %x3
689/// %y1y1 = mul i8 %y1, %y1
690/// %y2y2 = mul i8 %y2, %y2
691/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
692/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
693/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
694/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
695/// ret <4 x i8> %ins4
696/// can be transformed into:
697/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
698/// i32 6>
699/// %2 = mul <4 x i8> %1, %1
700/// ret <4 x i8> %2
701/// Mask will return the Shuffle Mask equivalent to the extracted elements.
702/// TODO: Can we split off and reuse the shuffle mask detection from
703/// ShuffleVectorInst/getShuffleCost?
704static std::optional<TargetTransformInfo::ShuffleKind>
705isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
706 AssumptionCache *AC) {
707 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
708 if (It == VL.end())
709 return std::nullopt;
710 unsigned Size =
711 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
712 auto *EI = dyn_cast<ExtractElementInst>(V);
713 if (!EI)
714 return S;
715 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
716 if (!VTy)
717 return S;
718 return std::max(S, VTy->getNumElements());
719 });
720
721 Value *Vec1 = nullptr;
722 Value *Vec2 = nullptr;
723 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
724 auto *EE = dyn_cast<ExtractElementInst>(V);
725 if (!EE)
726 return false;
727 Value *Vec = EE->getVectorOperand();
728 if (isa<UndefValue>(Vec))
729 return false;
730 return isGuaranteedNotToBePoison(Vec, AC);
731 });
732 enum ShuffleMode { Unknown, Select, Permute };
733 ShuffleMode CommonShuffleMode = Unknown;
734 Mask.assign(VL.size(), PoisonMaskElem);
735 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
736 // Undef can be represented as an undef element in a vector.
737 if (isa<UndefValue>(VL[I]))
738 continue;
739 auto *EI = cast<ExtractElementInst>(VL[I]);
740 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
741 return std::nullopt;
742 auto *Vec = EI->getVectorOperand();
743 // We can extractelement from undef or poison vector.
744 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
745 continue;
746 // All vector operands must have the same number of vector elements.
747 if (isa<UndefValue>(Vec)) {
748 Mask[I] = I;
749 } else {
750 if (isa<UndefValue>(EI->getIndexOperand()))
751 continue;
752 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
753 if (!Idx)
754 return std::nullopt;
755 // Undefined behavior if Idx is negative or >= Size.
756 if (Idx->getValue().uge(Size))
757 continue;
758 unsigned IntIdx = Idx->getValue().getZExtValue();
759 Mask[I] = IntIdx;
760 }
761 if (isUndefVector(Vec).all() && HasNonUndefVec)
762 continue;
763 // For correct shuffling we have to have at most 2 different vector operands
764 // in all extractelement instructions.
765 if (!Vec1 || Vec1 == Vec) {
766 Vec1 = Vec;
767 } else if (!Vec2 || Vec2 == Vec) {
768 Vec2 = Vec;
769 Mask[I] += Size;
770 } else {
771 return std::nullopt;
772 }
773 if (CommonShuffleMode == Permute)
774 continue;
775 // If the extract index is not the same as the operation number, it is a
776 // permutation.
777 if (Mask[I] % Size != I) {
778 CommonShuffleMode = Permute;
779 continue;
780 }
781 CommonShuffleMode = Select;
782 }
783 // If we're not crossing lanes in different vectors, consider it as blending.
784 if (CommonShuffleMode == Select && Vec2)
785 return TargetTransformInfo::SK_Select;
786 // If Vec2 was never used, we have a permutation of a single vector; otherwise
787 // we have a permutation of 2 vectors.
788 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
789 : TargetTransformInfo::SK_PermuteSingleSrc;
790}
791
792/// \returns True if Extract{Value,Element} instruction extracts element Idx.
793static std::optional<unsigned> getExtractIndex(Instruction *E) {
794 unsigned Opcode = E->getOpcode();
795 assert((Opcode == Instruction::ExtractElement ||
796 Opcode == Instruction::ExtractValue) &&
797 "Expected extractelement or extractvalue instruction.");
798 if (Opcode == Instruction::ExtractElement) {
799 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
800 if (!CI)
801 return std::nullopt;
802 return CI->getZExtValue();
803 }
804 auto *EI = cast<ExtractValueInst>(E);
805 if (EI->getNumIndices() != 1)
806 return std::nullopt;
807 return *EI->idx_begin();
808}
809
810namespace {
811
812/// Main data required for vectorization of instructions.
813class InstructionsState {
814 /// The main/alternate instruction. MainOp is also VL0.
815 Instruction *MainOp = nullptr;
816 Instruction *AltOp = nullptr;
817
818public:
819 Instruction *getMainOp() const { return MainOp; }
820
821 Instruction *getAltOp() const { return AltOp; }
822
823 /// The main/alternate opcodes for the list of instructions.
824 unsigned getOpcode() const {
825 return MainOp ? MainOp->getOpcode() : 0;
826 }
827
828 unsigned getAltOpcode() const {
829 return AltOp ? AltOp->getOpcode() : 0;
830 }
831
832 /// Some of the instructions in the list have alternate opcodes.
833 bool isAltShuffle() const { return AltOp != MainOp; }
834
835 bool isOpcodeOrAlt(Instruction *I) const {
836 unsigned CheckedOpcode = I->getOpcode();
837 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
838 }
839
840 InstructionsState() = delete;
841 InstructionsState(Instruction *MainOp, Instruction *AltOp)
842 : MainOp(MainOp), AltOp(AltOp) {}
843 static InstructionsState invalid() { return {nullptr, nullptr}; }
844};
845
846} // end anonymous namespace
847
848/// \returns true if \p Opcode is allowed as part of the main/alternate
849/// instruction for SLP vectorization.
850///
851/// Example of unsupported opcode is SDIV that can potentially cause UB if the
852/// "shuffled out" lane would result in division by zero.
853static bool isValidForAlternation(unsigned Opcode) {
854 if (Instruction::isIntDivRem(Opcode))
855 return false;
856
857 return true;
858}
859
860static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
861 const TargetLibraryInfo &TLI);
862
863/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
864/// compatible instructions or constants, or just some other regular values.
865static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
866 Value *Op1, const TargetLibraryInfo &TLI) {
867 return (isConstant(BaseOp0) && isConstant(Op0)) ||
868 (isConstant(BaseOp1) && isConstant(Op1)) ||
869 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
870 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
871 BaseOp0 == Op0 || BaseOp1 == Op1 ||
872 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
873 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
874}
875
876/// \returns true if a compare instruction \p CI has similar "look" and
877/// same predicate as \p BaseCI, "as is" or with its operands and predicate
878/// swapped, false otherwise.
879static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
880 const TargetLibraryInfo &TLI) {
881 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
882 "Assessing comparisons of different types?");
883 CmpInst::Predicate BasePred = BaseCI->getPredicate();
884 CmpInst::Predicate Pred = CI->getPredicate();
885 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
886
887 Value *BaseOp0 = BaseCI->getOperand(0);
888 Value *BaseOp1 = BaseCI->getOperand(1);
889 Value *Op0 = CI->getOperand(0);
890 Value *Op1 = CI->getOperand(1);
891
892 return (BasePred == Pred &&
893 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
894 (BasePred == SwappedPred &&
895 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
896}
897
898/// \returns analysis of the Instructions in \p VL described in
899/// InstructionsState, the Opcode that we suppose the whole list
900/// could be vectorized even if its structure is diverse.
901static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
902 const TargetLibraryInfo &TLI) {
903 // Make sure these are all Instructions.
904 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
905 return InstructionsState::invalid();
906
907 auto *It = find_if(VL, IsaPred<Instruction>);
908 if (It == VL.end())
909 return InstructionsState::invalid();
910
911 Value *V = *It;
912 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
913 if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
914 (VL.size() == 2 && InstCnt < 2))
915 return InstructionsState::invalid();
916
917 bool IsCastOp = isa<CastInst>(V);
918 bool IsBinOp = isa<BinaryOperator>(V);
919 bool IsCmpOp = isa<CmpInst>(V);
920 CmpInst::Predicate BasePred =
921 IsCmpOp ? cast<CmpInst>(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE;
922 unsigned Opcode = cast<Instruction>(V)->getOpcode();
923 unsigned AltOpcode = Opcode;
924 unsigned AltIndex = std::distance(VL.begin(), It);
925
926 bool SwappedPredsCompatible = [&]() {
927 if (!IsCmpOp)
928 return false;
929 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
930 UniquePreds.insert(BasePred);
931 UniqueNonSwappedPreds.insert(BasePred);
932 for (Value *V : VL) {
933 auto *I = dyn_cast<CmpInst>(V);
934 if (!I)
935 return false;
936 CmpInst::Predicate CurrentPred = I->getPredicate();
937 CmpInst::Predicate SwappedCurrentPred =
938 CmpInst::getSwappedPredicate(CurrentPred);
939 UniqueNonSwappedPreds.insert(CurrentPred);
940 if (!UniquePreds.contains(CurrentPred) &&
941 !UniquePreds.contains(SwappedCurrentPred))
942 UniquePreds.insert(CurrentPred);
943 }
944 // If the total number of predicates is > 2, but only 2 remain once swapped
945 // predicates are treated as equal, consider the swappable predicates as
946 // compatible opcodes rather than alternates.
947 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
948 }();
949 // Check for one alternate opcode from another BinaryOperator.
950 // TODO - generalize to support all operators (types, calls etc.).
951 auto *IBase = cast<Instruction>(V);
952 Intrinsic::ID BaseID = 0;
953 SmallVector<VFInfo> BaseMappings;
954 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
955 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
956 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
957 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
958 return InstructionsState::invalid();
959 }
960 bool AnyPoison = InstCnt != VL.size();
961 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
962 auto *I = dyn_cast<Instruction>(VL[Cnt]);
963 if (!I)
964 continue;
965
966 // Cannot combine poison and divisions.
967 // TODO: do some smart analysis of the CallInsts to exclude divide-like
968 // intrinsics/functions only.
969 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
970 return InstructionsState::invalid();
971 unsigned InstOpcode = I->getOpcode();
972 if (IsBinOp && isa<BinaryOperator>(I)) {
973 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
974 continue;
975 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
976 isValidForAlternation(Opcode)) {
977 AltOpcode = InstOpcode;
978 AltIndex = Cnt;
979 continue;
980 }
981 } else if (IsCastOp && isa<CastInst>(I)) {
982 Value *Op0 = IBase->getOperand(0);
983 Type *Ty0 = Op0->getType();
984 Value *Op1 = I->getOperand(0);
985 Type *Ty1 = Op1->getType();
986 if (Ty0 == Ty1) {
987 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
988 continue;
989 if (Opcode == AltOpcode) {
990 assert(isValidForAlternation(Opcode) &&
991 isValidForAlternation(InstOpcode) &&
992 "Cast isn't safe for alternation, logic needs to be updated!");
993 AltOpcode = InstOpcode;
994 AltIndex = Cnt;
995 continue;
996 }
997 }
998 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
999 auto *BaseInst = cast<CmpInst>(V);
1000 Type *Ty0 = BaseInst->getOperand(0)->getType();
1001 Type *Ty1 = Inst->getOperand(0)->getType();
1002 if (Ty0 == Ty1) {
1003 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1004 assert(InstOpcode == AltOpcode &&
1005 "Alternate instructions are only supported by BinaryOperator "
1006 "and CastInst.");
1007 // Check for compatible operands. If the corresponding operands are not
1008 // compatible - need to perform alternate vectorization.
1009 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1010 CmpInst::Predicate SwappedCurrentPred =
1011 CmpInst::getSwappedPredicate(CurrentPred);
1012
1013 if ((E == 2 || SwappedPredsCompatible) &&
1014 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1015 continue;
1016
1017 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1018 continue;
1019 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
1020 if (AltIndex) {
1021 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1022 continue;
1023 } else if (BasePred != CurrentPred) {
1024 assert(
1025 isValidForAlternation(InstOpcode) &&
1026 "CmpInst isn't safe for alternation, logic needs to be updated!");
1027 AltIndex = Cnt;
1028 continue;
1029 }
1030 CmpInst::Predicate AltPred = AltInst->getPredicate();
1031 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1032 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1033 continue;
1034 }
1035 } else if (InstOpcode == Opcode) {
1036 assert(InstOpcode == AltOpcode &&
1037 "Alternate instructions are only supported by BinaryOperator and "
1038 "CastInst.");
1039 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1040 if (Gep->getNumOperands() != 2 ||
1041 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
1042 return InstructionsState::invalid();
1043 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1044 if (!isVectorLikeInstWithConstOps(EI))
1045 return InstructionsState::invalid();
1046 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1047 auto *BaseLI = cast<LoadInst>(IBase);
1048 if (!LI->isSimple() || !BaseLI->isSimple())
1049 return InstructionsState::invalid();
1050 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1051 auto *CallBase = cast<CallInst>(IBase);
1052 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1053 return InstructionsState::invalid();
1054 if (Call->hasOperandBundles() &&
1055 (!CallBase->hasOperandBundles() ||
1056 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1057 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1058 CallBase->op_begin() +
1059 CallBase->getBundleOperandsStartIndex())))
1060 return InstructionsState::invalid();
1061 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1062 if (ID != BaseID)
1063 return InstructionsState::invalid();
1064 if (!ID) {
1065 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1066 if (Mappings.size() != BaseMappings.size() ||
1067 Mappings.front().ISA != BaseMappings.front().ISA ||
1068 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1069 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1070 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1071 Mappings.front().Shape.Parameters !=
1072 BaseMappings.front().Shape.Parameters)
1073 return InstructionsState::invalid();
1074 }
1075 }
1076 continue;
1077 }
1078 return InstructionsState::invalid();
1079 }
1080
1081 return InstructionsState(cast<Instruction>(V),
1082 cast<Instruction>(VL[AltIndex]));
1083}
1084
1085/// \returns true if all of the values in \p VL have the same type or false
1086/// otherwise.
1087static bool allSameType(ArrayRef<Value *> VL) {
1088 Type *Ty = VL.front()->getType();
1089 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1090}
1091
1092/// \returns True if in-tree use also needs extract. This refers to
1093/// possible scalar operand in vectorized instruction.
1094static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1095 TargetLibraryInfo *TLI,
1096 const TargetTransformInfo *TTI) {
1097 if (!UserInst)
1098 return false;
1099 unsigned Opcode = UserInst->getOpcode();
1100 switch (Opcode) {
1101 case Instruction::Load: {
1102 LoadInst *LI = cast<LoadInst>(UserInst);
1103 return (LI->getPointerOperand() == Scalar);
1104 }
1105 case Instruction::Store: {
1106 StoreInst *SI = cast<StoreInst>(UserInst);
1107 return (SI->getPointerOperand() == Scalar);
1108 }
1109 case Instruction::Call: {
1110 CallInst *CI = cast<CallInst>(UserInst);
1111 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1112 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1113 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1114 Arg.value().get() == Scalar;
1115 });
1116 }
1117 default:
1118 return false;
1119 }
1120}
1121
1122/// \returns the AA location that is being accessed by the instruction.
1123static MemoryLocation getLocation(Instruction *I) {
1124 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1125 return MemoryLocation::get(SI);
1126 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1127 return MemoryLocation::get(LI);
1128 return MemoryLocation();
1129}
1130
1131/// \returns True if the instruction is not a volatile or atomic load/store.
1132static bool isSimple(Instruction *I) {
1133 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1134 return LI->isSimple();
1135 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1136 return SI->isSimple();
1137 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1138 return !MI->isVolatile();
1139 return true;
1140}
1141
1142/// Shuffles \p Mask in accordance with the given \p SubMask.
1143/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1144/// one but two input vectors.
1145static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1146 bool ExtendingManyInputs = false) {
1147 if (SubMask.empty())
1148 return;
1149 assert(
1150 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1151 // Check if input scalars were extended to match the size of other node.
1152 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1153 "SubMask with many inputs support must be larger than the mask.");
1154 if (Mask.empty()) {
1155 Mask.append(SubMask.begin(), SubMask.end());
1156 return;
1157 }
1158 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1159 int TermValue = std::min(Mask.size(), SubMask.size());
1160 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1161 if (SubMask[I] == PoisonMaskElem ||
1162 (!ExtendingManyInputs &&
1163 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1164 continue;
1165 NewMask[I] = Mask[SubMask[I]];
1166 }
1167 Mask.swap(NewMask);
1168}
1169
1170/// Order may have elements assigned a special value (size) which is out of
1171/// bounds. Such indices only appear in places which correspond to undef values
1172/// (see canReuseExtract for details) and are used to prevent undef values from
1173/// affecting the ordering of operands.
1174/// The first loop below simply finds all unused indices and then the next loop
1175/// nest assigns these indices to the positions of undef values.
1176/// As an example below Order has two undef positions and they have assigned
1177/// values 3 and 7 respectively:
1178/// before: 6 9 5 4 9 2 1 0
1179/// after: 6 3 5 4 7 2 1 0
1180static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1181 const unsigned Sz = Order.size();
1182 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1183 SmallBitVector MaskedIndices(Sz);
1184 for (unsigned I = 0; I < Sz; ++I) {
1185 if (Order[I] < Sz)
1186 UnusedIndices.reset(Order[I]);
1187 else
1188 MaskedIndices.set(I);
1189 }
1190 if (MaskedIndices.none())
1191 return;
1192 assert(UnusedIndices.count() == MaskedIndices.count() &&
1193 "Non-synced masked/available indices.");
1194 int Idx = UnusedIndices.find_first();
1195 int MIdx = MaskedIndices.find_first();
1196 while (MIdx >= 0) {
1197 assert(Idx >= 0 && "Indices must be synced.");
1198 Order[MIdx] = Idx;
1199 Idx = UnusedIndices.find_next(Idx);
1200 MIdx = MaskedIndices.find_next(MIdx);
1201 }
1202}
1203
1204/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1205/// Opcode1.
1206static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1207 unsigned Opcode1) {
1208 Type *ScalarTy = VL[0]->getType();
1209 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1210 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1211 for (unsigned Lane : seq<unsigned>(VL.size())) {
1212 if (isa<PoisonValue>(VL[Lane]))
1213 continue;
1214 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1215 OpcodeMask.set(Lane * ScalarTyNumElements,
1216 Lane * ScalarTyNumElements + ScalarTyNumElements);
1217 }
1218 return OpcodeMask;
1219}
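// Illustrative example (not from the original source): for scalar
// VL = {add, sub, add, sub} with Opcode0 = Add and Opcode1 = Sub, bits 1 and 3
// of the 4-bit mask are set (the lanes that use Opcode1).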
1220
1221namespace llvm {
1222
1223static void inversePermutation(ArrayRef<unsigned> Indices,
1224 SmallVectorImpl<int> &Mask) {
1225 Mask.clear();
1226 const unsigned E = Indices.size();
1227 Mask.resize(E, PoisonMaskElem);
1228 for (unsigned I = 0; I < E; ++I)
1229 Mask[Indices[I]] = I;
1230}
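// Worked example (illustrative): Indices = {2, 0, 1} produces Mask = {1, 2, 0},
// since element I of the reordered vector comes from position Indices[I] of the
// original one.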
1231
1232/// Reorders the list of scalars in accordance with the given \p Mask.
1233static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1234 ArrayRef<int> Mask) {
1235 assert(!Mask.empty() && "Expected non-empty mask.");
1236 SmallVector<Value *> Prev(Scalars.size(),
1237 PoisonValue::get(Scalars.front()->getType()));
1238 Prev.swap(Scalars);
1239 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1240 if (Mask[I] != PoisonMaskElem)
1241 Scalars[Mask[I]] = Prev[I];
1242}
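// Worked example (illustrative): Scalars = {a, b, c} with Mask = {2, 0, 1}
// becomes {b, c, a}, because each element I is moved to position Mask[I].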
1243
1244/// Checks if the provided value does not require scheduling. It does not
1245/// require scheduling if this is not an instruction or it is an instruction
1246/// that does not read/write memory and all operands are either not instructions
1247/// or phi nodes or instructions from different blocks.
1248static bool areAllOperandsNonInsts(Value *V) {
1249 auto *I = dyn_cast<Instruction>(V);
1250 if (!I)
1251 return true;
1252 return !mayHaveNonDefUseDependency(*I) &&
1253 all_of(I->operands(), [I](Value *V) {
1254 auto *IO = dyn_cast<Instruction>(V);
1255 if (!IO)
1256 return true;
1257 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1258 });
1259}
1260
1261/// Checks if the provided value does not require scheduling. It does not
1262/// require scheduling if this is not an instruction or it is an instruction
1263/// that does not read/write memory and all users are phi nodes or instructions
1264/// from the different blocks.
1265static bool isUsedOutsideBlock(Value *V) {
1266 auto *I = dyn_cast<Instruction>(V);
1267 if (!I)
1268 return true;
1269 // Limits the number of uses to save compile time.
1270 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1271 all_of(I->users(), [I](User *U) {
1272 auto *IU = dyn_cast<Instruction>(U);
1273 if (!IU)
1274 return true;
1275 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1276 });
1277}
1278
1279/// Checks if the specified value does not require scheduling. It does not
1280/// require scheduling if all operands and all users do not need to be scheduled
1281/// in the current basic block.
1282static bool doesNotNeedToBeScheduled(Value *V) {
1283 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1284}
1285
1286/// Checks if the specified array of instructions does not require scheduling.
1287/// This is the case if every instruction either has operands that do not
1288/// require scheduling, or has users that do not require scheduling because
1289/// they are phis or reside in other basic blocks.
1290static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1291 return !VL.empty() &&
1292 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1293}
1294
1295/// Returns true if widened type of \p Ty elements with size \p Sz represents
1296/// full vector type, i.e. adding extra element results in extra parts upon type
1297/// legalization.
1298static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1299 unsigned Sz) {
1300 if (Sz <= 1)
1301 return false;
1302 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1303 return false;
1304 if (has_single_bit(Sz))
1305 return true;
1306 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1307 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1308 Sz % NumParts == 0;
1309}
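// Worked examples (illustrative, assuming 128-bit vector registers and i32
// elements): Sz = 8 is accepted (power of two); Sz = 12 is accepted because it
// splits into 3 whole <4 x i32> parts; Sz = 6 is rejected since 6 / 2 = 3 is
// not a power of two.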
1310
1311namespace slpvectorizer {
1312
1313/// Bottom Up SLP Vectorizer.
1314class BoUpSLP {
1315 struct TreeEntry;
1316 struct ScheduleData;
1317 class ShuffleCostEstimator;
1318 class ShuffleInstructionBuilder;
1319
1320public:
1321 /// Tracks the state we can represent the loads in the given sequence.
1322 enum class LoadsState {
1323 Gather,
1324 Vectorize,
1325 ScatterVectorize,
1326 StridedVectorize
1327 };
1328
1335
1337 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1340 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1341 AC(AC), DB(DB), DL(DL), ORE(ORE),
1342 Builder(Se->getContext(), TargetFolder(*DL)) {
1343 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1344 // Use the vector register size specified by the target unless overridden
1345 // by a command-line option.
1346 // TODO: It would be better to limit the vectorization factor based on
1347 // data type rather than just register size. For example, x86 AVX has
1348 // 256-bit registers, but it does not support integer operations
1349 // at that width (that requires AVX2).
1350 if (MaxVectorRegSizeOption.getNumOccurrences())
1351 MaxVecRegSize = MaxVectorRegSizeOption;
1352 else
1353 MaxVecRegSize =
1354 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1355 .getFixedValue();
1356
1357 if (MinVectorRegSizeOption.getNumOccurrences())
1358 MinVecRegSize = MinVectorRegSizeOption;
1359 else
1360 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1361 }
1362
1363 /// Vectorize the tree that starts with the elements in \p VL.
1364 /// Returns the vectorized root.
1365 Value *vectorizeTree();
1366
1367 /// Vectorize the tree but with the list of externally used values \p
1368 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1369 /// generated extractvalue instructions.
1370 Value *
1371 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1372 Instruction *ReductionRoot = nullptr);
1373
1374 /// \returns the cost incurred by unwanted spills and fills, caused by
1375 /// holding live values over call sites.
1376 InstructionCost getSpillCost() const;
1377
1378 /// \returns the vectorization cost of the subtree that starts at \p VL.
1379 /// A negative number means that this is profitable.
1380 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1381
1382 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1383 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1384 void buildTree(ArrayRef<Value *> Roots,
1385 const SmallDenseSet<Value *> &UserIgnoreLst);
1386
1387 /// Construct a vectorizable tree that starts at \p Roots.
1388 void buildTree(ArrayRef<Value *> Roots);
1389
1390 /// Returns whether the root node has in-tree uses.
1391 bool doesRootHaveInTreeUses() const {
1392 return !VectorizableTree.empty() &&
1393 !VectorizableTree.front()->UserTreeIndices.empty();
1394 }
1395
1396 /// Return the scalars of the root node.
1397 ArrayRef<Value *> getRootNodeScalars() const {
1398 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1399 return VectorizableTree.front()->Scalars;
1400 }
1401
1402 /// Returns the type/is-signed info for the root node in the graph without
1403 /// casting.
1404 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1405 const TreeEntry &Root = *VectorizableTree.front().get();
1406 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1407 !Root.Scalars.front()->getType()->isIntegerTy())
1408 return std::nullopt;
1409 auto It = MinBWs.find(&Root);
1410 if (It != MinBWs.end())
1411 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1412 It->second.first),
1413 It->second.second);
1414 if (Root.getOpcode() == Instruction::ZExt ||
1415 Root.getOpcode() == Instruction::SExt)
1416 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1417 Root.getOpcode() == Instruction::SExt);
1418 return std::nullopt;
1419 }
1420
1421 /// Checks if the root graph node can be emitted with narrower bitwidth at
1422 /// codegen and returns its signedness, if so.
1423 bool isSignedMinBitwidthRootNode() const {
1424 return MinBWs.at(VectorizableTree.front().get()).second;
1425 }
1426
1427 /// Returns the reduction type after minbitwidth analysis.
1428 FixedVectorType *getReductionType() const {
1429 if (ReductionBitWidth == 0 ||
1430 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1431 ReductionBitWidth >=
1432 DL->getTypeSizeInBits(
1433 VectorizableTree.front()->Scalars.front()->getType()))
1434 return getWidenedType(
1435 VectorizableTree.front()->Scalars.front()->getType(),
1436 VectorizableTree.front()->getVectorFactor());
1437 return getWidenedType(
1438 IntegerType::get(
1439 VectorizableTree.front()->Scalars.front()->getContext(),
1440 ReductionBitWidth),
1441 VectorizableTree.front()->getVectorFactor());
1442 }
1443
1444 /// Builds external uses of the vectorized scalars, i.e. the list of
1445 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1446 /// ExternallyUsedValues contains additional list of external uses to handle
1447 /// vectorization of reductions.
1448 void
1449 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1450
1451 /// Transforms graph nodes to target specific representations, if profitable.
1452 void transformNodes();
1453
1454 /// Clear the internal data structures that are created by 'buildTree'.
1455 void deleteTree() {
1456 VectorizableTree.clear();
1457 ScalarToTreeEntry.clear();
1458 MultiNodeScalars.clear();
1459 MustGather.clear();
1460 NonScheduledFirst.clear();
1461 EntryToLastInstruction.clear();
1462 LoadEntriesToVectorize.clear();
1463 IsGraphTransformMode = false;
1464 GatheredLoadsEntriesFirst.reset();
1465 ExternalUses.clear();
1466 ExternalUsesAsOriginalScalar.clear();
1467 for (auto &Iter : BlocksSchedules) {
1468 BlockScheduling *BS = Iter.second.get();
1469 BS->clear();
1470 }
1471 MinBWs.clear();
1472 ReductionBitWidth = 0;
1473 BaseGraphSize = 1;
1474 CastMaxMinBWSizes.reset();
1475 ExtraBitWidthNodes.clear();
1476 InstrElementSize.clear();
1477 UserIgnoreList = nullptr;
1478 PostponedGathers.clear();
1479 ValueToGatherNodes.clear();
1480 }
1481
1482 unsigned getTreeSize() const { return VectorizableTree.size(); }
1483
1484 /// Returns the base graph size, before any transformations.
1485 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1486
1487 /// Perform LICM and CSE on the newly generated gather sequences.
1488 void optimizeGatherSequence();
1489
1490 /// Does this non-empty order represent an identity order? Identity
1491 /// should be represented as an empty order, so this is used to
1492 /// decide if we can canonicalize a computed order. Undef elements
1493 /// (represented as size) are ignored.
1494 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
1495 assert(!Order.empty() && "expected non-empty order");
1496 const unsigned Sz = Order.size();
1497 return all_of(enumerate(Order), [&](const auto &P) {
1498 return P.value() == P.index() || P.value() == Sz;
1499 });
1500 }
1501
1502 /// Checks if the specified gather tree entry \p TE can be represented as a
1503 /// shuffled vector entry + (possibly) permutation with other gathers. It
1504 /// implements the checks only for possibly ordered scalars (Loads,
1505 /// ExtractElement, ExtractValue), which can be part of the graph.
1506 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1507
1508 /// Sort loads into increasing pointers offsets to allow greater clustering.
1509 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1510
1511 /// Gets reordering data for the given tree entry. If the entry is vectorized
1512 /// - just return ReorderIndices, otherwise check if the scalars can be
1513 /// reordered and return the most optimal order.
1514 /// \return std::nullopt if ordering is not important, empty order, if
1515 /// identity order is important, or the actual order.
1516 /// \param TopToBottom If true, include the order of vectorized stores and
1517 /// insertelement nodes, otherwise skip them.
1518 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1519 bool TopToBottom);
1520
1521 /// Reorders the current graph to the most profitable order starting from the
1522 /// root node to the leaf nodes. The best order is chosen only from the nodes
1523 /// of the same size (vectorization factor). Smaller nodes are considered
1524 /// parts of a subgraph with smaller VF and they are reordered independently. We
1525 /// can do this because we still need to extend smaller nodes to the wider VF
1526 /// and we can merge reordering shuffles with the widening shuffles.
1527 void reorderTopToBottom();
1528
1529 /// Reorders the current graph to the most profitable order starting from
1530 /// leaves to the root. It allows rotating small subgraphs and reduces the
1531 /// number of reshuffles if the leaf nodes use the same order. In this case we
1532 /// can merge the orders and just shuffle the user node instead of shuffling its
1533 /// operands. Plus, even if the leaf nodes have different orders, it allows
1534 /// sinking the reordering in the graph closer to the root node and merging it later
1535 /// during analysis.
1536 void reorderBottomToTop(bool IgnoreReorder = false);
1537
1538 /// \return The vector element size in bits to use when vectorizing the
1539 /// expression tree ending at \p V. If V is a store, the size is the width of
1540 /// the stored value. Otherwise, the size is the width of the largest loaded
1541 /// value reaching V. This method is used by the vectorizer to calculate
1542 /// vectorization factors.
1543 unsigned getVectorElementSize(Value *V);
1544
1545 /// Compute the minimum type sizes required to represent the entries in a
1546 /// vectorizable tree.
1547 void computeMinimumValueSizes();
1548
1549 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1550 unsigned getMaxVecRegSize() const {
1551 return MaxVecRegSize;
1552 }
1553
1554 // \returns minimum vector register size as set by cl::opt.
1555 unsigned getMinVecRegSize() const {
1556 return MinVecRegSize;
1557 }
1558
1559 unsigned getMinVF(unsigned Sz) const {
1560 return std::max(2U, getMinVecRegSize() / Sz);
1561 }
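// Illustrative example (not from the original source): with the default
// MinVecRegSize of 128 bits and a 32-bit element size, getMinVF(32) returns
// max(2, 128 / 32) = 4.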
1562
1563 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1564 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1565 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1566 return MaxVF ? MaxVF : UINT_MAX;
1567 }
1568
1569 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1570 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1571 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1572 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1573 ///
1574 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1575 unsigned canMapToVector(Type *T) const;
1576
1577 /// \returns True if the VectorizableTree is both tiny and not fully
1578 /// vectorizable. We do not vectorize such trees.
1579 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1580
1581 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1582 /// It may happen, if all gather nodes are loads and they cannot be
1583 /// "clusterized". In this case even subgraphs cannot be vectorized more
1584 /// effectively than the base graph.
1585 bool isTreeNotExtendable() const;
1586
1587 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1588 /// can be load combined in the backend. Load combining may not be allowed in
1589 /// the IR optimizer, so we do not want to alter the pattern. For example,
1590 /// partially transforming a scalar bswap() pattern into vector code is
1591 /// effectively impossible for the backend to undo.
1592 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1593 /// may not be necessary.
1594 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
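To make the concern concrete, here is a sketch (illustrative C++, not code from this pass) of the kind of scalar pattern these checks protect: an or-reduction of shifted, zero-extended loads that the backend can later fold into one wide load plus a bswap.

    #include <cstdint>

    // Bytewise big-endian load: the backend can combine the four narrow loads,
    // shifts and ors into a single 32-bit load plus a byte swap, which a
    // partial vectorization of the or-reduction would make impossible.
    uint32_t load_be32(const uint8_t *P) {
      return (uint32_t(P[0]) << 24) | (uint32_t(P[1]) << 16) |
             (uint32_t(P[2]) << 8) | uint32_t(P[3]);
    }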
1595
1596 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1597 /// can be load combined in the backend. Load combining may not be allowed in
1598 /// the IR optimizer, so we do not want to alter the pattern. For example,
1599 /// partially transforming a scalar bswap() pattern into vector code is
1600 /// effectively impossible for the backend to undo.
1601 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1602 /// may not be necessary.
1603 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1604
1605 /// Checks if the given array of loads can be represented as a vectorized,
1606 /// scatter or just simple gather.
1607 /// \param VL list of loads.
1608 /// \param VL0 main load value.
1609 /// \param Order returned order of load instructions.
1610 /// \param PointerOps returned list of pointer operands.
1611 /// \param BestVF return best vector factor, if recursive check found better
1612 /// vectorization sequences rather than masked gather.
1613 /// \param TryRecursiveCheck used to check if long masked gather can be
 1614 /// represented as a series of loads/insert-subvector operations, if profitable.
 1615 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
 1616 SmallVectorImpl<unsigned> &Order,
 1617 SmallVectorImpl<Value *> &PointerOps,
1618 unsigned *BestVF = nullptr,
1619 bool TryRecursiveCheck = true) const;
1620
 1621 /// Registers a non-vectorizable sequence of loads.
1622 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1623 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1624 }
1625
 1626 /// Checks if the given sequence of loads is already known to be non-vectorizable.
 1627 template <typename T>
 1628 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
 1629 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1630 }
1631
1633
1634 /// This structure holds any data we need about the edges being traversed
1635 /// during buildTree_rec(). We keep track of:
1636 /// (i) the user TreeEntry index, and
1637 /// (ii) the index of the edge.
1638 struct EdgeInfo {
1639 EdgeInfo() = default;
 1640 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
 1641 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1642 /// The user TreeEntry.
1643 TreeEntry *UserTE = nullptr;
1644 /// The operand index of the use.
1645 unsigned EdgeIdx = UINT_MAX;
1646#ifndef NDEBUG
 1647 friend inline raw_ostream &operator<<(raw_ostream &OS,
 1648 const BoUpSLP::EdgeInfo &EI) {
1649 EI.dump(OS);
1650 return OS;
1651 }
1652 /// Debug print.
1653 void dump(raw_ostream &OS) const {
1654 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1655 << " EdgeIdx:" << EdgeIdx << "}";
1656 }
1657 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1658#endif
1659 bool operator == (const EdgeInfo &Other) const {
1660 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1661 }
1662 };
1663
1664 /// A helper class used for scoring candidates for two consecutive lanes.
 1665 class LookAheadHeuristics {
 1666 const TargetLibraryInfo &TLI;
1667 const DataLayout &DL;
1668 ScalarEvolution &SE;
1669 const BoUpSLP &R;
1670 int NumLanes; // Total number of lanes (aka vectorization factor).
1671 int MaxLevel; // The maximum recursion depth for accumulating score.
1672
1673 public:
 1674 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
 1675 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1676 int MaxLevel)
1677 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1678 MaxLevel(MaxLevel) {}
1679
 1680 // The hard-coded scores listed here are not very important, though they
 1681 // should be higher for better matches to improve the resulting cost. When
1682 // computing the scores of matching one sub-tree with another, we are
1683 // basically counting the number of values that are matching. So even if all
1684 // scores are set to 1, we would still get a decent matching result.
1685 // However, sometimes we have to break ties. For example we may have to
1686 // choose between matching loads vs matching opcodes. This is what these
1687 // scores are helping us with: they provide the order of preference. Also,
1688 // this is important if the scalar is externally used or used in another
 1689 // tree entry node in a different lane.
1690
1691 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1692 static const int ScoreConsecutiveLoads = 4;
 1693 /// The same load multiple times. This should have a better score than
 1694 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
 1695 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
 1696 /// a vector load and 1.0 for a broadcast.
1697 static const int ScoreSplatLoads = 3;
1698 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1699 static const int ScoreReversedLoads = 3;
1700 /// A load candidate for masked gather.
1701 static const int ScoreMaskedGatherCandidate = 1;
1702 /// ExtractElementInst from same vector and consecutive indexes.
1703 static const int ScoreConsecutiveExtracts = 4;
1704 /// ExtractElementInst from same vector and reversed indices.
1705 static const int ScoreReversedExtracts = 3;
1706 /// Constants.
1707 static const int ScoreConstants = 2;
1708 /// Instructions with the same opcode.
1709 static const int ScoreSameOpcode = 2;
 1710 /// Instructions with alternate opcodes (e.g., add + sub).
1711 static const int ScoreAltOpcodes = 1;
1712 /// Identical instructions (a.k.a. splat or broadcast).
1713 static const int ScoreSplat = 1;
1714 /// Matching with an undef is preferable to failing.
1715 static const int ScoreUndef = 1;
1716 /// Score for failing to find a decent match.
1717 static const int ScoreFail = 0;
1718 /// Score if all users are vectorized.
1719 static const int ScoreAllUserVectorized = 1;
1720
1721 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1722 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1723 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1724 /// MainAltOps.
 1725 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
 1726 ArrayRef<Value *> MainAltOps) const {
1727 if (!isValidElementType(V1->getType()) ||
1728 !isValidElementType(V2->getType()))
 1729 return LookAheadHeuristics::ScoreFail;
 1730
1731 if (V1 == V2) {
1732 if (isa<LoadInst>(V1)) {
 1733 // Returns true if the users of V1 and V2 won't need to be extracted.
1734 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1735 // Bail out if we have too many uses to save compilation time.
1736 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1737 return false;
1738
1739 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1740 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1741 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1742 });
1743 };
1744 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1745 };
1746 // A broadcast of a load can be cheaper on some targets.
1747 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1748 ElementCount::getFixed(NumLanes)) &&
1749 ((int)V1->getNumUses() == NumLanes ||
1750 AllUsersAreInternal(V1, V2)))
 1751 return LookAheadHeuristics::ScoreSplatLoads;
 1752 }
 1753 return LookAheadHeuristics::ScoreSplat;
 1754
1755
1756 auto CheckSameEntryOrFail = [&]() {
1757 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1758 TE1 && TE1 == R.getTreeEntry(V2))
 1759 return LookAheadHeuristics::ScoreSplatLoads;
 1760 return LookAheadHeuristics::ScoreFail;
 1761 };
1762
1763 auto *LI1 = dyn_cast<LoadInst>(V1);
1764 auto *LI2 = dyn_cast<LoadInst>(V2);
1765 if (LI1 && LI2) {
1766 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1767 !LI2->isSimple())
1768 return CheckSameEntryOrFail();
1769
1770 std::optional<int> Dist = getPointersDiff(
1771 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1772 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1773 if (!Dist || *Dist == 0) {
1774 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1775 getUnderlyingObject(LI2->getPointerOperand()) &&
1776 R.TTI->isLegalMaskedGather(
1777 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
 1778 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
 1779 return CheckSameEntryOrFail();
1780 }
1781 // The distance is too large - still may be profitable to use masked
1782 // loads/gathers.
 1783 if (std::abs(*Dist) > NumLanes / 2)
 1784 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
 1785 // This will still detect consecutive loads, but we might have "holes"
 1786 // in some cases. It is ok for non-power-of-2 vectorization and may produce
 1787 // better results. It should not affect current vectorization.
 1788 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
 1789 : LookAheadHeuristics::ScoreReversedLoads;
 1790 }
1791
1792 auto *C1 = dyn_cast<Constant>(V1);
1793 auto *C2 = dyn_cast<Constant>(V2);
1794 if (C1 && C2)
 1795 return LookAheadHeuristics::ScoreConstants;
 1796
 1797 // Extracts from consecutive indexes of the same vector get a better score,
 1798 // as the extracts could be optimized away.
1799 Value *EV1;
1800 ConstantInt *Ex1Idx;
1801 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1802 // Undefs are always profitable for extractelements.
1803 // Compiler can easily combine poison and extractelement <non-poison> or
1804 // undef and extractelement <poison>. But combining undef +
1805 // extractelement <non-poison-but-may-produce-poison> requires some
1806 // extra operations.
1807 if (isa<UndefValue>(V2))
 1808 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
 1809 ? LookAheadHeuristics::ScoreConsecutiveExtracts
 1810 : LookAheadHeuristics::ScoreSameOpcode;
1811 Value *EV2 = nullptr;
1812 ConstantInt *Ex2Idx = nullptr;
1813 if (match(V2,
 1814 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
 1815 m_Undef())))) {
1816 // Undefs are always profitable for extractelements.
 1817 if (!Ex2Idx)
 1818 return LookAheadHeuristics::ScoreSameOpcode;
 1819 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
 1820 return LookAheadHeuristics::ScoreSameOpcode;
1821 if (EV2 == EV1) {
1822 int Idx1 = Ex1Idx->getZExtValue();
1823 int Idx2 = Ex2Idx->getZExtValue();
1824 int Dist = Idx2 - Idx1;
1825 // The distance is too large - still may be profitable to use
1826 // shuffles.
 1827 if (std::abs(Dist) == 0)
 1828 return LookAheadHeuristics::ScoreSplat;
 1829 if (std::abs(Dist) > NumLanes / 2)
 1830 return LookAheadHeuristics::ScoreSameOpcode;
 1831 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
 1832 : LookAheadHeuristics::ScoreReversedExtracts;
 1833 }
 1834 return LookAheadHeuristics::ScoreAltOpcodes;
 1835 }
1836 return CheckSameEntryOrFail();
1837 }
1838
1839 auto *I1 = dyn_cast<Instruction>(V1);
1840 auto *I2 = dyn_cast<Instruction>(V2);
1841 if (I1 && I2) {
1842 if (I1->getParent() != I2->getParent())
1843 return CheckSameEntryOrFail();
1844 SmallVector<Value *, 4> Ops(MainAltOps);
1845 Ops.push_back(I1);
1846 Ops.push_back(I2);
1847 InstructionsState S = getSameOpcode(Ops, TLI);
1848 // Note: Only consider instructions with <= 2 operands to avoid
1849 // complexity explosion.
1850 if (S.getOpcode() &&
1851 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1852 !S.isAltShuffle()) &&
1853 all_of(Ops, [&S](Value *V) {
1854 return isa<PoisonValue>(V) ||
1855 cast<Instruction>(V)->getNumOperands() ==
1856 S.getMainOp()->getNumOperands();
1857 }))
1858 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
 1859 : LookAheadHeuristics::ScoreSameOpcode;
 1860 }
1861
1862 if (I1 && isa<PoisonValue>(V2))
 1863 return LookAheadHeuristics::ScoreSameOpcode;
 1864
1865 if (isa<UndefValue>(V2))
 1866 return LookAheadHeuristics::ScoreUndef;
 1867
1868 return CheckSameEntryOrFail();
1869 }
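To make the preference order concrete, here are a few representative outcomes of the shallow score above (a plain reading of the constants, not additional logic): load(A[i]) paired with load(A[i+1]) scores ScoreConsecutiveLoads (4); the reversed pair scores ScoreReversedLoads (3); two constants score ScoreConstants (2); two distinct instructions with the same opcode score ScoreSameOpcode (2); an add paired with a sub in an alternate-opcode bundle scores ScoreAltOpcodes (1); pairing with an undef scores ScoreUndef (1); and a pair that cannot be matched falls back to CheckSameEntryOrFail(), which usually yields ScoreFail (0).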
1870
1871 /// Go through the operands of \p LHS and \p RHS recursively until
 1872 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1873 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1874 /// of \p U1 and \p U2), except at the beginning of the recursion where
1875 /// these are set to nullptr.
1876 ///
1877 /// For example:
1878 /// \verbatim
1879 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1880 /// \ / \ / \ / \ /
1881 /// + + + +
1882 /// G1 G2 G3 G4
1883 /// \endverbatim
1884 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1885 /// each level recursively, accumulating the score. It starts from matching
1886 /// the additions at level 0, then moves on to the loads (level 1). The
1887 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1888 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1889 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1890 /// Please note that the order of the operands does not matter, as we
1891 /// evaluate the score of all profitable combinations of operands. In
1892 /// other words the score of G1 and G4 is the same as G1 and G2. This
1893 /// heuristic is based on ideas described in:
1894 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1895 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1896 /// Luís F. W. Góes
 1897 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
 1898 Instruction *U2, int CurrLevel,
1899 ArrayRef<Value *> MainAltOps) const {
1900
1901 // Get the shallow score of V1 and V2.
1902 int ShallowScoreAtThisLevel =
1903 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1904
 1905 // If we reached MaxLevel,
 1906 // or if LHS and RHS are not instructions,
 1907 // or if they are SPLAT,
 1908 // or if they are not consecutive,
 1909 // or if it is profitable to vectorize loads or extractelements, return
 1910 // the current cost early.
1911 auto *I1 = dyn_cast<Instruction>(LHS);
1912 auto *I2 = dyn_cast<Instruction>(RHS);
1913 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1914 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1915 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1916 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1917 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1918 ShallowScoreAtThisLevel))
1919 return ShallowScoreAtThisLevel;
1920 assert(I1 && I2 && "Should have early exited.");
1921
1922 // Contains the I2 operand indexes that got matched with I1 operands.
1923 SmallSet<unsigned, 4> Op2Used;
1924
1925 // Recursion towards the operands of I1 and I2. We are trying all possible
1926 // operand pairs, and keeping track of the best score.
1927 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1928 OpIdx1 != NumOperands1; ++OpIdx1) {
 1929 // Try to pair the operand OpIdx1 of I1 with the best operand of I2.
1930 int MaxTmpScore = 0;
1931 unsigned MaxOpIdx2 = 0;
1932 bool FoundBest = false;
1933 // If I2 is commutative try all combinations.
1934 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1935 unsigned ToIdx = isCommutative(I2)
1936 ? I2->getNumOperands()
1937 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1938 assert(FromIdx <= ToIdx && "Bad index");
1939 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1940 // Skip operands already paired with OpIdx1.
1941 if (Op2Used.count(OpIdx2))
1942 continue;
1943 // Recursively calculate the cost at each level
1944 int TmpScore =
1945 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1946 I1, I2, CurrLevel + 1, {});
1947 // Look for the best score.
1948 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1949 TmpScore > MaxTmpScore) {
1950 MaxTmpScore = TmpScore;
1951 MaxOpIdx2 = OpIdx2;
1952 FoundBest = true;
1953 }
1954 }
1955 if (FoundBest) {
1956 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1957 Op2Used.insert(MaxOpIdx2);
1958 ShallowScoreAtThisLevel += MaxTmpScore;
1959 }
1960 }
1961 return ShallowScoreAtThisLevel;
1962 }
1963 };
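As a worked example of the recursion, using the G1..G4 diagram from the getScoreAtLevelRec() documentation and ignoring the splat and external-use adjustments applied by the callers: matching G1 with G2 scores ScoreSameOpcode (2) for the two additions at level 0, then adds ScoreConsecutiveLoads (4) for the best pairing {A[0], A[1]} and another 4 for {B[0], B[1]}, for a cumulative score of 10. Matching G1 with G3 keeps only the level-0 score of 2, because (assuming A, B, C and D are unrelated arrays) neither {A[0], C[0]} nor {B[0], D[0]} forms a consecutive or otherwise matching pair.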
1964 /// A helper data structure to hold the operands of a vector of instructions.
1965 /// This supports a fixed vector length for all operand vectors.
1967 /// For each operand we need (i) the value, and (ii) the opcode that it
1968 /// would be attached to if the expression was in a left-linearized form.
1969 /// This is required to avoid illegal operand reordering.
1970 /// For example:
1971 /// \verbatim
1972 /// 0 Op1
1973 /// |/
1974 /// Op1 Op2 Linearized + Op2
1975 /// \ / ----------> |/
1976 /// - -
1977 ///
1978 /// Op1 - Op2 (0 + Op1) - Op2
1979 /// \endverbatim
1980 ///
1981 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1982 ///
1983 /// Another way to think of this is to track all the operations across the
1984 /// path from the operand all the way to the root of the tree and to
1985 /// calculate the operation that corresponds to this path. For example, the
1986 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1987 /// corresponding operation is a '-' (which matches the one in the
1988 /// linearized tree, as shown above).
1989 ///
1990 /// For lack of a better term, we refer to this operation as Accumulated
1991 /// Path Operation (APO).
1992 struct OperandData {
1993 OperandData() = default;
1994 OperandData(Value *V, bool APO, bool IsUsed)
1995 : V(V), APO(APO), IsUsed(IsUsed) {}
1996 /// The operand value.
1997 Value *V = nullptr;
1998 /// TreeEntries only allow a single opcode, or an alternate sequence of
 1999 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
 2000 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
 2001 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
 2002 /// (e.g., Add/Mul).
2003 bool APO = false;
2004 /// Helper data for the reordering function.
2005 bool IsUsed = false;
2006 };
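For a concrete reading of the APO flag (an illustrative example following the convention implemented in appendOperandsOfVL() below): take the two-lane bundle {a0 + b0, a1 - b1}. Operand 0 is never attached to an inverse operation in the left-linearized form, so a0 and a1 both get APO = false. Operand 1 follows the lane's own operation: b0 gets APO = false because lane 0 is a commutative add, while b1 gets APO = true because lane 1 linearizes to (0 + a1) - b1. Reordering may then only exchange operands whose APO bits agree, which is exactly the check performed later in getBestOperand().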
2007
 2008 /// During operand reordering, we are trying to select the operand in each lane
 2009 /// that best matches the operand in the neighboring lane. Our
2010 /// selection is based on the type of value we are looking for. For example,
2011 /// if the neighboring lane has a load, we need to look for a load that is
2012 /// accessing a consecutive address. These strategies are summarized in the
2013 /// 'ReorderingMode' enumerator.
2014 enum class ReorderingMode {
2015 Load, ///< Matching loads to consecutive memory addresses
2016 Opcode, ///< Matching instructions based on opcode (same or alternate)
2017 Constant, ///< Matching constants
2018 Splat, ///< Matching the same instruction multiple times (broadcast)
2019 Failed, ///< We failed to create a vectorizable group
2020 };
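For example (mirroring how the modes are initialized at the start of reorder() below, stated informally): if the operands of the first visited lane are a load and a constant, operand index 0 is assigned ReorderingMode::Load, so the remaining lanes are searched for loads from consecutive addresses, and operand index 1 is assigned ReorderingMode::Constant. An operand that looks like it should be broadcast is assigned Splat instead, and Failed marks an operand index for which no vectorizable grouping could be found.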
2021
 2022 using OperandDataVec = SmallVector<OperandData, 2>;
 2023
2024 /// A vector of operand vectors.
 2025 SmallVector<OperandDataVec, 4> OpsVec;
 2026 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2027 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2028 unsigned ArgSize = 0;
2029
2030 const TargetLibraryInfo &TLI;
2031 const DataLayout &DL;
2032 ScalarEvolution &SE;
2033 const BoUpSLP &R;
2034 const Loop *L = nullptr;
2035
2036 /// \returns the operand data at \p OpIdx and \p Lane.
2037 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2038 return OpsVec[OpIdx][Lane];
2039 }
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2042 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2043 return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// Clears the used flag for all entries.
2047 void clearUsed() {
2048 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2049 OpIdx != NumOperands; ++OpIdx)
2050 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2051 ++Lane)
2052 OpsVec[OpIdx][Lane].IsUsed = false;
2053 }
2054
2055 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2056 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2057 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2058 }
2059
2060 /// \param Lane lane of the operands under analysis.
2061 /// \param OpIdx operand index in \p Lane lane we're looking the best
2062 /// candidate for.
2063 /// \param Idx operand index of the current candidate value.
2064 /// \returns The additional score due to possible broadcasting of the
2065 /// elements in the lane. It is more profitable to have power-of-2 unique
 2066 /// elements in the lane, as it will be vectorized with higher probability
2067 /// after removing duplicates. Currently the SLP vectorizer supports only
2068 /// vectorization of the power-of-2 number of unique scalars.
2069 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2070 const SmallBitVector &UsedLanes) const {
2071 Value *IdxLaneV = getData(Idx, Lane).V;
2072 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2073 isa<ExtractElementInst>(IdxLaneV))
2074 return 0;
2076 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2077 if (Ln == Lane)
2078 continue;
2079 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2080 if (!isa<Instruction>(OpIdxLnV))
2081 return 0;
2082 Uniques.try_emplace(OpIdxLnV, Ln);
2083 }
2084 unsigned UniquesCount = Uniques.size();
2085 auto IdxIt = Uniques.find(IdxLaneV);
2086 unsigned UniquesCntWithIdxLaneV =
2087 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2088 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2089 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2090 unsigned UniquesCntWithOpIdxLaneV =
2091 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2092 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2093 return 0;
2094 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2095 UniquesCntWithOpIdxLaneV,
2096 UniquesCntWithOpIdxLaneV -
2097 bit_floor(UniquesCntWithOpIdxLaneV)) -
2098 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2099 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2100 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2101 }
2102
2103 /// \param Lane lane of the operands under analysis.
2104 /// \param OpIdx operand index in \p Lane lane we're looking the best
2105 /// candidate for.
2106 /// \param Idx operand index of the current candidate value.
2107 /// \returns The additional score for the scalar which users are all
2108 /// vectorized.
2109 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2110 Value *IdxLaneV = getData(Idx, Lane).V;
2111 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2112 // Do not care about number of uses for vector-like instructions
2113 // (extractelement/extractvalue with constant indices), they are extracts
2114 // themselves and already externally used. Vectorization of such
2115 // instructions does not add extra extractelement instruction, just may
2116 // remove it.
2117 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2118 isVectorLikeInstWithConstOps(OpIdxLaneV))
 2119 return LookAheadHeuristics::ScoreAllUserVectorized;
 2120 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2121 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2122 return 0;
2123 return R.areAllUsersVectorized(IdxLaneI)
 2124 ? LookAheadHeuristics::ScoreAllUserVectorized
 2125 : 0;
2126 }
2127
2128 /// Score scaling factor for fully compatible instructions but with
2129 /// different number of external uses. Allows better selection of the
2130 /// instructions with less external uses.
2131 static const int ScoreScaleFactor = 10;
2132
2133 /// \Returns the look-ahead score, which tells us how much the sub-trees
2134 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2135 /// score. This helps break ties in an informed way when we cannot decide on
2136 /// the order of the operands by just considering the immediate
2137 /// predecessors.
2138 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2139 int Lane, unsigned OpIdx, unsigned Idx,
2140 bool &IsUsed, const SmallBitVector &UsedLanes) {
2141 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2143 // Keep track of the instruction stack as we recurse into the operands
2144 // during the look-ahead score exploration.
2145 int Score =
2146 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2147 /*CurrLevel=*/1, MainAltOps);
2148 if (Score) {
2149 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2150 if (Score <= -SplatScore) {
2151 // Failed score.
2152 Score = 0;
2153 } else {
2154 Score += SplatScore;
2155 // Scale score to see the difference between different operands
2156 // and similar operands but all vectorized/not all vectorized
2157 // uses. It does not affect actual selection of the best
2158 // compatible operand in general, just allows to select the
2159 // operand with all vectorized uses.
2160 Score *= ScoreScaleFactor;
2161 Score += getExternalUseScore(Lane, OpIdx, Idx);
2162 IsUsed = true;
2163 }
2164 }
2165 return Score;
2166 }
2167
2168 /// Best defined scores per lanes between the passes. Used to choose the
2169 /// best operand (with the highest score) between the passes.
2170 /// The key - {Operand Index, Lane}.
2171 /// The value - the best score between the passes for the lane and the
2172 /// operand.
2174 BestScoresPerLanes;
2175
2176 // Search all operands in Ops[*][Lane] for the one that matches best
 2177 // Ops[OpIdx][LastLane] and return its operand index.
2178 // If no good match can be found, return std::nullopt.
2179 std::optional<unsigned>
2180 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2181 ArrayRef<ReorderingMode> ReorderingModes,
2182 ArrayRef<Value *> MainAltOps,
2183 const SmallBitVector &UsedLanes) {
2184 unsigned NumOperands = getNumOperands();
2185
2186 // The operand of the previous lane at OpIdx.
2187 Value *OpLastLane = getData(OpIdx, LastLane).V;
2188
2189 // Our strategy mode for OpIdx.
2190 ReorderingMode RMode = ReorderingModes[OpIdx];
2191 if (RMode == ReorderingMode::Failed)
2192 return std::nullopt;
2193
2194 // The linearized opcode of the operand at OpIdx, Lane.
2195 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2196
2197 // The best operand index and its score.
2198 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2199 // are using the score to differentiate between the two.
2200 struct BestOpData {
2201 std::optional<unsigned> Idx;
2202 unsigned Score = 0;
2203 } BestOp;
2204 BestOp.Score =
2205 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2206 .first->second;
2207
 2208 // Track if the operand must be marked as used. If the operand is set to
 2209 // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
 2210 // want to re-estimate the operands again on the following iterations.
2211 bool IsUsed = RMode == ReorderingMode::Splat ||
2212 RMode == ReorderingMode::Constant ||
2213 RMode == ReorderingMode::Load;
2214 // Iterate through all unused operands and look for the best.
2215 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2216 // Get the operand at Idx and Lane.
2217 OperandData &OpData = getData(Idx, Lane);
2218 Value *Op = OpData.V;
2219 bool OpAPO = OpData.APO;
2220
2221 // Skip already selected operands.
2222 if (OpData.IsUsed)
2223 continue;
2224
2225 // Skip if we are trying to move the operand to a position with a
2226 // different opcode in the linearized tree form. This would break the
2227 // semantics.
2228 if (OpAPO != OpIdxAPO)
2229 continue;
2230
2231 // Look for an operand that matches the current mode.
2232 switch (RMode) {
2233 case ReorderingMode::Load:
2234 case ReorderingMode::Opcode: {
2235 bool LeftToRight = Lane > LastLane;
2236 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2237 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2238 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2239 OpIdx, Idx, IsUsed, UsedLanes);
2240 if (Score > static_cast<int>(BestOp.Score) ||
2241 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2242 Idx == OpIdx)) {
2243 BestOp.Idx = Idx;
2244 BestOp.Score = Score;
2245 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2246 }
2247 break;
2248 }
2249 case ReorderingMode::Constant:
2250 if (isa<Constant>(Op) ||
2251 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2252 BestOp.Idx = Idx;
2253 if (isa<Constant>(Op)) {
 2254 BestOp.Score = LookAheadHeuristics::ScoreConstants;
 2255 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
 2256 LookAheadHeuristics::ScoreConstants;
 2257 }
2258 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2259 IsUsed = false;
2260 }
2261 break;
2262 case ReorderingMode::Splat:
2263 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2264 IsUsed = Op == OpLastLane;
2265 if (Op == OpLastLane) {
2266 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2267 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
 2268 LookAheadHeuristics::ScoreSplat;
 2269 }
2270 BestOp.Idx = Idx;
2271 }
2272 break;
2273 case ReorderingMode::Failed:
2274 llvm_unreachable("Not expected Failed reordering mode.");
2275 }
2276 }
2277
2278 if (BestOp.Idx) {
2279 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2280 return BestOp.Idx;
2281 }
2282 // If we could not find a good match return std::nullopt.
2283 return std::nullopt;
2284 }
2285
 2286 /// Helper for reorder().
 2287 /// \returns the lane that we should start reordering from. This is the one
 2288 /// which has the least number of operands that can freely move about, or is
 2289 /// the least profitable because it already has the most optimal set of operands.
2290 unsigned getBestLaneToStartReordering() const {
2291 unsigned Min = UINT_MAX;
2292 unsigned SameOpNumber = 0;
2293 // std::pair<unsigned, unsigned> is used to implement a simple voting
 2294 // algorithm and choose the lane with the least number of operands that
 2295 // can freely move about, or that is less profitable because it already has
 2296 // the most optimal set of operands. The first unsigned is a counter for
2297 // voting, the second unsigned is the counter of lanes with instructions
2298 // with same/alternate opcodes and same parent basic block.
2300 // Try to be closer to the original results, if we have multiple lanes
2301 // with same cost. If 2 lanes have the same cost, use the one with the
2302 // highest index.
2303 for (int I = getNumLanes(); I > 0; --I) {
2304 unsigned Lane = I - 1;
2305 OperandsOrderData NumFreeOpsHash =
2306 getMaxNumOperandsThatCanBeReordered(Lane);
2307 // Compare the number of operands that can move and choose the one with
2308 // the least number.
2309 if (NumFreeOpsHash.NumOfAPOs < Min) {
2310 Min = NumFreeOpsHash.NumOfAPOs;
2311 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2312 HashMap.clear();
2313 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2314 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2315 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2316 // Select the most optimal lane in terms of number of operands that
2317 // should be moved around.
2318 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2319 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2320 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2321 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2322 auto [It, Inserted] =
2323 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2324 if (!Inserted)
2325 ++It->second.first;
2326 }
2327 }
2328 // Select the lane with the minimum counter.
2329 unsigned BestLane = 0;
2330 unsigned CntMin = UINT_MAX;
2331 for (const auto &Data : reverse(HashMap)) {
2332 if (Data.second.first < CntMin) {
2333 CntMin = Data.second.first;
2334 BestLane = Data.second.second;
2335 }
2336 }
2337 return BestLane;
2338 }
2339
2340 /// Data structure that helps to reorder operands.
2341 struct OperandsOrderData {
2342 /// The best number of operands with the same APOs, which can be
2343 /// reordered.
2344 unsigned NumOfAPOs = UINT_MAX;
2345 /// Number of operands with the same/alternate instruction opcode and
2346 /// parent.
2347 unsigned NumOpsWithSameOpcodeParent = 0;
2348 /// Hash for the actual operands ordering.
 2349 /// It encodes each operand's position id and opcode
 2350 /// value. It is used in the voting mechanism to find the lane with the
 2351 /// least number of operands that can freely move about, or that is less
 2352 /// profitable because it already has the most optimal set of operands. Can be
2353 /// replaced with SmallVector<unsigned> instead but hash code is faster
2354 /// and requires less memory.
2355 unsigned Hash = 0;
2356 };
2357 /// \returns the maximum number of operands that are allowed to be reordered
2358 /// for \p Lane and the number of compatible instructions(with the same
2359 /// parent/opcode). This is used as a heuristic for selecting the first lane
2360 /// to start operand reordering.
2361 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2362 unsigned CntTrue = 0;
2363 unsigned NumOperands = getNumOperands();
2364 // Operands with the same APO can be reordered. We therefore need to count
2365 // how many of them we have for each APO, like this: Cnt[APO] = x.
2366 // Since we only have two APOs, namely true and false, we can avoid using
2367 // a map. Instead we can simply count the number of operands that
2368 // correspond to one of them (in this case the 'true' APO), and calculate
2369 // the other by subtracting it from the total number of operands.
2370 // Operands with the same instruction opcode and parent are more
2371 // profitable since we don't need to move them in many cases, with a high
2372 // probability such lane already can be vectorized effectively.
2373 bool AllUndefs = true;
2374 unsigned NumOpsWithSameOpcodeParent = 0;
2375 Instruction *OpcodeI = nullptr;
2376 BasicBlock *Parent = nullptr;
2377 unsigned Hash = 0;
2378 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2379 const OperandData &OpData = getData(OpIdx, Lane);
2380 if (OpData.APO)
2381 ++CntTrue;
2382 // Use Boyer-Moore majority voting for finding the majority opcode and
2383 // the number of times it occurs.
2384 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2385 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2386 I->getParent() != Parent) {
2387 if (NumOpsWithSameOpcodeParent == 0) {
2388 NumOpsWithSameOpcodeParent = 1;
2389 OpcodeI = I;
2390 Parent = I->getParent();
2391 } else {
2392 --NumOpsWithSameOpcodeParent;
2393 }
2394 } else {
2395 ++NumOpsWithSameOpcodeParent;
2396 }
2397 }
2398 Hash = hash_combine(
2399 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2400 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2401 }
2402 if (AllUndefs)
2403 return {};
2404 OperandsOrderData Data;
2405 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2406 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2407 Data.Hash = Hash;
2408 return Data;
2409 }
2410
2411 /// Go through the instructions in VL and append their operands.
2412 void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
2413 assert(!VL.empty() && "Bad VL");
2414 assert((empty() || VL.size() == getNumLanes()) &&
2415 "Expected same number of lanes");
2416 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2417 // arguments to the intrinsic produces the same result.
2418 constexpr unsigned IntrinsicNumOperands = 2;
2419 unsigned NumOperands = VL0->getNumOperands();
2420 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2421 OpsVec.resize(NumOperands);
2422 unsigned NumLanes = VL.size();
2423 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2424 OpsVec[OpIdx].resize(NumLanes);
2425 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2426 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2427 "Expected instruction or poison value");
2428 // Our tree has just 3 nodes: the root and two operands.
2429 // It is therefore trivial to get the APO. We only need to check the
2430 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2431 // RHS operand. The LHS operand of both add and sub is never attached
 2432 // to an inverse operation in the linearized form, therefore its APO
2433 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2434
2435 // Since operand reordering is performed on groups of commutative
2436 // operations or alternating sequences (e.g., +, -), we can safely
2437 // tell the inverse operations by checking commutativity.
2438 if (isa<PoisonValue>(VL[Lane])) {
2439 OpsVec[OpIdx][Lane] = {
2440 PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
2441 false};
2442 continue;
2443 }
2444 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2445 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2446 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2447 APO, false};
2448 }
2449 }
2450 }
2451
2452 /// \returns the number of operands.
2453 unsigned getNumOperands() const { return ArgSize; }
2454
2455 /// \returns the number of lanes.
2456 unsigned getNumLanes() const { return OpsVec[0].size(); }
2457
2458 /// \returns the operand value at \p OpIdx and \p Lane.
2459 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2460 return getData(OpIdx, Lane).V;
2461 }
2462
2463 /// \returns true if the data structure is empty.
2464 bool empty() const { return OpsVec.empty(); }
2465
2466 /// Clears the data.
2467 void clear() { OpsVec.clear(); }
2468
2469 /// \Returns true if there are enough operands identical to \p Op to fill
 2470 /// the whole vector (possibly mixed with constants or loop-invariant values).
 2471 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2472 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2473 assert(Op == getValue(OpIdx, Lane) &&
2474 "Op is expected to be getValue(OpIdx, Lane).");
2475 // Small number of loads - try load matching.
2476 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2477 return false;
2478 bool OpAPO = getData(OpIdx, Lane).APO;
2479 bool IsInvariant = L && L->isLoopInvariant(Op);
2480 unsigned Cnt = 0;
2481 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2482 if (Ln == Lane)
2483 continue;
2484 // This is set to true if we found a candidate for broadcast at Lane.
2485 bool FoundCandidate = false;
2486 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2487 OperandData &Data = getData(OpI, Ln);
2488 if (Data.APO != OpAPO || Data.IsUsed)
2489 continue;
2490 Value *OpILane = getValue(OpI, Lane);
2491 bool IsConstantOp = isa<Constant>(OpILane);
2492 // Consider the broadcast candidate if:
2493 // 1. Same value is found in one of the operands.
2494 if (Data.V == Op ||
2495 // 2. The operand in the given lane is not constant but there is a
2496 // constant operand in another lane (which can be moved to the
2497 // given lane). In this case we can represent it as a simple
2498 // permutation of constant and broadcast.
2499 (!IsConstantOp &&
2500 ((Lns > 2 && isa<Constant>(Data.V)) ||
2501 // 2.1. If we have only 2 lanes, need to check that value in the
2502 // next lane does not build same opcode sequence.
2503 (Lns == 2 &&
2504 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2505 .getOpcode() &&
2506 isa<Constant>(Data.V)))) ||
2507 // 3. The operand in the current lane is loop invariant (can be
2508 // hoisted out) and another operand is also a loop invariant
2509 // (though not a constant). In this case the whole vector can be
2510 // hoisted out.
2511 // FIXME: need to teach the cost model about this case for better
2512 // estimation.
2513 (IsInvariant && !isa<Constant>(Data.V) &&
2514 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2515 L->isLoopInvariant(Data.V))) {
2516 FoundCandidate = true;
2517 Data.IsUsed = Data.V == Op;
2518 if (Data.V == Op)
2519 ++Cnt;
2520 break;
2521 }
2522 }
2523 if (!FoundCandidate)
2524 return false;
2525 }
2526 return getNumLanes() == 2 || Cnt > 1;
2527 }
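As an illustration of the cases above (a restatement of the comments, not new behaviour): for four lanes whose operand column at OpIdx is {x, 7, x, x}, the constant in lane 1 can be permuted away, so x still counts as a broadcast candidate (case 2). With only two lanes the constant case is accepted only if the remaining operand of the other lane does not already form a same-opcode match with x, and the loop-invariant case (case 3) lets an invariant operand be matched against a different invariant value in another lane, since the whole vector could then be hoisted out.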
2528
 2529 /// Checks if there is at least a single operand in a lane other
 2530 /// than \p Lane that is compatible with the operand \p Op.
2531 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2532 assert(Op == getValue(OpIdx, Lane) &&
2533 "Op is expected to be getValue(OpIdx, Lane).");
2534 bool OpAPO = getData(OpIdx, Lane).APO;
2535 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2536 if (Ln == Lane)
2537 continue;
2538 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2539 const OperandData &Data = getData(OpI, Ln);
2540 if (Data.APO != OpAPO || Data.IsUsed)
2541 return true;
2542 Value *OpILn = getValue(OpI, Ln);
2543 return (L && L->isLoopInvariant(OpILn)) ||
2544 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2545 allSameBlock({Op, OpILn}));
2546 }))
2547 return true;
2548 }
2549 return false;
2550 }
2551
2552 public:
2553 /// Initialize with all the operands of the instruction vector \p RootVL.
 2554 VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
 2555 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2556 L(R.LI->getLoopFor((VL0->getParent()))) {
2557 // Append all the operands of RootVL.
2558 appendOperandsOfVL(RootVL, VL0);
2559 }
2560
2561 /// \Returns a value vector with the operands across all lanes for the
 2562 /// operand at \p OpIdx.
2563 ValueList getVL(unsigned OpIdx) const {
2564 ValueList OpVL(OpsVec[OpIdx].size());
2565 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2566 "Expected same num of lanes across all operands");
2567 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2568 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2569 return OpVL;
2570 }
2571
2572 // Performs operand reordering for 2 or more operands.
2573 // The original operands are in OrigOps[OpIdx][Lane].
2574 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2575 void reorder() {
2576 unsigned NumOperands = getNumOperands();
2577 unsigned NumLanes = getNumLanes();
2578 // Each operand has its own mode. We are using this mode to help us select
2579 // the instructions for each lane, so that they match best with the ones
2580 // we have selected so far.
2581 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2582
2583 // This is a greedy single-pass algorithm. We are going over each lane
2584 // once and deciding on the best order right away with no back-tracking.
2585 // However, in order to increase its effectiveness, we start with the lane
2586 // that has operands that can move the least. For example, given the
2587 // following lanes:
2588 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2589 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2590 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2591 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2592 // we will start at Lane 1, since the operands of the subtraction cannot
2593 // be reordered. Then we will visit the rest of the lanes in a circular
 2594 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2595
2596 // Find the first lane that we will start our search from.
2597 unsigned FirstLane = getBestLaneToStartReordering();
2598
2599 // Initialize the modes.
2600 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2601 Value *OpLane0 = getValue(OpIdx, FirstLane);
2602 // Keep track if we have instructions with all the same opcode on one
2603 // side.
2604 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2605 // Check if OpLane0 should be broadcast.
2606 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2607 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2608 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2609 else if (isa<LoadInst>(OpILane0))
2610 ReorderingModes[OpIdx] = ReorderingMode::Load;
2611 else
2612 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2613 } else if (isa<Constant>(OpLane0)) {
2614 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2615 } else if (isa<Argument>(OpLane0)) {
2616 // Our best hope is a Splat. It may save some cost in some cases.
2617 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2618 } else {
2619 llvm_unreachable("Unexpected value kind.");
2620 }
2621 }
2622
 2623 // Check that we don't have the same operands. No need to reorder if the
 2624 // operands are just a perfect diamond or shuffled diamond match. Do not do
 2625 // it only for possible broadcasts or a non-power-of-2 number of scalars
 2626 // (just for now).
2627 auto &&SkipReordering = [this]() {
2628 SmallPtrSet<Value *, 4> UniqueValues;
2629 ArrayRef<OperandData> Op0 = OpsVec.front();
2630 for (const OperandData &Data : Op0)
2631 UniqueValues.insert(Data.V);
 2632 for (ArrayRef<OperandData> Op :
 2633 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2634 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2635 return !UniqueValues.contains(Data.V);
2636 }))
2637 return false;
2638 }
2639 // TODO: Check if we can remove a check for non-power-2 number of
2640 // scalars after full support of non-power-2 vectorization.
2641 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2642 };
2643
2644 // If the initial strategy fails for any of the operand indexes, then we
2645 // perform reordering again in a second pass. This helps avoid assigning
2646 // high priority to the failed strategy, and should improve reordering for
2647 // the non-failed operand indexes.
2648 for (int Pass = 0; Pass != 2; ++Pass) {
 2649 // Check if there is no need to reorder operands, since they are a perfect
 2650 // or shuffled diamond match.
2651 // Need to do it to avoid extra external use cost counting for
2652 // shuffled matches, which may cause regressions.
2653 if (SkipReordering())
2654 break;
2655 // Skip the second pass if the first pass did not fail.
2656 bool StrategyFailed = false;
2657 // Mark all operand data as free to use.
2658 clearUsed();
2659 // We keep the original operand order for the FirstLane, so reorder the
2660 // rest of the lanes. We are visiting the nodes in a circular fashion,
2661 // using FirstLane as the center point and increasing the radius
2662 // distance.
2663 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2664 for (unsigned I = 0; I < NumOperands; ++I)
2665 MainAltOps[I].push_back(getData(I, FirstLane).V);
2666
2667 SmallBitVector UsedLanes(NumLanes);
2668 UsedLanes.set(FirstLane);
2669 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2670 // Visit the lane on the right and then the lane on the left.
2671 for (int Direction : {+1, -1}) {
2672 int Lane = FirstLane + Direction * Distance;
2673 if (Lane < 0 || Lane >= (int)NumLanes)
2674 continue;
2675 UsedLanes.set(Lane);
2676 int LastLane = Lane - Direction;
2677 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2678 "Out of bounds");
2679 // Look for a good match for each operand.
2680 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2681 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2682 std::optional<unsigned> BestIdx =
2683 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2684 MainAltOps[OpIdx], UsedLanes);
2685 // By not selecting a value, we allow the operands that follow to
2686 // select a better matching value. We will get a non-null value in
2687 // the next run of getBestOperand().
2688 if (BestIdx) {
2689 // Swap the current operand with the one returned by
2690 // getBestOperand().
2691 swap(OpIdx, *BestIdx, Lane);
2692 } else {
2693 // Enable the second pass.
2694 StrategyFailed = true;
2695 }
2696 // Try to get the alternate opcode and follow it during analysis.
2697 if (MainAltOps[OpIdx].size() != 2) {
2698 OperandData &AltOp = getData(OpIdx, Lane);
2699 InstructionsState OpS =
2700 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2701 if (OpS.getOpcode() && OpS.isAltShuffle())
2702 MainAltOps[OpIdx].push_back(AltOp.V);
2703 }
2704 }
2705 }
2706 }
2707 // Skip second pass if the strategy did not fail.
2708 if (!StrategyFailed)
2709 break;
2710 }
2711 }
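Putting the pieces together, this is roughly how the vectorizer drives this helper when it builds the operand bundles of a node (a minimal sketch only; VL, VL0 and R stand for a scalar bundle, its first instruction and the enclosing BoUpSLP instance, and are assumptions of the example rather than code from this file):

    // Gather the operands of every lane, reorder them greedily, and read the
    // per-operand value lists back.
    VLOperands Ops(VL, VL0, R);
    Ops.reorder();
    ValueList Left = Ops.getVL(0);   // operand 0 across all lanes
    ValueList Right = Ops.getVL(1);  // operand 1 across all lanes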
2712
2713#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2714 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2715 switch (RMode) {
2716 case ReorderingMode::Load:
2717 return "Load";
2718 case ReorderingMode::Opcode:
2719 return "Opcode";
2720 case ReorderingMode::Constant:
2721 return "Constant";
2722 case ReorderingMode::Splat:
2723 return "Splat";
2724 case ReorderingMode::Failed:
2725 return "Failed";
2726 }
2727 llvm_unreachable("Unimplemented Reordering Type");
2728 }
2729
2730 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2731 raw_ostream &OS) {
2732 return OS << getModeStr(RMode);
2733 }
2734
2735 /// Debug print.
2736 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2737 printMode(RMode, dbgs());
2738 }
2739
2740 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2741 return printMode(RMode, OS);
2742 }
2743
 2744 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
 2745 const unsigned Indent = 2;
2746 unsigned Cnt = 0;
2747 for (const OperandDataVec &OpDataVec : OpsVec) {
2748 OS << "Operand " << Cnt++ << "\n";
2749 for (const OperandData &OpData : OpDataVec) {
2750 OS.indent(Indent) << "{";
2751 if (Value *V = OpData.V)
2752 OS << *V;
2753 else
2754 OS << "null";
2755 OS << ", APO:" << OpData.APO << "}\n";
2756 }
2757 OS << "\n";
2758 }
2759 return OS;
2760 }
2761
2762 /// Debug print.
2763 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2764#endif
2765 };
2766
 2767 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
 2768 /// of the pair with the highest score, which is deemed to have the best chance
 2769 /// of forming the root of a profitable tree to vectorize. Return std::nullopt
 2770 /// if no candidate scored above LookAheadHeuristics::ScoreFail.
 2771 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2772 std::optional<int>
2773 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2774 int Limit = LookAheadHeuristics::ScoreFail) const {
2775 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2777 int BestScore = Limit;
2778 std::optional<int> Index;
2779 for (int I : seq<int>(0, Candidates.size())) {
2780 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2781 Candidates[I].second,
2782 /*U1=*/nullptr, /*U2=*/nullptr,
2783 /*CurrLevel=*/1, {});
2784 if (Score > BestScore) {
2785 BestScore = Score;
2786 Index = I;
2787 }
2788 }
2789 return Index;
2790 }
2791
2792 /// Checks if the instruction is marked for deletion.
2793 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2794
2795 /// Removes an instruction from its block and eventually deletes it.
2796 /// It's like Instruction::eraseFromParent() except that the actual deletion
2797 /// is delayed until BoUpSLP is destructed.
 2798 void eraseInstruction(Instruction *I) {
 2799 DeletedInstructions.insert(I);
2800 }
2801
2802 /// Remove instructions from the parent function and clear the operands of \p
2803 /// DeadVals instructions, marking for deletion trivially dead operands.
2804 template <typename T>
 2805 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
 2806 SmallVector<WeakTrackingVH> DeadInsts;
 2807 for (T *V : DeadVals) {
2808 auto *I = cast<Instruction>(V);
2809 DeletedInstructions.insert(I);
2810 }
2811 DenseSet<Value *> Processed;
2812 for (T *V : DeadVals) {
2813 if (!V || !Processed.insert(V).second)
2814 continue;
2815 auto *I = cast<Instruction>(V);
 2816 salvageDebugInfo(*I);
 2817 SmallVector<const TreeEntry *> Entries;
 2818 if (const TreeEntry *Entry = getTreeEntry(I)) {
2819 Entries.push_back(Entry);
2820 auto It = MultiNodeScalars.find(I);
2821 if (It != MultiNodeScalars.end())
2822 Entries.append(It->second.begin(), It->second.end());
2823 }
2824 for (Use &U : I->operands()) {
2825 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2826 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
 2827 isInstructionTriviallyDead(OpI, TLI) &&
 2828 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2829 return Entry->VectorizedValue == OpI;
2830 })))
2831 DeadInsts.push_back(OpI);
2832 }
2833 I->dropAllReferences();
2834 }
2835 for (T *V : DeadVals) {
2836 auto *I = cast<Instruction>(V);
2837 if (!I->getParent())
2838 continue;
2839 assert((I->use_empty() || all_of(I->uses(),
2840 [&](Use &U) {
2841 return isDeleted(
2842 cast<Instruction>(U.getUser()));
2843 })) &&
2844 "trying to erase instruction with users.");
2845 I->removeFromParent();
2846 SE->forgetValue(I);
2847 }
2848 // Process the dead instruction list until empty.
2849 while (!DeadInsts.empty()) {
2850 Value *V = DeadInsts.pop_back_val();
2851 Instruction *VI = cast_or_null<Instruction>(V);
2852 if (!VI || !VI->getParent())
2853 continue;
2855 "Live instruction found in dead worklist!");
2856 assert(VI->use_empty() && "Instructions with uses are not dead.");
2857
2858 // Don't lose the debug info while deleting the instructions.
2859 salvageDebugInfo(*VI);
2860
2861 // Null out all of the instruction's operands to see if any operand
2862 // becomes dead as we go.
2863 for (Use &OpU : VI->operands()) {
2864 Value *OpV = OpU.get();
2865 if (!OpV)
2866 continue;
2867 OpU.set(nullptr);
2868
2869 if (!OpV->use_empty())
2870 continue;
2871
2872 // If the operand is an instruction that became dead as we nulled out
2873 // the operand, and if it is 'trivially' dead, delete it in a future
2874 // loop iteration.
2875 if (auto *OpI = dyn_cast<Instruction>(OpV))
2876 if (!DeletedInstructions.contains(OpI) &&
 2877 isInstructionTriviallyDead(OpI, TLI))
 2878 DeadInsts.push_back(OpI);
2879 }
2880
2881 VI->removeFromParent();
2882 DeletedInstructions.insert(VI);
2883 SE->forgetValue(VI);
2884 }
2885 }
2886
 2887 /// Checks if the instruction was already analyzed for being a possible
 2888 /// reduction root.
 2889 bool isAnalyzedReductionRoot(Instruction *I) const {
 2890 return AnalyzedReductionsRoots.count(I);
2891 }
2892 /// Register given instruction as already analyzed for being possible
2893 /// reduction root.
 2894 void analyzedReductionRoot(Instruction *I) {
 2895 AnalyzedReductionsRoots.insert(I);
2896 }
 2897 /// Checks if the provided list of reduced values was already checked for
 2898 /// vectorization.
 2899 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
 2900 return AnalyzedReductionVals.contains(hash_value(VL));
2901 }
2902 /// Adds the list of reduced values to list of already checked values for the
2903 /// vectorization.
 2904 void analyzedReductionVals(ArrayRef<Value *> VL) {
 2905 AnalyzedReductionVals.insert(hash_value(VL));
2906 }
2907 /// Clear the list of the analyzed reduction root instructions.
 2908 void clearReductionData() {
 2909 AnalyzedReductionsRoots.clear();
2910 AnalyzedReductionVals.clear();
2911 AnalyzedMinBWVals.clear();
2912 }
2913 /// Checks if the given value is gathered in one of the nodes.
2914 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2915 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2916 }
2917 /// Checks if the given value is gathered in one of the nodes.
2918 bool isGathered(const Value *V) const {
2919 return MustGather.contains(V);
2920 }
 2921 /// Checks if the specified value was not scheduled.
2922 bool isNotScheduled(const Value *V) const {
2923 return NonScheduledFirst.contains(V);
2924 }
2925
2926 /// Check if the value is vectorized in the tree.
2927 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2928
2929 ~BoUpSLP();
2930
2931private:
 2932 /// Determine if a node \p E can be demoted to a smaller type with a
2933 /// truncation. We collect the entries that will be demoted in ToDemote.
2934 /// \param E Node for analysis
2935 /// \param ToDemote indices of the nodes to be demoted.
2936 bool collectValuesToDemote(
2937 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2939 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2940 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2941
 2942 /// Check if the operands on the edges \p Edges of the \p UserTE allow
 2943 /// reordering (i.e. the operands can be reordered because they have only one
 2944 /// user and are reorderable).
 2945 /// \param ReorderableGathers List of all gather nodes that require reordering
 2946 /// (e.g., gathers of extractelements or partially vectorizable loads).
2947 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2948 /// reordering, subset of \p NonVectorized.
2949 bool
2950 canReorderOperands(TreeEntry *UserTE,
2951 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2952 ArrayRef<TreeEntry *> ReorderableGathers,
2953 SmallVectorImpl<TreeEntry *> &GatherOps);
2954
2955 /// Checks if the given \p TE is a gather node with clustered reused scalars
2956 /// and reorders it per given \p Mask.
2957 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2958
2959 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2960 /// if any. If it is not vectorized (gather node), returns nullptr.
2961 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2962 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2963 TreeEntry *TE = nullptr;
2964 const auto *It = find_if(VL, [&](Value *V) {
2965 TE = getTreeEntry(V);
2966 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2967 return true;
2968 auto It = MultiNodeScalars.find(V);
2969 if (It != MultiNodeScalars.end()) {
2970 for (TreeEntry *E : It->second) {
2971 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2972 TE = E;
2973 return true;
2974 }
2975 }
2976 }
2977 return false;
2978 });
2979 if (It != VL.end()) {
2980 assert(TE->isSame(VL) && "Expected same scalars.");
2981 return TE;
2982 }
2983 return nullptr;
2984 }
2985
2986 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2987 /// if any. If it is not vectorized (gather node), returns nullptr.
2988 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2989 unsigned OpIdx) const {
2990 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2991 const_cast<TreeEntry *>(UserTE), OpIdx);
2992 }
2993
2994 /// Checks if all users of \p I are the part of the vectorization tree.
2995 bool areAllUsersVectorized(
2996 Instruction *I,
2997 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2998
2999 /// Return information about the vector formed for the specified index
3000 /// of a vector of (the same) instruction.
3002
 3003 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3004 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3005
3006 /// Gets the root instruction for the given node. If the node is a strided
3007 /// load/store node with the reverse order, the root instruction is the last
3008 /// one.
3009 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3010
3011 /// \returns Cast context for the given graph node.
 3012 TargetTransformInfo::CastContextHint
 3013 getCastContextHint(const TreeEntry &TE) const;
3014
3015 /// \returns the cost of the vectorizable entry.
3016 InstructionCost getEntryCost(const TreeEntry *E,
3017 ArrayRef<Value *> VectorizedVals,
3018 SmallPtrSetImpl<Value *> &CheckedExtracts);
3019
3020 /// This is the recursive part of buildTree.
3021 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3022 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3023
3024 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3025 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3026 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3027 /// returns false, setting \p CurrentOrder to either an empty vector or a
3028 /// non-identity permutation that allows reusing extract instructions.
3029 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3030 /// extract order.
3031 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3032 SmallVectorImpl<unsigned> &CurrentOrder,
3033 bool ResizeAllowed = false) const;
3034
3035 /// Vectorize a single entry in the tree.
3036 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed to
3037 /// avoid issues with def-use order.
3038 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3039
3040 /// Returns vectorized operand node, that matches the order of the scalars
3041 /// operand number \p NodeIdx in entry \p E.
3042 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3043 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3044 unsigned NodeIdx) const {
3045 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3046 }
3047
3048 /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the entry
3049 /// \p E.
3050 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed to
3051 /// avoid issues with def-use order.
3052 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3053
3054 /// Create a new vector from a list of scalar values. Produces a sequence
3055 /// which exploits values reused across lanes, and arranges the inserts
3056 /// for ease of later optimization.
3057 template <typename BVTy, typename ResTy, typename... Args>
3058 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3059
3060 /// Create a new vector from a list of scalar values. Produces a sequence
3061 /// which exploits values reused across lanes, and arranges the inserts
3062 /// for ease of later optimization.
3063 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3064 bool PostponedPHIs);
3065
3066 /// Returns the instruction in the bundle, which can be used as a base point
3067 /// for scheduling. Usually it is the last instruction in the bundle, except
3068 /// for the case when all operands are external (in this case, it is the first
3069 /// instruction in the list).
3070 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3071
3072 /// Tries to find extractelement instructions with constant indices from fixed
3073 /// vector type and gather such instructions into a bunch, which can very
3074 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3075 /// was successful, the matched scalars are replaced by poison values in \p VL
3076 /// for future analysis.
3077 std::optional<TargetTransformInfo::ShuffleKind>
3078 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3079 SmallVectorImpl<int> &Mask) const;
3080
3081 /// Tries to find extractelement instructions with constant indices from fixed
3082 /// vector type and gather such instructions into a bunch, which can very
3083 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3084 /// was successful, the matched scalars are replaced by poison values in \p VL
3085 /// for future analysis.
3086 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3087 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3088 SmallVectorImpl<int> &Mask,
3089 unsigned NumParts) const;
3090
3091 /// Checks if the gathered \p VL can be represented as a single register
3092 /// shuffle(s) of previous tree entries.
3093 /// \param TE Tree entry checked for permutation.
3094 /// \param VL List of scalars (a subset of the TE scalar), checked for
3095 /// permutations. Must form single-register vector.
3096 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3097 /// commands to build the mask using the original vector value, without
3098 /// relying on the potential reordering.
3099 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3100 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3101 std::optional<TargetTransformInfo::ShuffleKind>
3102 isGatherShuffledSingleRegisterEntry(
3103 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3104 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3105 bool ForOrder);
3106
3107 /// Checks if the gathered \p VL can be represented as multi-register
3108 /// shuffle(s) of previous tree entries.
3109 /// \param TE Tree entry checked for permutation.
3110 /// \param VL List of scalars (a subset of the TE scalar), checked for
3111 /// permutations.
3112 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3113 /// commands to build the mask using the original vector value, without
3114 /// relying on the potential reordering.
3115 /// \returns per-register series of ShuffleKind, if gathered values can be
3116 /// represented as shuffles of previous tree entries. \p Mask is filled with
3117 /// the shuffle mask (also on per-register base).
3118 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3119 isGatherShuffledEntry(
3120 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3121 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3122 unsigned NumParts, bool ForOrder = false);
3123
3124 /// \returns the cost of gathering (inserting) the values in \p VL into a
3125 /// vector.
3126 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3127 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3128 Type *ScalarTy) const;
3129
3130 /// Set the Builder insert point to one after the last instruction in
3131 /// the bundle
3132 void setInsertPointAfterBundle(const TreeEntry *E);
3133
3134 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3135 /// specified, the starting vector value is poison.
3136 Value *
3137 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3138 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3139
3140 /// \returns whether the VectorizableTree is fully vectorizable and will
3141 /// be beneficial even if the tree height is tiny.
3142 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3143
3144 /// Run through the list of all gathered loads in the graph and try to find
3145 /// vector loads/masked gathers instead of regular gathers. Later these loads
3146 /// are reshuffled to build the final gathered nodes.
3147 void tryToVectorizeGatheredLoads(
3148 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3149 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3150 8> &GatheredLoads);
3151
3152 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3153 /// users of \p TE and collects the stores. It returns the map from the store
3154 /// pointers to the collected stores.
3155 DenseMap<Value *, SmallVector<StoreInst *>>
3156 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3157
3158 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3159 /// stores in \p StoresVec can form a vector instruction. If so it returns
3160 /// true and populates \p ReorderIndices with the shuffle indices of the
3161 /// stores when compared to the sorted vector.
3162 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3163 OrdersType &ReorderIndices) const;
3164
3165 /// Iterates through the users of \p TE, looking for scalar stores that can be
3166 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3167 /// their order and builds an order index vector for each store bundle. It
3168 /// returns all these order vectors found.
3169 /// We run this after the tree has formed, otherwise we may come across user
3170 /// instructions that are not yet in the tree.
3171 SmallVector<OrdersType, 1>
3172 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3173
3174 /// Tries to reorder the gathering node for better vectorization
3175 /// opportunities.
3176 void reorderGatherNode(TreeEntry &TE);
3177
3178 struct TreeEntry {
3179 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3180 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3181
3182 /// \returns Common mask for reorder indices and reused scalars.
3183 SmallVector<int> getCommonMask() const {
3184 SmallVector<int> Mask;
3185 inversePermutation(ReorderIndices, Mask);
3186 ::addMask(Mask, ReuseShuffleIndices);
3187 return Mask;
3188 }
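 // Illustrative example: with ReorderIndices = {2, 0, 1} the inverted
 // permutation is {1, 2, 0}; composing ReuseShuffleIndices = {0, 0, 1, 2} on
 // top (roughly Mask[I] = Inverted[Reuse[I]]) gives the common mask
 // {1, 1, 2, 0}.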
3189
3190 /// \returns true if the scalars in VL are equal to this entry.
3191 bool isSame(ArrayRef<Value *> VL) const {
3192 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3193 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3194 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3195 return VL.size() == Mask.size() &&
3196 std::equal(VL.begin(), VL.end(), Mask.begin(),
3197 [Scalars](Value *V, int Idx) {
3198 return (isa<UndefValue>(V) &&
3199 Idx == PoisonMaskElem) ||
3200 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3201 });
3202 };
3203 if (!ReorderIndices.empty()) {
3204 // TODO: implement matching if the nodes are just reordered, still can
3205 // treat the vector as the same if the list of scalars matches VL
3206 // directly, without reordering.
3207 SmallVector<int> Mask;
3208 inversePermutation(ReorderIndices, Mask);
3209 if (VL.size() == Scalars.size())
3210 return IsSame(Scalars, Mask);
3211 if (VL.size() == ReuseShuffleIndices.size()) {
3212 ::addMask(Mask, ReuseShuffleIndices);
3213 return IsSame(Scalars, Mask);
3214 }
3215 return false;
3216 }
3217 return IsSame(Scalars, ReuseShuffleIndices);
3218 }
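 // Illustrative example (hypothetical scalars a, b): an entry with
 // Scalars = {a, b}, empty ReorderIndices and ReuseShuffleIndices = {0, 1, 0, 1}
 // reports isSame({a, b, a, b}) == true, since every VL element matches
 // Scalars[Mask[Idx]] (undefs are only accepted for poison mask positions).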
3219
3220 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3221 return isGather() && !UserTreeIndices.empty() &&
3222 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3223 UserTreeIndices.front().UserTE == UserEI.UserTE;
3224 }
3225
3226 /// \returns true if current entry has same operands as \p TE.
3227 bool hasEqualOperands(const TreeEntry &TE) const {
3228 if (TE.getNumOperands() != getNumOperands())
3229 return false;
3230 SmallBitVector Used(getNumOperands());
3231 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3232 unsigned PrevCount = Used.count();
3233 for (unsigned K = 0; K < E; ++K) {
3234 if (Used.test(K))
3235 continue;
3236 if (getOperand(K) == TE.getOperand(I)) {
3237 Used.set(K);
3238 break;
3239 }
3240 }
3241 // Check if we actually found the matching operand.
3242 if (PrevCount == Used.count())
3243 return false;
3244 }
3245 return true;
3246 }
3247
3248 /// \return Final vectorization factor for the node. Defined by the total
3249 /// number of vectorized scalars, including those used several times in the
3250 /// entry and counted in the \a ReuseShuffleIndices, if any.
3251 unsigned getVectorFactor() const {
3252 if (!ReuseShuffleIndices.empty())
3253 return ReuseShuffleIndices.size();
3254 return Scalars.size();
3255 };
3256
3257 /// Checks if the current node is a gather node.
3258 bool isGather() const { return State == NeedToGather; }
3259
3260 /// A vector of scalars.
3261 ValueList Scalars;
3262
3263 /// The Scalars are vectorized into this value. It is initialized to Null.
3264 WeakTrackingVH VectorizedValue = nullptr;
3265
3266 /// New vector phi instructions emitted for the vectorized phi nodes.
3267 PHINode *PHI = nullptr;
3268
3269 /// Do we need to gather this sequence or vectorize it
3270 /// (either with vector instruction or with scatter/gather
3271 /// intrinsics for store/load)?
3272 enum EntryState {
3273 Vectorize, ///< The node is regularly vectorized.
3274 ScatterVectorize, ///< Masked scatter/gather node.
3275 StridedVectorize, ///< Strided loads (and stores)
3276 NeedToGather, ///< Gather/buildvector node.
3277 CombinedVectorize, ///< Vectorized node, combined with its user into more
3278 ///< complex node like select/cmp to minmax, mul/add to
3279 ///< fma, etc. Must be used for the following nodes in
3280 ///< the pattern, not the very first one.
3281 };
3282 EntryState State;
3283
3284 /// List of combined opcodes supported by the vectorizer.
3285 enum CombinedOpcode {
3286 NotCombinedOp = -1,
3287 MinMax = Instruction::OtherOpsEnd + 1,
3288 };
3289 CombinedOpcode CombinedOp = NotCombinedOp;
3290
3291 /// Does this sequence require some shuffling?
3292 SmallVector<int, 4> ReuseShuffleIndices;
3293
3294 /// Does this entry require reordering?
3295 SmallVector<unsigned, 4> ReorderIndices;
3296
3297 /// Points back to the VectorizableTree.
3298 ///
3299 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3300 /// to be a pointer and needs to be able to initialize the child iterator.
3301 /// Thus we need a reference back to the container to translate the indices
3302 /// to entries.
3303 VecTreeTy &Container;
3304
3305 /// The TreeEntry index containing the user of this entry. We can actually
3306 /// have multiple users so the data structure is not truly a tree.
3307 SmallVector<EdgeInfo, 1> UserTreeIndices;
3308
3309 /// The index of this treeEntry in VectorizableTree.
3310 unsigned Idx = 0;
3311
3312 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3313 /// other nodes as a series of insertvector instructions.
3314 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3315
3316 private:
3317 /// The operands of each instruction in each lane Operands[op_index][lane].
3318 /// Note: This helps avoid the replication of the code that performs the
3319 /// reordering of operands during buildTree_rec() and vectorizeTree().
3320 SmallVector<ValueList, 2> Operands;
3321
3322 /// The main/alternate instruction.
3323 Instruction *MainOp = nullptr;
3324 Instruction *AltOp = nullptr;
3325
3326 /// Interleaving factor for interleaved loads Vectorize nodes.
3327 unsigned InterleaveFactor = 0;
3328
3329 public:
3330 /// Returns interleave factor for interleave nodes.
3331 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3332 /// Sets interleaving factor for the interleaving nodes.
3333 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3334
3335 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3336 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3337 if (Operands.size() < OpIdx + 1)
3338 Operands.resize(OpIdx + 1);
3339 assert(Operands[OpIdx].empty() && "Already resized?");
3340 assert(OpVL.size() <= Scalars.size() &&
3341 "Number of operands is greater than the number of scalars.");
3342 Operands[OpIdx].resize(OpVL.size());
3343 copy(OpVL, Operands[OpIdx].begin());
3344 }
3345
3346 /// Set this bundle's operand from Scalars.
3347 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3348 VLOperands Ops(Scalars, MainOp, R);
3349 if (RequireReorder)
3350 Ops.reorder();
3351 for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
3352 setOperand(I, Ops.getVL(I));
3353 }
3354
3355 /// Reorders operands of the node to the given mask \p Mask.
3356 void reorderOperands(ArrayRef<int> Mask) {
3357 for (ValueList &Operand : Operands)
3358 reorderScalars(Operand, Mask);
3359 }
3360
3361 /// \returns the \p OpIdx operand of this TreeEntry.
3362 ValueList &getOperand(unsigned OpIdx) {
3363 assert(OpIdx < Operands.size() && "Off bounds");
3364 return Operands[OpIdx];
3365 }
3366
3367 /// \returns the \p OpIdx operand of this TreeEntry.
3368 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3369 assert(OpIdx < Operands.size() && "Off bounds");
3370 return Operands[OpIdx];
3371 }
3372
3373 /// \returns the number of operands.
3374 unsigned getNumOperands() const { return Operands.size(); }
3375
3376 /// \return the single \p OpIdx operand.
3377 Value *getSingleOperand(unsigned OpIdx) const {
3378 assert(OpIdx < Operands.size() && "Off bounds");
3379 assert(!Operands[OpIdx].empty() && "No operand available");
3380 return Operands[OpIdx][0];
3381 }
3382
3383 /// Some of the instructions in the list have alternate opcodes.
3384 bool isAltShuffle() const { return MainOp != AltOp; }
3385
3386 bool isOpcodeOrAlt(Instruction *I) const {
3387 unsigned CheckedOpcode = I->getOpcode();
3388 return (getOpcode() == CheckedOpcode ||
3389 getAltOpcode() == CheckedOpcode);
3390 }
3391
3392 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3393 /// alternate) opcode as the main instruction of this entry, the key is \p Op.
3394 /// Otherwise the key is the main instruction (MainOp).
3395 Value *isOneOf(Value *Op) const {
3396 auto *I = dyn_cast<Instruction>(Op);
3397 if (I && isOpcodeOrAlt(I))
3398 return Op;
3399 return MainOp;
3400 }
3401
3402 void setOperations(const InstructionsState &S) {
3403 MainOp = S.getMainOp();
3404 AltOp = S.getAltOp();
3405 }
3406
3407 Instruction *getMainOp() const {
3408 return MainOp;
3409 }
3410
3411 Instruction *getAltOp() const {
3412 return AltOp;
3413 }
3414
3415 /// The main/alternate opcodes for the list of instructions.
3416 unsigned getOpcode() const {
3417 return MainOp ? MainOp->getOpcode() : 0;
3418 }
3419
3420 unsigned getAltOpcode() const {
3421 return AltOp ? AltOp->getOpcode() : 0;
3422 }
3423
3424 /// When ReuseShuffleIndices is empty it just returns the position of \p
3425 /// V within the vector of Scalars. Otherwise, tries to remap it via its reuse index.
3426 int findLaneForValue(Value *V) const {
3427 unsigned FoundLane = getVectorFactor();
3428 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3429 std::advance(It, 1)) {
3430 if (*It != V)
3431 continue;
3432 FoundLane = std::distance(Scalars.begin(), It);
3433 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3434 if (!ReorderIndices.empty())
3435 FoundLane = ReorderIndices[FoundLane];
3436 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3437 if (ReuseShuffleIndices.empty())
3438 break;
3439 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3440 RIt != ReuseShuffleIndices.end()) {
3441 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3442 break;
3443 }
3444 }
3445 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3446 return FoundLane;
3447 }
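 // Illustrative example (hypothetical scalars a..d): with Scalars = {a, b, c, d},
 // empty ReorderIndices and ReuseShuffleIndices = {2, 3, 0, 1}, looking up c
 // first yields position 2 in Scalars, and the reuse index 2 is found at
 // position 0, so lane 0 of the final vector is returned.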
3448
3449 /// Build a shuffle mask for graph entry which represents a merge of main
3450 /// and alternate operations.
3451 void
3452 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3453 SmallVectorImpl<int> &Mask,
3454 SmallVectorImpl<Value *> *OpScalars = nullptr,
3455 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3456
3457 /// Return true if this is a non-power-of-2 node.
3458 bool isNonPowOf2Vec() const {
3459 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3460 return IsNonPowerOf2;
3461 }
3462
3463 /// Return true if this node vectorizes a number of elements that neither
3464 /// fills whole vector registers nor is a power of 2.
3465 bool
3466 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3467 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3468 TTI, getValueType(Scalars.front()), Scalars.size());
3469 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3470 "Reshuffling not supported with non-power-of-2 vectors yet.");
3471 return IsNonPowerOf2;
3472 }
3473
3474 Value *getOrdered(unsigned Idx) const {
3475 assert(isGather() && "Must be used only for buildvectors/gathers.");
3476 if (ReorderIndices.empty())
3477 return Scalars[Idx];
3478 SmallVector<int> Mask;
3479 inversePermutation(ReorderIndices, Mask);
3480 return Scalars[Mask[Idx]];
3481 }
3482
3483#ifndef NDEBUG
3484 /// Debug printer.
3485 LLVM_DUMP_METHOD void dump() const {
3486 dbgs() << Idx << ".\n";
3487 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3488 dbgs() << "Operand " << OpI << ":\n";
3489 for (const Value *V : Operands[OpI])
3490 dbgs().indent(2) << *V << "\n";
3491 }
3492 dbgs() << "Scalars: \n";
3493 for (Value *V : Scalars)
3494 dbgs().indent(2) << *V << "\n";
3495 dbgs() << "State: ";
3496 switch (State) {
3497 case Vectorize:
3498 if (InterleaveFactor > 0) {
3499 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3500 << "\n";
3501 } else {
3502 dbgs() << "Vectorize\n";
3503 }
3504 break;
3505 case ScatterVectorize:
3506 dbgs() << "ScatterVectorize\n";
3507 break;
3508 case StridedVectorize:
3509 dbgs() << "StridedVectorize\n";
3510 break;
3511 case NeedToGather:
3512 dbgs() << "NeedToGather\n";
3513 break;
3514 case CombinedVectorize:
3515 dbgs() << "CombinedVectorize\n";
3516 break;
3517 }
3518 dbgs() << "MainOp: ";
3519 if (MainOp)
3520 dbgs() << *MainOp << "\n";
3521 else
3522 dbgs() << "NULL\n";
3523 dbgs() << "AltOp: ";
3524 if (AltOp)
3525 dbgs() << *AltOp << "\n";
3526 else
3527 dbgs() << "NULL\n";
3528 dbgs() << "VectorizedValue: ";
3529 if (VectorizedValue)
3530 dbgs() << *VectorizedValue << "\n";
3531 else
3532 dbgs() << "NULL\n";
3533 dbgs() << "ReuseShuffleIndices: ";
3534 if (ReuseShuffleIndices.empty())
3535 dbgs() << "Empty";
3536 else
3537 for (int ReuseIdx : ReuseShuffleIndices)
3538 dbgs() << ReuseIdx << ", ";
3539 dbgs() << "\n";
3540 dbgs() << "ReorderIndices: ";
3541 for (unsigned ReorderIdx : ReorderIndices)
3542 dbgs() << ReorderIdx << ", ";
3543 dbgs() << "\n";
3544 dbgs() << "UserTreeIndices: ";
3545 for (const auto &EInfo : UserTreeIndices)
3546 dbgs() << EInfo << ", ";
3547 dbgs() << "\n";
3548 if (!CombinedEntriesWithIndices.empty()) {
3549 dbgs() << "Combined entries: ";
3550 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3551 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3552 });
3553 dbgs() << "\n";
3554 }
3555 }
3556#endif
3557 };
3558
3559#ifndef NDEBUG
3560 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3561 InstructionCost VecCost, InstructionCost ScalarCost,
3562 StringRef Banner) const {
3563 dbgs() << "SLP: " << Banner << ":\n";
3564 E->dump();
3565 dbgs() << "SLP: Costs:\n";
3566 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3567 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3568 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3569 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3570 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3571 }
3572#endif
3573
3574 /// Create a new VectorizableTree entry.
3575 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3576 std::optional<ScheduleData *> Bundle,
3577 const InstructionsState &S,
3578 const EdgeInfo &UserTreeIdx,
3579 ArrayRef<int> ReuseShuffleIndices = {},
3580 ArrayRef<unsigned> ReorderIndices = {},
3581 unsigned InterleaveFactor = 0) {
3582 TreeEntry::EntryState EntryState =
3583 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3584 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3585 ReuseShuffleIndices, ReorderIndices);
3586 if (E && InterleaveFactor > 0)
3587 E->setInterleave(InterleaveFactor);
3588 return E;
3589 }
3590
3591 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3592 TreeEntry::EntryState EntryState,
3593 std::optional<ScheduleData *> Bundle,
3594 const InstructionsState &S,
3595 const EdgeInfo &UserTreeIdx,
3596 ArrayRef<int> ReuseShuffleIndices = {},
3597 ArrayRef<unsigned> ReorderIndices = {}) {
3598 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3599 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3600 "Need to vectorize gather entry?");
3601 // Gathered loads still gathered? Do not create entry, use the original one.
3602 if (GatheredLoadsEntriesFirst.has_value() &&
3603 EntryState == TreeEntry::NeedToGather &&
3604 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3605 !UserTreeIdx.UserTE)
3606 return nullptr;
3607 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3608 TreeEntry *Last = VectorizableTree.back().get();
3609 Last->Idx = VectorizableTree.size() - 1;
3610 Last->State = EntryState;
3611 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3612 // for non-power-of-two vectors.
3613 assert(
3614 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3615 ReuseShuffleIndices.empty()) &&
3616 "Reshuffling scalars not yet supported for nodes with padding");
3617 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3618 ReuseShuffleIndices.end());
3619 if (ReorderIndices.empty()) {
3620 Last->Scalars.assign(VL.begin(), VL.end());
3621 Last->setOperations(S);
3622 } else {
3623 // Reorder scalars and build final mask.
3624 Last->Scalars.assign(VL.size(), nullptr);
3625 transform(ReorderIndices, Last->Scalars.begin(),
3626 [VL](unsigned Idx) -> Value * {
3627 if (Idx >= VL.size())
3628 return UndefValue::get(VL.front()->getType());
3629 return VL[Idx];
3630 });
3631 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3632 Last->setOperations(S);
3633 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3634 }
3635 if (!Last->isGather()) {
3636 for (Value *V : VL) {
3637 const TreeEntry *TE = getTreeEntry(V);
3638 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3639 "Scalar already in tree!");
3640 if (TE) {
3641 if (TE != Last)
3642 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3643 continue;
3644 }
3645 ScalarToTreeEntry[V] = Last;
3646 }
3647 // Update the scheduler bundle to point to this TreeEntry.
3648 ScheduleData *BundleMember = *Bundle;
3649 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3650 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3651 doesNotNeedToSchedule(VL)) &&
3652 "Bundle and VL out of sync");
3653 if (BundleMember) {
3654 for (Value *V : VL) {
3655 if (doesNotNeedToBeScheduled(V))
3656 continue;
3657 if (!BundleMember)
3658 continue;
3659 BundleMember->TE = Last;
3660 BundleMember = BundleMember->NextInBundle;
3661 }
3662 }
3663 assert(!BundleMember && "Bundle and VL out of sync");
3664 } else {
3665 // Build a map for gathered scalars to the nodes where they are used.
3666 bool AllConstsOrCasts = true;
3667 for (Value *V : VL)
3668 if (!isConstant(V)) {
3669 auto *I = dyn_cast<CastInst>(V);
3670 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3671 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3672 !UserTreeIdx.UserTE->isGather())
3673 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3674 }
3675 if (AllConstsOrCasts)
3676 CastMaxMinBWSizes =
3677 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3678 MustGather.insert(VL.begin(), VL.end());
3679 }
3680
3681 if (UserTreeIdx.UserTE)
3682 Last->UserTreeIndices.push_back(UserTreeIdx);
3683 return Last;
3684 }
3685
3686 /// -- Vectorization State --
3687 /// Holds all of the tree entries.
3688 TreeEntry::VecTreeTy VectorizableTree;
3689
3690#ifndef NDEBUG
3691 /// Debug printer.
3692 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3693 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3694 VectorizableTree[Id]->dump();
3695 dbgs() << "\n";
3696 }
3697 }
3698#endif
3699
3700 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3701
3702 const TreeEntry *getTreeEntry(Value *V) const {
3703 return ScalarToTreeEntry.lookup(V);
3704 }
3705
3706 /// Check that the operand nodes of an alternate-opcode node do not degenerate
3707 /// into buildvector sequences. If they do, it is probably not worth building an
3708 /// alternate shuffle when the number of buildvector operands plus the alternate
3709 /// instruction exceeds the number of buildvector instructions.
3710 /// \param S the instructions state of the analyzed values.
3711 /// \param VL list of the instructions with alternate opcodes.
3712 bool areAltOperandsProfitable(const InstructionsState &S,
3713 ArrayRef<Value *> VL) const;
3714
3715 /// Checks if the specified list of the instructions/values can be vectorized
3716 /// and fills required data before actual scheduling of the instructions.
3717 TreeEntry::EntryState
3718 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3719 bool IsScatterVectorizeUserTE,
3720 OrdersType &CurrentOrder,
3721 SmallVectorImpl<Value *> &PointerOps);
3722
3723 /// Maps a specific scalar to its tree entry.
3724 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3725
3726 /// List of scalars, used in several vectorize nodes, and the list of the
3727 /// nodes.
3728 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3729
3730 /// Maps a value to the proposed vectorizable size.
3731 SmallDenseMap<Value *, unsigned> InstrElementSize;
3732
3733 /// A list of scalars that we found that we need to keep as scalars.
3734 ValueSet MustGather;
3735
3736 /// A set of first non-schedulable values.
3737 ValueSet NonScheduledFirst;
3738
3739 /// A map between the vectorized entries and the last instructions in the
3740 /// bundles. The bundles are built in use order, not in the def order of the
3741 /// instructions. So, we cannot rely directly on the last instruction in the
3742 /// bundle being the last instruction in program order during the
3743 /// vectorization process, since the basic blocks are modified; these last
3744 /// instructions need to be pre-gathered beforehand.
3745 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3746
3747 /// List of gather nodes, depending on other gather/vector nodes, which should
3748 /// be emitted after the vector instruction emission process to correctly
3749 /// handle order of the vector instructions and shuffles.
3750 SetVector<const TreeEntry *> PostponedGathers;
3751
3752 using ValueToGatherNodesMap =
3753 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3754 ValueToGatherNodesMap ValueToGatherNodes;
3755
3756 /// A list of the load entries (node indices) that can be vectorized using a
3757 /// strided or masked-gather approach, but which we first attempt to represent
3758 /// as contiguous loads.
3759 SetVector<unsigned> LoadEntriesToVectorize;
3760
3761 /// true if graph nodes transforming mode is on.
3762 bool IsGraphTransformMode = false;
3763
3764 /// The index of the first gathered load entry in the VectorizableTree.
3765 std::optional<unsigned> GatheredLoadsEntriesFirst;
3766
3767 /// This POD struct describes one external user in the vectorized tree.
3768 struct ExternalUser {
3769 ExternalUser(Value *S, llvm::User *U, int L)
3770 : Scalar(S), User(U), Lane(L) {}
3771
3772 // Which scalar in our function.
3773 Value *Scalar;
3774
3775 // Which user that uses the scalar.
3776 llvm::User *User;
3777
3778 // Which lane does the scalar belong to.
3779 int Lane;
3780 };
3781 using UserList = SmallVector<ExternalUser, 16>;
3782
3783 /// Checks if two instructions may access the same memory.
3784 ///
3785 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3786 /// is invariant in the calling loop.
3787 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3788 Instruction *Inst2) {
3789 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3790 return true;
3791 // First check if the result is already in the cache.
3792 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3793 auto It = AliasCache.find(Key);
3794 if (It != AliasCache.end())
3795 return It->second;
3796 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3797 // Store the result in the cache.
3798 AliasCache.try_emplace(Key, Aliased);
3799 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3800 return Aliased;
3801 }
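 // Note: the result is cached under both (Inst1, Inst2) and (Inst2, Inst1),
 // so a later query with the operands swapped is answered from AliasCache
 // without consulting BatchAA again.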
3802
3803 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3804
3805 /// Cache for alias results.
3806 /// TODO: consider moving this to the AliasAnalysis itself.
3807 DenseMap<AliasCacheKey, bool> AliasCache;
3808
3809 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3810 // globally through SLP because we don't perform any action which
3811 // invalidates capture results.
3812 BatchAAResults BatchAA;
3813
3814 /// Temporary store for deleted instructions. Instructions will be deleted
3815 /// eventually when the BoUpSLP is destructed. The deferral is required to
3816 /// ensure that there are no incorrect collisions in the AliasCache, which
3817 /// can happen if a new instruction is allocated at the same address as a
3818 /// previously deleted instruction.
3819 DenseSet<Instruction *> DeletedInstructions;
3820
3821 /// Set of instructions already analyzed for reductions.
3822 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3823
3824 /// Set of hashes for the list of reduction values already being analyzed.
3825 DenseSet<size_t> AnalyzedReductionVals;
3826
3827 /// Values already analyzed for minimal bitwidth and found to be
3828 /// non-profitable.
3829 DenseSet<Value *> AnalyzedMinBWVals;
3830
3831 /// A list of values that need to extracted out of the tree.
3832 /// This list holds pairs of (Internal Scalar : External User). External User
3833 /// can be nullptr, it means that this Internal Scalar will be used later,
3834 /// after vectorization.
3835 UserList ExternalUses;
3836
3837 /// A list of GEPs which can be replaced by scalar GEPs instead of
3838 /// extractelement instructions.
3839 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3840
3841 /// Values used only by @llvm.assume calls.
3842 SmallPtrSet<const Value *, 32> EphValues;
3843
3844 /// Holds all of the instructions that we gathered, shuffle instructions and
3845 /// extractelements.
3846 SetVector<Instruction *> GatherShuffleExtractSeq;
3847
3848 /// A list of blocks that we are going to CSE.
3849 DenseSet<BasicBlock *> CSEBlocks;
3850
3852 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3852 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3853
3854 /// Contains all scheduling relevant data for an instruction.
3855 /// A ScheduleData either represents a single instruction or a member of an
3856 /// instruction bundle (= a group of instructions which is combined into a
3857 /// vector instruction).
3858 struct ScheduleData {
3859 // The initial value for the dependency counters. It means that the
3860 // dependencies are not calculated yet.
3861 enum { InvalidDeps = -1 };
3862
3863 ScheduleData() = default;
3864
3865 void init(int BlockSchedulingRegionID, Instruction *I) {
3866 FirstInBundle = this;
3867 NextInBundle = nullptr;
3868 NextLoadStore = nullptr;
3869 IsScheduled = false;
3870 SchedulingRegionID = BlockSchedulingRegionID;
3871 clearDependencies();
3872 Inst = I;
3873 TE = nullptr;
3874 }
3875
3876 /// Verify basic self consistency properties
3877 void verify() {
3878 if (hasValidDependencies()) {
3879 assert(UnscheduledDeps <= Dependencies && "invariant");
3880 } else {
3881 assert(UnscheduledDeps == Dependencies && "invariant");
3882 }
3883
3884 if (IsScheduled) {
3885 assert(isSchedulingEntity() &&
3886 "unexpected scheduled state");
3887 for (const ScheduleData *BundleMember = this; BundleMember;
3888 BundleMember = BundleMember->NextInBundle) {
3889 assert(BundleMember->hasValidDependencies() &&
3890 BundleMember->UnscheduledDeps == 0 &&
3891 "unexpected scheduled state");
3892 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3893 "only bundle is marked scheduled");
3894 }
3895 }
3896
3897 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3898 "all bundle members must be in same basic block");
3899 }
3900
3901 /// Returns true if the dependency information has been calculated.
3902 /// Note that dependency validity can vary between instructions within
3903 /// a single bundle.
3904 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3905
3906 /// Returns true for single instructions and for bundle representatives
3907 /// (= the head of a bundle).
3908 bool isSchedulingEntity() const { return FirstInBundle == this; }
3909
3910 /// Returns true if it represents an instruction bundle and not only a
3911 /// single instruction.
3912 bool isPartOfBundle() const {
3913 return NextInBundle != nullptr || FirstInBundle != this || TE;
3914 }
3915
3916 /// Returns true if it is ready for scheduling, i.e. it has no more
3917 /// unscheduled depending instructions/bundles.
3918 bool isReady() const {
3919 assert(isSchedulingEntity() &&
3920 "can't consider non-scheduling entity for ready list");
3921 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3922 }
3923
3924 /// Modifies the number of unscheduled dependencies for this instruction,
3925 /// and returns the number of remaining dependencies for the containing
3926 /// bundle.
3927 int incrementUnscheduledDeps(int Incr) {
3928 assert(hasValidDependencies() &&
3929 "increment of unscheduled deps would be meaningless");
3930 UnscheduledDeps += Incr;
3931 return FirstInBundle->unscheduledDepsInBundle();
3932 }
3933
3934 /// Sets the number of unscheduled dependencies to the number of
3935 /// dependencies.
3936 void resetUnscheduledDeps() {
3937 UnscheduledDeps = Dependencies;
3938 }
3939
3940 /// Clears all dependency information.
3941 void clearDependencies() {
3942 Dependencies = InvalidDeps;
3943 resetUnscheduledDeps();
3944 MemoryDependencies.clear();
3945 ControlDependencies.clear();
3946 }
3947
3948 int unscheduledDepsInBundle() const {
3949 assert(isSchedulingEntity() && "only meaningful on the bundle");
3950 int Sum = 0;
3951 for (const ScheduleData *BundleMember = this; BundleMember;
3952 BundleMember = BundleMember->NextInBundle) {
3953 if (BundleMember->UnscheduledDeps == InvalidDeps)
3954 return InvalidDeps;
3955 Sum += BundleMember->UnscheduledDeps;
3956 }
3957 return Sum;
3958 }
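 // Illustrative example: for a two-member bundle whose members have
 // Dependencies = 2 and 1, unscheduledDepsInBundle() on the bundle head
 // starts at 3; each scheduled dependency calls incrementUnscheduledDeps(-1),
 // and once the sum reaches 0 the bundle becomes isReady() (unless it is
 // already scheduled).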
3959
3960 void dump(raw_ostream &os) const {
3961 if (!isSchedulingEntity()) {
3962 os << "/ " << *Inst;
3963 } else if (NextInBundle) {
3964 os << '[' << *Inst;
3965 ScheduleData *SD = NextInBundle;
3966 while (SD) {
3967 os << ';' << *SD->Inst;
3968 SD = SD->NextInBundle;
3969 }
3970 os << ']';
3971 } else {
3972 os << *Inst;
3973 }
3974 }
3975
3976 Instruction *Inst = nullptr;
3977
3978 /// The TreeEntry that this instruction corresponds to.
3979 TreeEntry *TE = nullptr;
3980
3981 /// Points to the head in an instruction bundle (and always to this for
3982 /// single instructions).
3983 ScheduleData *FirstInBundle = nullptr;
3984
3985 /// Single linked list of all instructions in a bundle. Null if it is a
3986 /// single instruction.
3987 ScheduleData *NextInBundle = nullptr;
3988
3989 /// Single linked list of all memory instructions (e.g. load, store, call)
3990 /// in the block - until the end of the scheduling region.
3991 ScheduleData *NextLoadStore = nullptr;
3992
3993 /// The dependent memory instructions.
3994 /// This list is derived on demand in calculateDependencies().
3995 SmallVector<ScheduleData *, 4> MemoryDependencies;
3996
3997 /// List of instructions which this instruction could be control dependent
3998 /// on. Allowing such nodes to be scheduled below this one could introduce
3999 /// a runtime fault which didn't exist in the original program.
4000 /// ex: this is a load or udiv following a readonly call which inf loops
4001 SmallVector<ScheduleData *, 4> ControlDependencies;
4002
4003 /// This ScheduleData is in the current scheduling region if this matches
4004 /// the current SchedulingRegionID of BlockScheduling.
4005 int SchedulingRegionID = 0;
4006
4007 /// Used for getting a "good" final ordering of instructions.
4008 int SchedulingPriority = 0;
4009
4010 /// The number of dependencies. It consists of the number of users of the
4011 /// instruction plus the number of dependent memory instructions (if any).
4012 /// This value is calculated on demand.
4013 /// If InvalidDeps, the number of dependencies is not calculated yet.
4014 int Dependencies = InvalidDeps;
4015
4016 /// The number of dependencies minus the number of dependencies of scheduled
4017 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4018 /// for scheduling.
4019 /// Note that this is negative as long as Dependencies is not calculated.
4020 int UnscheduledDeps = InvalidDeps;
4021
4022 /// True if this instruction is scheduled (or considered as scheduled in the
4023 /// dry-run).
4024 bool IsScheduled = false;
4025 };
4026
4027#ifndef NDEBUG
4028 friend inline raw_ostream &operator<<(raw_ostream &os,
4029 const BoUpSLP::ScheduleData &SD) {
4030 SD.dump(os);
4031 return os;
4032 }
4033#endif
4034
4035 friend struct GraphTraits<BoUpSLP *>;
4036 friend struct DOTGraphTraits<BoUpSLP *>;
4037
4038 /// Contains all scheduling data for a basic block.
4039 /// It does not schedule instructions which are not memory read/write
4040 /// instructions and whose operands are either constants, arguments, phis, or
4041 /// instructions from other blocks, or whose users are phis or live in other
4042 /// blocks. The resulting vector instructions can be placed at the
4043 /// beginning of the basic block without scheduling (if the operands do not need
4044 /// to be scheduled) or at the end of the block (if the users are outside of the
4045 /// block). This saves some compile time and memory used by the
4046 /// compiler.
4047 /// ScheduleData is assigned for each instruction in between the boundaries of
4048 /// the tree entry, even for those which are not part of the graph. It is
4049 /// required to correctly follow the dependencies between the instructions and
4050 /// to ensure their correct scheduling. ScheduleData is not allocated for
4051 /// instructions which do not require scheduling, like phis, nodes with
4052 /// extractelements/insertelements only, or nodes whose instructions have
4053 /// uses/operands outside of the block.
4054 struct BlockScheduling {
4055 BlockScheduling(BasicBlock *BB)
4056 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4057
4058 void clear() {
4059 ReadyInsts.clear();
4060 ScheduleStart = nullptr;
4061 ScheduleEnd = nullptr;
4062 FirstLoadStoreInRegion = nullptr;
4063 LastLoadStoreInRegion = nullptr;
4064 RegionHasStackSave = false;
4065
4066 // Reduce the maximum schedule region size by the size of the
4067 // previous scheduling run.
4068 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4069 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4070 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4071 ScheduleRegionSize = 0;
4072
4073 // Make a new scheduling region, i.e. all existing ScheduleData is not
4074 // in the new region yet.
4075 ++SchedulingRegionID;
4076 }
4077
4078 ScheduleData *getScheduleData(Instruction *I) {
4079 if (BB != I->getParent())
4080 // Avoid lookup if can't possibly be in map.
4081 return nullptr;
4082 ScheduleData *SD = ScheduleDataMap.lookup(I);
4083 if (SD && isInSchedulingRegion(SD))
4084 return SD;
4085 return nullptr;
4086 }
4087
4088 ScheduleData *getScheduleData(Value *V) {
4089 if (auto *I = dyn_cast<Instruction>(V))
4090 return getScheduleData(I);
4091 return nullptr;
4092 }
4093
4094 bool isInSchedulingRegion(ScheduleData *SD) const {
4095 return SD->SchedulingRegionID == SchedulingRegionID;
4096 }
4097
4098 /// Marks an instruction as scheduled and puts all dependent ready
4099 /// instructions into the ready-list.
4100 template <typename ReadyListType>
4101 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4102 SD->IsScheduled = true;
4103 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4104
4105 for (ScheduleData *BundleMember = SD; BundleMember;
4106 BundleMember = BundleMember->NextInBundle) {
4107
4108 // Handle the def-use chain dependencies.
4109
4110 // Decrement the unscheduled counter and insert to ready list if ready.
4111 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4112 ScheduleData *OpDef = getScheduleData(I);
4113 if (OpDef && OpDef->hasValidDependencies() &&
4114 OpDef->incrementUnscheduledDeps(-1) == 0) {
4115 // There are no more unscheduled dependencies after
4116 // decrementing, so we can put the dependent instruction
4117 // into the ready list.
4118 ScheduleData *DepBundle = OpDef->FirstInBundle;
4119 assert(!DepBundle->IsScheduled &&
4120 "already scheduled bundle gets ready");
4121 ReadyList.insert(DepBundle);
4123 << "SLP: gets ready (def): " << *DepBundle << "\n");
4124 }
4125 };
4126
4127 // If BundleMember is a vector bundle, its operands may have been
4128 // reordered during buildTree(). We therefore need to get its operands
4129 // through the TreeEntry.
4130 if (TreeEntry *TE = BundleMember->TE) {
4131 // Need to search for the lane since the tree entry can be reordered.
4132 int Lane = std::distance(TE->Scalars.begin(),
4133 find(TE->Scalars, BundleMember->Inst));
4134 assert(Lane >= 0 && "Lane not set");
4135
4136 // Since vectorization tree is being built recursively this assertion
4137 // ensures that the tree entry has all operands set before reaching
4138 // this code. Couple of exceptions known at the moment are extracts
4139 // where their second (immediate) operand is not added. Since
4140 // immediates do not affect scheduler behavior this is considered
4141 // okay.
4142 auto *In = BundleMember->Inst;
4143 assert(
4144 In &&
4145 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4146 In->getNumOperands() == TE->getNumOperands()) &&
4147 "Missed TreeEntry operands?");
4148 (void)In; // fake use to avoid build failure when assertions disabled
4149
4150 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4151 OpIdx != NumOperands; ++OpIdx)
4152 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4153 DecrUnsched(I);
4154 } else {
4155 // If BundleMember is a stand-alone instruction, no operand reordering
4156 // has taken place, so we directly access its operands.
4157 for (Use &U : BundleMember->Inst->operands())
4158 if (auto *I = dyn_cast<Instruction>(U.get()))
4159 DecrUnsched(I);
4160 }
4161 // Handle the memory dependencies.
4162 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4163 if (MemoryDepSD->hasValidDependencies() &&
4164 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4165 // There are no more unscheduled dependencies after decrementing,
4166 // so we can put the dependent instruction into the ready list.
4167 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4168 assert(!DepBundle->IsScheduled &&
4169 "already scheduled bundle gets ready");
4170 ReadyList.insert(DepBundle);
4172 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4173 }
4174 }
4175 // Handle the control dependencies.
4176 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4177 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4178 // There are no more unscheduled dependencies after decrementing,
4179 // so we can put the dependent instruction into the ready list.
4180 ScheduleData *DepBundle = DepSD->FirstInBundle;
4181 assert(!DepBundle->IsScheduled &&
4182 "already scheduled bundle gets ready");
4183 ReadyList.insert(DepBundle);
4185 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4186 }
4187 }
4188 }
4189 }
4190
4191 /// Verify basic self consistency properties of the data structure.
4192 void verify() {
4193 if (!ScheduleStart)
4194 return;
4195
4196 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4197 ScheduleStart->comesBefore(ScheduleEnd) &&
4198 "Not a valid scheduling region?");
4199
4200 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4201 auto *SD = getScheduleData(I);
4202 if (!SD)
4203 continue;
4204 assert(isInSchedulingRegion(SD) &&
4205 "primary schedule data not in window?");
4206 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4207 "entire bundle in window!");
4208 SD->verify();
4209 }
4210
4211 for (auto *SD : ReadyInsts) {
4212 assert(SD->isSchedulingEntity() && SD->isReady() &&
4213 "item in ready list not ready?");
4214 (void)SD;
4215 }
4216 }
4217
4218 /// Put all instructions into the ReadyList which are ready for scheduling.
4219 template <typename ReadyListType>
4220 void initialFillReadyList(ReadyListType &ReadyList) {
4221 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4222 ScheduleData *SD = getScheduleData(I);
4223 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4224 SD->isReady()) {
4225 ReadyList.insert(SD);
4227 << "SLP: initially in ready list: " << *SD << "\n");
4228 }
4229 }
4230 }
4231
4232 /// Build a bundle from the ScheduleData nodes corresponding to the
4233 /// scalar instruction for each lane.
4234 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4235
4236 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4237 /// cyclic dependencies. This is only a dry-run, no instructions are
4238 /// actually moved at this stage.
4239 /// \returns the scheduling bundle. The returned Optional value is not
4240 /// std::nullopt if \p VL is allowed to be scheduled.
4241 std::optional<ScheduleData *>
4242 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4243 const InstructionsState &S);
4244
4245 /// Un-bundles a group of instructions.
4246 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4247
4248 /// Allocates schedule data chunk.
4249 ScheduleData *allocateScheduleDataChunks();
4250
4251 /// Extends the scheduling region so that V is inside the region.
4252 /// \returns true if the region size is within the limit.
4253 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4254
4255 /// Initialize the ScheduleData structures for new instructions in the
4256 /// scheduling region.
4257 void initScheduleData(Instruction *FromI, Instruction *ToI,
4258 ScheduleData *PrevLoadStore,
4259 ScheduleData *NextLoadStore);
4260
4261 /// Updates the dependency information of a bundle and of all instructions/
4262 /// bundles which depend on the original bundle.
4263 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4264 BoUpSLP *SLP);
4265
4266 /// Sets all instructions in the scheduling region to un-scheduled.
4267 void resetSchedule();
4268
4269 BasicBlock *BB;
4270
4271 /// Simple memory allocation for ScheduleData.
4272 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4273
4274 /// The size of a ScheduleData array in ScheduleDataChunks.
4275 int ChunkSize;
4276
4277 /// The allocator position in the current chunk, which is the last entry
4278 /// of ScheduleDataChunks.
4279 int ChunkPos;
4280
4281 /// Attaches ScheduleData to Instruction.
4282 /// Note that the mapping survives during all vectorization iterations, i.e.
4283 /// ScheduleData structures are recycled.
4284 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4285
4286 /// The ready-list for scheduling (only used for the dry-run).
4287 SetVector<ScheduleData *> ReadyInsts;
4288
4289 /// The first instruction of the scheduling region.
4290 Instruction *ScheduleStart = nullptr;
4291
4292 /// The first instruction _after_ the scheduling region.
4293 Instruction *ScheduleEnd = nullptr;
4294
4295 /// The first memory accessing instruction in the scheduling region
4296 /// (can be null).
4297 ScheduleData *FirstLoadStoreInRegion = nullptr;
4298
4299 /// The last memory accessing instruction in the scheduling region
4300 /// (can be null).
4301 ScheduleData *LastLoadStoreInRegion = nullptr;
4302
4303 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4304 /// region? Used to optimize the dependence calculation for the
4305 /// common case where there isn't.
4306 bool RegionHasStackSave = false;
4307
4308 /// The current size of the scheduling region.
4309 int ScheduleRegionSize = 0;
4310
4311 /// The maximum size allowed for the scheduling region.
4312 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4313
4314 /// The ID of the scheduling region. For a new vectorization iteration this
4315 /// is incremented which "removes" all ScheduleData from the region.
4316 /// Make sure that the initial SchedulingRegionID is greater than the
4317 /// initial SchedulingRegionID in ScheduleData (which is 0).
4318 int SchedulingRegionID = 1;
4319 };
4320
4321 /// Attaches the BlockScheduling structures to basic blocks.
4322 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4323
4324 /// Performs the "real" scheduling. Done before vectorization is actually
4325 /// performed in a basic block.
4326 void scheduleBlock(BlockScheduling *BS);
4327
4328 /// List of users to ignore during scheduling and that don't need extracting.
4329 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4330
4331 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4332 /// sorted SmallVectors of unsigned.
4333 struct OrdersTypeDenseMapInfo {
4334 static OrdersType getEmptyKey() {
4335 OrdersType V;
4336 V.push_back(~1U);
4337 return V;
4338 }
4339
4340 static OrdersType getTombstoneKey() {
4341 OrdersType V;
4342 V.push_back(~2U);
4343 return V;
4344 }
4345
4346 static unsigned getHashValue(const OrdersType &V) {
4347 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4348 }
4349
4350 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4351 return LHS == RHS;
4352 }
4353 };
4354
4355 // Analysis and block reference.
4356 Function *F;
4357 ScalarEvolution *SE;
4358 TargetTransformInfo *TTI;
4359 TargetLibraryInfo *TLI;
4360 LoopInfo *LI;
4361 DominatorTree *DT;
4362 AssumptionCache *AC;
4363 DemandedBits *DB;
4364 const DataLayout *DL;
4365 OptimizationRemarkEmitter *ORE;
4366
4367 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4368 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4369
4370 /// Instruction builder to construct the vectorized tree.
4371 IRBuilder<TargetFolder> Builder;
4372
4373 /// A map of scalar integer values to the smallest bit width with which they
4374 /// can legally be represented. The values map to (width, signed) pairs,
4375 /// where "width" indicates the minimum bit width and "signed" is True if the
4376 /// value must be signed-extended, rather than zero-extended, back to its
4377 /// original width.
4378 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4379
4380 /// Final size of the reduced vector, if the current graph represents the
4381 /// input for the reduction and it was possible to narrow the size of the
4382 /// reduction.
4383 unsigned ReductionBitWidth = 0;
4384
4385 /// Canonical graph size before the transformations.
4386 unsigned BaseGraphSize = 1;
4387
4388 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4389 /// type sizes, used in the tree.
4390 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4391
4392 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
4393 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4394 DenseSet<unsigned> ExtraBitWidthNodes;
4395};
4396
4397} // end namespace slpvectorizer
4398
4399template <> struct GraphTraits<BoUpSLP *> {
4400 using TreeEntry = BoUpSLP::TreeEntry;
4401
4402 /// NodeRef has to be a pointer per the GraphWriter.
4403 using NodeRef = TreeEntry *;
4404
4405 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4406
4407 /// Add the VectorizableTree to the index iterator to be able to return
4408 /// TreeEntry pointers.
4409 struct ChildIteratorType
4410 : public iterator_adaptor_base<
4411 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4412 ContainerTy &VectorizableTree;
4413
4414 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4415 ContainerTy &VT)
4416 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4417
4418 NodeRef operator*() { return I->UserTE; }
4419 };
4420
4421 static NodeRef getEntryNode(BoUpSLP &R) {
4422 return R.VectorizableTree[0].get();
4423 }
4424
4425 static ChildIteratorType child_begin(NodeRef N) {
4426 return {N->UserTreeIndices.begin(), N->Container};
4427 }
4428
4429 static ChildIteratorType child_end(NodeRef N) {
4430 return {N->UserTreeIndices.end(), N->Container};
4431 }
4432
4433 /// For the node iterator we just need to turn the TreeEntry iterator into a
4434 /// TreeEntry* iterator so that it dereferences to NodeRef.
4435 class nodes_iterator {
4436 using ItTy = ContainerTy::iterator;
4437 ItTy It;
4438
4439 public:
4440 nodes_iterator(const ItTy &It2) : It(It2) {}
4441 NodeRef operator*() { return It->get(); }
4442 nodes_iterator operator++() {
4443 ++It;
4444 return *this;
4445 }
4446 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4447 };
4448
4449 static nodes_iterator nodes_begin(BoUpSLP *R) {
4450 return nodes_iterator(R->VectorizableTree.begin());
4451 }
4452
4453 static nodes_iterator nodes_end(BoUpSLP *R) {
4454 return nodes_iterator(R->VectorizableTree.end());
4455 }
4456
4457 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4458};
4459
4460template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4461 using TreeEntry = BoUpSLP::TreeEntry;
4462
4463 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4464
4465 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4466 std::string Str;
4467 raw_string_ostream OS(Str);
4468 OS << Entry->Idx << ".\n";
4469 if (isSplat(Entry->Scalars))
4470 OS << "<splat> ";
4471 for (auto *V : Entry->Scalars) {
4472 OS << *V;
4473 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4474 return EU.Scalar == V;
4475 }))
4476 OS << " <extract>";
4477 OS << "\n";
4478 }
4479 return Str;
4480 }
4481
4482 static std::string getNodeAttributes(const TreeEntry *Entry,
4483 const BoUpSLP *) {
4484 if (Entry->isGather())
4485 return "color=red";
4486 if (Entry->State == TreeEntry::ScatterVectorize ||
4487 Entry->State == TreeEntry::StridedVectorize)
4488 return "color=blue";
4489 return "";
4490 }
4491};
4492
4493} // end namespace llvm
4494
4495BoUpSLP::~BoUpSLP() {
4496 SmallVector<WeakTrackingVH> DeadInsts;
4497 for (auto *I : DeletedInstructions) {
4498 if (!I->getParent()) {
4499 // Temporarily insert instructions back to erase them from the parent and
4500 // from memory later.
4501 if (isa<PHINode>(I))
4502 // Phi nodes must be the very first instructions in the block.
4503 I->insertBefore(F->getEntryBlock(),
4504 F->getEntryBlock().getFirstNonPHIIt());
4505 else
4506 I->insertBefore(F->getEntryBlock().getTerminator());
4507 continue;
4508 }
4509 for (Use &U : I->operands()) {
4510 auto *Op = dyn_cast<Instruction>(U.get());
4511 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 4512           wouldInstructionBeTriviallyDead(Op, TLI))
 4513         DeadInsts.emplace_back(Op);
4514 }
4515 I->dropAllReferences();
4516 }
4517 for (auto *I : DeletedInstructions) {
4518 assert(I->use_empty() &&
4519 "trying to erase instruction with users.");
4520 I->eraseFromParent();
4521 }
4522
4523 // Cleanup any dead scalar code feeding the vectorized instructions
 4524   RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
 4525
4526#ifdef EXPENSIVE_CHECKS
4527 // If we could guarantee that this call is not extremely slow, we could
4528 // remove the ifdef limitation (see PR47712).
4529 assert(!verifyFunction(*F, &dbgs()));
4530#endif
4531}
4532
4533/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
 4534 /// contains the original mask for the scalars reused in the node. The
 4535 /// procedure transforms this mask in accordance with the given \p Mask.
 4536 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
 4537   assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4538 "Expected non-empty mask.");
4539 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4540 Prev.swap(Reuses);
4541 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4542 if (Mask[I] != PoisonMaskElem)
4543 Reuses[Mask[I]] = Prev[I];
4544}
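// Worked example (illustrative): with
//   Reuses = {0, 0, 1, 1} and Mask = {2, 3, 0, 1}
// the loop performs Reuses[Mask[I]] = Prev[I], so the old values land at the
// positions selected by the mask and Reuses becomes {1, 1, 0, 0}. Mask entries
// equal to PoisonMaskElem contribute nothing, and Reuses slots that no mask
// entry targets keep their previous values.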
4545
 4546 /// Reorders the given \p Order according to the given \p Mask. \p Order is
 4547 /// the original order of the scalars. The procedure transforms the provided
 4548 /// order in accordance with the given \p Mask. If the resulting \p Order is
 4549 /// just an identity order, \p Order is cleared.
 4550 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 4551                          bool BottomOrder = false) {
4552 assert(!Mask.empty() && "Expected non-empty mask.");
4553 unsigned Sz = Mask.size();
4554 if (BottomOrder) {
4555 SmallVector<unsigned> PrevOrder;
4556 if (Order.empty()) {
4557 PrevOrder.resize(Sz);
4558 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4559 } else {
4560 PrevOrder.swap(Order);
4561 }
4562 Order.assign(Sz, Sz);
4563 for (unsigned I = 0; I < Sz; ++I)
4564 if (Mask[I] != PoisonMaskElem)
4565 Order[I] = PrevOrder[Mask[I]];
4566 if (all_of(enumerate(Order), [&](const auto &Data) {
4567 return Data.value() == Sz || Data.index() == Data.value();
4568 })) {
4569 Order.clear();
4570 return;
4571 }
4572 fixupOrderingIndices(Order);
4573 return;
4574 }
4575 SmallVector<int> MaskOrder;
4576 if (Order.empty()) {
4577 MaskOrder.resize(Sz);
4578 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4579 } else {
4580 inversePermutation(Order, MaskOrder);
4581 }
4582 reorderReuses(MaskOrder, Mask);
4583 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4584 Order.clear();
4585 return;
4586 }
4587 Order.assign(Sz, Sz);
4588 for (unsigned I = 0; I < Sz; ++I)
4589 if (MaskOrder[I] != PoisonMaskElem)
4590 Order[MaskOrder[I]] = I;
4591 fixupOrderingIndices(Order);
4592}
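// Worked example (illustrative): with an empty \p Order (identity) and the
// permutation Mask = {1, 2, 3, 0}, MaskOrder becomes the inverse permutation
// {3, 0, 1, 2} and the final Order is {1, 2, 3, 0}; had the combined result
// been an identity mask, Order would have been cleared instead.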
4593
4594std::optional<BoUpSLP::OrdersType>
4595BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4596 assert(TE.isGather() && "Expected gather node only.");
4597 // Try to find subvector extract/insert patterns and reorder only such
4598 // patterns.
4599 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4600 Type *ScalarTy = GatheredScalars.front()->getType();
4601 int NumScalars = GatheredScalars.size();
4602 if (!isValidElementType(ScalarTy))
4603 return std::nullopt;
4604 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4605 int NumParts = TTI->getNumberOfParts(VecTy);
4606 if (NumParts == 0 || NumParts >= NumScalars ||
4607 VecTy->getNumElements() % NumParts != 0 ||
4608 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4609 VecTy->getNumElements() / NumParts))
4610 NumParts = 1;
4611 SmallVector<int> ExtractMask;
4612 SmallVector<int> Mask;
 4613   SmallVector<SmallVector<const TreeEntry *>> Entries;
 4614   SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
 4615       tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
 4616   SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
 4617       isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4618 /*ForOrder=*/true);
4619 // No shuffled operands - ignore.
4620 if (GatherShuffles.empty() && ExtractShuffles.empty())
4621 return std::nullopt;
4622 OrdersType CurrentOrder(NumScalars, NumScalars);
4623 if (GatherShuffles.size() == 1 &&
4624 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4625 Entries.front().front()->isSame(TE.Scalars)) {
4626 // Perfect match in the graph, will reuse the previously vectorized
4627 // node. Cost is 0.
4628 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4629 return CurrentOrder;
4630 }
4631 auto IsSplatMask = [](ArrayRef<int> Mask) {
4632 int SingleElt = PoisonMaskElem;
4633 return all_of(Mask, [&](int I) {
4634 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4635 SingleElt = I;
4636 return I == PoisonMaskElem || I == SingleElt;
4637 });
4638 };
4639 // Exclusive broadcast mask - ignore.
4640 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4641 (Entries.size() != 1 ||
4642 Entries.front().front()->ReorderIndices.empty())) ||
4643 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4644 return std::nullopt;
4645 SmallBitVector ShuffledSubMasks(NumParts);
4646 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4647 ArrayRef<int> Mask, int PartSz, int NumParts,
4648 function_ref<unsigned(unsigned)> GetVF) {
4649 for (int I : seq<int>(0, NumParts)) {
4650 if (ShuffledSubMasks.test(I))
4651 continue;
4652 const int VF = GetVF(I);
4653 if (VF == 0)
4654 continue;
4655 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4656 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4657 // Shuffle of at least 2 vectors - ignore.
4658 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4659 std::fill(Slice.begin(), Slice.end(), NumScalars);
4660 ShuffledSubMasks.set(I);
4661 continue;
4662 }
 4663       // Try to include as many elements from the mask as possible.
4664 int FirstMin = INT_MAX;
 4665       bool SecondVecFound = false;
4666 for (int K : seq<int>(Limit)) {
4667 int Idx = Mask[I * PartSz + K];
4668 if (Idx == PoisonMaskElem) {
4669 Value *V = GatheredScalars[I * PartSz + K];
4670 if (isConstant(V) && !isa<PoisonValue>(V)) {
4671 SecondVecFound = true;
4672 break;
4673 }
4674 continue;
4675 }
4676 if (Idx < VF) {
4677 if (FirstMin > Idx)
4678 FirstMin = Idx;
4679 } else {
4680 SecondVecFound = true;
4681 break;
4682 }
4683 }
4684 FirstMin = (FirstMin / PartSz) * PartSz;
4685 // Shuffle of at least 2 vectors - ignore.
4686 if (SecondVecFound) {
4687 std::fill(Slice.begin(), Slice.end(), NumScalars);
4688 ShuffledSubMasks.set(I);
4689 continue;
4690 }
4691 for (int K : seq<int>(Limit)) {
4692 int Idx = Mask[I * PartSz + K];
4693 if (Idx == PoisonMaskElem)
4694 continue;
4695 Idx -= FirstMin;
4696 if (Idx >= PartSz) {
4697 SecondVecFound = true;
4698 break;
4699 }
4700 if (CurrentOrder[I * PartSz + Idx] >
4701 static_cast<unsigned>(I * PartSz + K) &&
4702 CurrentOrder[I * PartSz + Idx] !=
4703 static_cast<unsigned>(I * PartSz + Idx))
4704 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4705 }
4706 // Shuffle of at least 2 vectors - ignore.
4707 if (SecondVecFound) {
4708 std::fill(Slice.begin(), Slice.end(), NumScalars);
4709 ShuffledSubMasks.set(I);
4710 continue;
4711 }
4712 }
4713 };
4714 int PartSz = getPartNumElems(NumScalars, NumParts);
4715 if (!ExtractShuffles.empty())
4716 TransformMaskToOrder(
4717 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4718 if (!ExtractShuffles[I])
4719 return 0U;
4720 unsigned VF = 0;
4721 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4722 for (unsigned Idx : seq<unsigned>(Sz)) {
4723 int K = I * PartSz + Idx;
4724 if (ExtractMask[K] == PoisonMaskElem)
4725 continue;
4726 if (!TE.ReuseShuffleIndices.empty())
4727 K = TE.ReuseShuffleIndices[K];
4728 if (K == PoisonMaskElem)
4729 continue;
4730 if (!TE.ReorderIndices.empty())
4731 K = std::distance(TE.ReorderIndices.begin(),
4732 find(TE.ReorderIndices, K));
4733 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4734 if (!EI)
4735 continue;
4736 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4737 ->getElementCount()
4738 .getKnownMinValue());
4739 }
4740 return VF;
4741 });
4742 // Check special corner case - single shuffle of the same entry.
4743 if (GatherShuffles.size() == 1 && NumParts != 1) {
4744 if (ShuffledSubMasks.any())
4745 return std::nullopt;
4746 PartSz = NumScalars;
4747 NumParts = 1;
4748 }
4749 if (!Entries.empty())
4750 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4751 if (!GatherShuffles[I])
4752 return 0U;
4753 return std::max(Entries[I].front()->getVectorFactor(),
4754 Entries[I].back()->getVectorFactor());
4755 });
4756 int NumUndefs =
4757 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4758 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4759 return std::nullopt;
4760 return std::move(CurrentOrder);
4761}
4762
4763static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4764 const TargetLibraryInfo &TLI,
4765 bool CompareOpcodes = true) {
 4766   if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
 4767       getUnderlyingObject(Ptr2, RecursionMaxDepth))
 4768     return false;
4769 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4770 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4771 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4772 (!GEP2 || GEP2->getNumOperands() == 2) &&
4773 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4774 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4775 !CompareOpcodes ||
4776 (GEP1 && GEP2 &&
4777 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4778 .getOpcode()));
4779}
4780
4781/// Calculates minimal alignment as a common alignment.
4782template <typename T>
 4783 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
 4784   Align CommonAlignment = cast<T>(VL.front())->getAlign();
4785 for (Value *V : VL.drop_front())
4786 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4787 return CommonAlignment;
4788}
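// Illustrative: for a bundle of loads aligned to 16, 8 and 4 bytes this
// returns Align(4), i.e. the minimum alignment over the whole bundle.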
4789
4790/// Check if \p Order represents reverse order.
 4791 static bool isReverseOrder(ArrayRef<unsigned> Order) {
 4792   assert(!Order.empty() &&
4793 "Order is empty. Please check it before using isReverseOrder.");
4794 unsigned Sz = Order.size();
4795 return all_of(enumerate(Order), [&](const auto &Pair) {
4796 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4797 });
4798}
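// Illustrative: for Sz == 4 the order {3, 2, 1, 0} is reverse; entries equal
// to Sz mark unset positions and are ignored, so {4, 2, 1, 0} also counts as
// a reverse order.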
4799
 4800 /// Checks if the provided list of pointers \p PointerOps represents strided
 4801 /// pointers for type ElemTy. If they are not, std::nullopt is returned.
 4802 /// Otherwise, if \p Inst is not specified, an engaged optional value is
 4803 /// returned just to show that the pointers are strided. If \p Inst is
 4804 /// specified, the runtime stride is materialized before the given \p Inst.
 4805 /// \returns std::nullopt if the pointers have no runtime stride; otherwise
 4806 /// nullptr or the actual stride value.
4807static std::optional<Value *>
 4808 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 4809                   const DataLayout &DL, ScalarEvolution &SE,
4810 SmallVectorImpl<unsigned> &SortedIndices,
4811 Instruction *Inst = nullptr) {
 4812   SmallVector<const SCEV *> SCEVs;
 4813   const SCEV *PtrSCEVLowest = nullptr;
4814 const SCEV *PtrSCEVHighest = nullptr;
4815 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4816 // addresses).
4817 for (Value *Ptr : PointerOps) {
4818 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4819 if (!PtrSCEV)
4820 return std::nullopt;
4821 SCEVs.push_back(PtrSCEV);
4822 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4823 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4824 continue;
4825 }
4826 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4827 if (isa<SCEVCouldNotCompute>(Diff))
4828 return std::nullopt;
4829 if (Diff->isNonConstantNegative()) {
4830 PtrSCEVLowest = PtrSCEV;
4831 continue;
4832 }
4833 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4834 if (isa<SCEVCouldNotCompute>(Diff1))
4835 return std::nullopt;
4836 if (Diff1->isNonConstantNegative()) {
4837 PtrSCEVHighest = PtrSCEV;
4838 continue;
4839 }
4840 }
4841 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4842 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4843 if (isa<SCEVCouldNotCompute>(Dist))
4844 return std::nullopt;
4845 int Size = DL.getTypeStoreSize(ElemTy);
4846 auto TryGetStride = [&](const SCEV *Dist,
4847 const SCEV *Multiplier) -> const SCEV * {
4848 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4849 if (M->getOperand(0) == Multiplier)
4850 return M->getOperand(1);
4851 if (M->getOperand(1) == Multiplier)
4852 return M->getOperand(0);
4853 return nullptr;
4854 }
4855 if (Multiplier == Dist)
4856 return SE.getConstant(Dist->getType(), 1);
4857 return SE.getUDivExactExpr(Dist, Multiplier);
4858 };
 4859   // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4860 const SCEV *Stride = nullptr;
4861 if (Size != 1 || SCEVs.size() > 2) {
4862 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4863 Stride = TryGetStride(Dist, Sz);
4864 if (!Stride)
4865 return std::nullopt;
4866 }
4867 if (!Stride || isa<SCEVConstant>(Stride))
4868 return std::nullopt;
4869 // Iterate through all pointers and check if all distances are
4870 // unique multiple of Stride.
4871 using DistOrdPair = std::pair<int64_t, int>;
4872 auto Compare = llvm::less_first();
4873 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4874 int Cnt = 0;
4875 bool IsConsecutive = true;
4876 for (const SCEV *PtrSCEV : SCEVs) {
4877 unsigned Dist = 0;
4878 if (PtrSCEV != PtrSCEVLowest) {
4879 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4880 const SCEV *Coeff = TryGetStride(Diff, Stride);
4881 if (!Coeff)
4882 return std::nullopt;
4883 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4884 if (!SC || isa<SCEVCouldNotCompute>(SC))
4885 return std::nullopt;
4886 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4887 SE.getMulExpr(Stride, SC)))
4888 ->isZero())
4889 return std::nullopt;
4890 Dist = SC->getAPInt().getZExtValue();
4891 }
4892 // If the strides are not the same or repeated, we can't vectorize.
4893 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4894 return std::nullopt;
4895 auto Res = Offsets.emplace(Dist, Cnt);
4896 if (!Res.second)
4897 return std::nullopt;
4898 // Consecutive order if the inserted element is the last one.
4899 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4900 ++Cnt;
4901 }
4902 if (Offsets.size() != SCEVs.size())
4903 return std::nullopt;
4904 SortedIndices.clear();
4905 if (!IsConsecutive) {
4906 // Fill SortedIndices array only if it is non-consecutive.
4907 SortedIndices.resize(PointerOps.size());
4908 Cnt = 0;
4909 for (const std::pair<int64_t, int> &Pair : Offsets) {
4910 SortedIndices[Cnt] = Pair.second;
4911 ++Cnt;
4912 }
4913 }
4914 if (!Inst)
4915 return nullptr;
4916 SCEVExpander Expander(SE, DL, "strided-load-vec");
4917 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4918}
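// Worked example (illustrative): four i32 pointers at byte offsets
// 0, 4*s, 8*s and 12*s from the lowest address, where s is a runtime value.
// Then Dist = 12*s, the candidate stride is Dist / (4 * 3) = s, and the
// per-pointer coefficients 0, 4, 8, 12 are distinct multiples of the element
// size, so the pointers are accepted; if \p Inst is given, the returned value
// is the expansion of s before \p Inst.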
4919
4920static std::pair<InstructionCost, InstructionCost>
 4921 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 4922             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4923 Type *ScalarTy, VectorType *VecTy);
4924
4925/// Returns the cost of the shuffle instructions with the given \p Kind, vector
 4926 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
 4927 /// insert subvector pattern.
4928static InstructionCost
 4929 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
 4930                VectorType *Tp, ArrayRef<int> Mask = {},
 4931                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
 4932                int Index = 0, VectorType *SubTp = nullptr,
 4933                ArrayRef<const Value *> Args = {}) {
 4934   if (Kind != TTI::SK_PermuteTwoSrc)
4935 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4936 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4937 int NumSubElts;
 4938   if (ShuffleVectorInst::isInsertSubvectorMask(
 4939           Mask, NumSrcElts, NumSubElts, Index)) {
4940 if (Index + NumSubElts > NumSrcElts &&
4941 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4942 return TTI.getShuffleCost(
 4943           TTI::SK_InsertSubvector,
 4944           getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4946 }
4947 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4948}
4949
 4950 BoUpSLP::LoadsState
 4951 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
 4952                            SmallVectorImpl<unsigned> &Order,
 4953                            SmallVectorImpl<Value *> &PointerOps,
4954 unsigned *BestVF, bool TryRecursiveCheck) const {
4955 // Check that a vectorized load would load the same memory as a scalar
4956 // load. For example, we don't want to vectorize loads that are smaller
 4957   // than 8-bit. Even though we may have a packed struct {<i2, i2, i2, i2>}, LLVM
4958 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4959 // from such a struct, we read/write packed bits disagreeing with the
4960 // unvectorized version.
4961 if (BestVF)
4962 *BestVF = 0;
4964 return LoadsState::Gather;
4965 Type *ScalarTy = VL0->getType();
4966
4967 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4968 return LoadsState::Gather;
4969
4970 // Make sure all loads in the bundle are simple - we can't vectorize
4971 // atomic or volatile loads.
4972 PointerOps.clear();
4973 const unsigned Sz = VL.size();
4974 PointerOps.resize(Sz);
4975 auto *POIter = PointerOps.begin();
4976 for (Value *V : VL) {
4977 auto *L = dyn_cast<LoadInst>(V);
4978 if (!L || !L->isSimple())
4979 return LoadsState::Gather;
4980 *POIter = L->getPointerOperand();
4981 ++POIter;
4982 }
4983
4984 Order.clear();
4985 // Check the order of pointer operands or that all pointers are the same.
4986 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4987
4988 auto *VecTy = getWidenedType(ScalarTy, Sz);
4989 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4990 if (!IsSorted) {
4991 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
4992 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4993 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
 4994         return LoadsState::StridedVectorize;
 4995     }
4996
4997 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
4998 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
4999 return LoadsState::Gather;
5000
5001 if (!all_of(PointerOps, [&](Value *P) {
5002 return arePointersCompatible(P, PointerOps.front(), *TLI);
5003 }))
5004 return LoadsState::Gather;
5005
5006 } else {
5007 Value *Ptr0;
5008 Value *PtrN;
5009 if (Order.empty()) {
5010 Ptr0 = PointerOps.front();
5011 PtrN = PointerOps.back();
5012 } else {
5013 Ptr0 = PointerOps[Order.front()];
5014 PtrN = PointerOps[Order.back()];
5015 }
5016 std::optional<int> Diff =
5017 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5018 // Check that the sorted loads are consecutive.
5019 if (static_cast<unsigned>(*Diff) == Sz - 1)
5020 return LoadsState::Vectorize;
5021 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5022 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5023 return LoadsState::Gather;
5024 // Simple check if not a strided access - clear order.
5025 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5026 // Try to generate strided load node if:
5027 // 1. Target with strided load support is detected.
5028 // 2. The number of loads is greater than MinProfitableStridedLoads,
5029 // or the potential stride <= MaxProfitableLoadStride and the
5030 // potential stride is power-of-2 (to avoid perf regressions for the very
5031 // small number of loads) and max distance > number of loads, or potential
5032 // stride is -1.
5033 // 3. The loads are ordered, or number of unordered loads <=
5034 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5035 // (this check is to avoid extra costs for very expensive shuffles).
5036 // 4. Any pointer operand is an instruction with the users outside of the
5037 // current graph (for masked gathers extra extractelement instructions
5038 // might be required).
5039 auto IsAnyPointerUsedOutGraph =
5040 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5041 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5042 return !getTreeEntry(U) && !MustGather.contains(U);
5043 });
5044 });
5045 const unsigned AbsoluteDiff = std::abs(*Diff);
5046 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
 5047                               ((Sz > MinProfitableStridedLoads ||
 5048                                 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5049 has_single_bit(AbsoluteDiff))) &&
5050 AbsoluteDiff > Sz) ||
5051 *Diff == -(static_cast<int>(Sz) - 1))) {
5052 int Stride = *Diff / static_cast<int>(Sz - 1);
5053 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5054 Align Alignment =
5055 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5056 ->getAlign();
5057 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5058 // Iterate through all pointers and check if all distances are
5059 // unique multiple of Dist.
5060 SmallSet<int, 4> Dists;
5061 for (Value *Ptr : PointerOps) {
5062 int Dist = 0;
5063 if (Ptr == PtrN)
5064 Dist = *Diff;
5065 else if (Ptr != Ptr0)
5066 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5067 // If the strides are not the same or repeated, we can't
5068 // vectorize.
5069 if (((Dist / Stride) * Stride) != Dist ||
5070 !Dists.insert(Dist).second)
5071 break;
5072 }
5073 if (Dists.size() == Sz)
 5074               return LoadsState::StridedVectorize;
 5075
5076 }
5077 }
5078 }
 5079   // Compare the cost of loads + shuffles against strided/masked gather
 5080   // loads. Returns true if the vectorized + shuffles representation is
 5081   // better than just a gather.
5082 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5083 unsigned *BestVF,
5084 bool ProfitableGatherPointers) {
5085 if (BestVF)
5086 *BestVF = 0;
5087 // Compare masked gather cost and loads + insert subvector costs.
 5088     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 5089     auto [ScalarGEPCost, VectorGEPCost] =
5090 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5091 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5092 // Estimate the cost of masked gather GEP. If not a splat, roughly
5093 // estimate as a buildvector, otherwise estimate as splat.
5094 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5095 VectorType *PtrVecTy =
5096 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5097 VecTy->getNumElements());
5098 if (static_cast<unsigned>(count_if(
5099 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5100 any_of(PointerOps, [&](Value *V) {
5101 return getUnderlyingObject(V) !=
5102 getUnderlyingObject(PointerOps.front());
5103 }))
5104 VectorGEPCost += TTI.getScalarizationOverhead(
5105 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5106 else
 5107       VectorGEPCost +=
 5108           TTI.getScalarizationOverhead(
 5109               PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
 5110               /*Insert=*/true, /*Extract=*/false, CostKind) +
 5111           ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
 5112     // The cost of scalar loads.
5113 InstructionCost ScalarLoadsCost =
5114 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5115 [&](InstructionCost C, Value *V) {
5116 return C + TTI.getInstructionCost(
5117 cast<Instruction>(V), CostKind);
5118 }) +
5119 ScalarGEPCost;
5120 // The cost of masked gather.
5121 InstructionCost MaskedGatherCost =
 5122         TTI.getGatherScatterOpCost(
 5123             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5124 /*VariableMask=*/false, CommonAlignment, CostKind) +
5125 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5126 InstructionCost GatherCost =
5127 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5128 /*Extract=*/false, CostKind) +
5129 ScalarLoadsCost;
 5130     // The list of loads is small, or a partial check was already performed -
 5131     // directly compare the masked gather cost and the gather cost.
5132 constexpr unsigned ListLimit = 4;
5133 if (!TryRecursiveCheck || VL.size() < ListLimit)
5134 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5135
5136 // FIXME: The following code has not been updated for non-power-of-2
5137 // vectors. The splitting logic here does not cover the original
 5138     // vector if the vector factor is not a power of two.
5139 if (!has_single_bit(VL.size()))
5140 return false;
5141
5142 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5143 unsigned MinVF = getMinVF(2 * Sz);
5144 DemandedElts.clearAllBits();
5145 // Iterate through possible vectorization factors and check if vectorized +
5146 // shuffles is better than just gather.
5147 for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
 5148       SmallVector<LoadsState> States;
 5149       for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5150 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
 5151         SmallVector<unsigned> Order;
 5152         SmallVector<Value *> PointerOps;
5153 LoadsState LS =
5154 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5155 /*TryRecursiveCheck=*/false);
5156 // Check that the sorted loads are consecutive.
5157 if (LS == LoadsState::Gather) {
5158 if (BestVF) {
5159 DemandedElts.setAllBits();
5160 break;
5161 }
5162 DemandedElts.setBits(Cnt, Cnt + VF);
5163 continue;
5164 }
 5165         // If a reorder is needed - consider it a high-cost masked gather for now.
 5166         if ((LS == LoadsState::Vectorize ||
 5167              LS == LoadsState::StridedVectorize) &&
 5168             !Order.empty() && !isReverseOrder(Order))
 5169           LS = LoadsState::ScatterVectorize;
 5170         States.push_back(LS);
5171 }
5172 if (DemandedElts.isAllOnes())
5173 // All loads gathered - try smaller VF.
5174 continue;
 5175       // Can be vectorized later as a series of loads/insertelements.
5176 InstructionCost VecLdCost = 0;
5177 if (!DemandedElts.isZero()) {
5178 VecLdCost =
5179 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5180 /*Extract=*/false, CostKind) +
5181 ScalarGEPCost;
5182 for (unsigned Idx : seq<unsigned>(VL.size()))
5183 if (DemandedElts[Idx])
5184 VecLdCost +=
5185 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5186 }
5187 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5188 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5189 for (auto [I, LS] : enumerate(States)) {
5190 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5191 InstructionCost VectorGEPCost =
5192 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5193 ? 0
5194 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5195 LI0->getPointerOperand(),
5196 Instruction::GetElementPtr, CostKind, ScalarTy,
5197 SubVecTy)
5198 .second;
5199 if (LS == LoadsState::ScatterVectorize) {
5200 if (static_cast<unsigned>(
5201 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5202 PointerOps.size() - 1 ||
5203 any_of(PointerOps, [&](Value *V) {
5204 return getUnderlyingObject(V) !=
5205 getUnderlyingObject(PointerOps.front());
5206 }))
5207 VectorGEPCost += TTI.getScalarizationOverhead(
5208 SubVecTy, APInt::getAllOnes(VF),
5209 /*Insert=*/true, /*Extract=*/false, CostKind);
5210 else
5211 VectorGEPCost +=
 5212                   TTI.getScalarizationOverhead(
 5213                       SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5214 /*Insert=*/true, /*Extract=*/false, CostKind) +
5215 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5216 CostKind);
5217 }
5218 switch (LS) {
 5219       case LoadsState::Vectorize:
 5220         VecLdCost +=
5221 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5222 LI0->getPointerAddressSpace(), CostKind,
 5223                                 TTI::OperandValueInfo()) +
 5224             VectorGEPCost;
5225 break;
 5226       case LoadsState::StridedVectorize:
 5227         VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5228 LI0->getPointerOperand(),
5229 /*VariableMask=*/false,
5230 CommonAlignment, CostKind) +
5231 VectorGEPCost;
5232 break;
 5233       case LoadsState::ScatterVectorize:
 5234         VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5235 LI0->getPointerOperand(),
5236 /*VariableMask=*/false,
5237 CommonAlignment, CostKind) +
5238 VectorGEPCost;
5239 break;
5240 case LoadsState::Gather:
5241 // Gathers are already calculated - ignore.
5242 continue;
5243 }
5244 SmallVector<int> ShuffleMask(VL.size());
5245 for (int Idx : seq<int>(0, VL.size()))
5246 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5247 if (I > 0)
5248 VecLdCost +=
5249 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5250 CostKind, I * VF, SubVecTy);
5251 }
5252 // If masked gather cost is higher - better to vectorize, so
5253 // consider it as a gather node. It will be better estimated
5254 // later.
5255 if (MaskedGatherCost >= VecLdCost &&
5256 VecLdCost - GatherCost < -SLPCostThreshold) {
5257 if (BestVF)
5258 *BestVF = VF;
5259 return true;
5260 }
5261 }
5262 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5263 };
5264 // TODO: need to improve analysis of the pointers, if not all of them are
5265 // GEPs or have > 2 operands, we end up with a gather node, which just
5266 // increases the cost.
5267 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5268 bool ProfitableGatherPointers =
5269 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5270 return L->isLoopInvariant(V);
5271 })) <= Sz / 2;
5272 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5273 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5274 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5275 (GEP && GEP->getNumOperands() == 2 &&
5276 isa<Constant, Instruction>(GEP->getOperand(1)));
5277 })) {
 5278     // Check if a potential masked gather can be represented as a series
 5279     // of loads + insertsubvectors.
5280 // If masked gather cost is higher - better to vectorize, so
5281 // consider it as a gather node. It will be better estimated
5282 // later.
5283 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5284 ProfitableGatherPointers))
 5285       return LoadsState::ScatterVectorize;
 5286   }
5287
5288 return LoadsState::Gather;
5289}
5290
 5291 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
 5292                                    ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5293 const DataLayout &DL, ScalarEvolution &SE,
5294 SmallVectorImpl<unsigned> &SortedIndices) {
5295 assert(
5296 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5297 "Expected list of pointer operands.");
5298 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
5299 // Ptr into, sort and return the sorted indices with values next to one
5300 // another.
5303 Bases;
5304 Bases
5305 .try_emplace(std::make_pair(
5307 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5308
5309 SortedIndices.clear();
5310 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5311 auto Key = std::make_pair(BBs[Cnt + 1],
5313 bool Found = any_of(Bases.try_emplace(Key).first->second,
5314 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5315 std::optional<int> Diff = getPointersDiff(
5316 ElemTy, std::get<0>(Base.front()), ElemTy,
5317 Ptr, DL, SE,
5318 /*StrictCheck=*/true);
5319 if (!Diff)
5320 return false;
5321
5322 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5323 return true;
5324 });
5325
5326 if (!Found) {
5327 // If we haven't found enough to usefully cluster, return early.
5328 if (Bases.size() > VL.size() / 2 - 1)
5329 return false;
5330
5331 // Not found already - add a new Base
5332 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5333 }
5334 }
5335
5336 if (Bases.size() == VL.size())
5337 return false;
5338
5339 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5340 Bases.front().second.size() == VL.size()))
5341 return false;
5342
 5343   // For each of the bases sort the pointers by Offset and check if any of
 5344   // the bases become consecutively allocated.
5345 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5346 SmallPtrSet<Value *, 13> FirstPointers;
5347 SmallPtrSet<Value *, 13> SecondPointers;
5348 Value *P1 = Ptr1;
5349 Value *P2 = Ptr2;
5350 if (P1 == P2)
5351 return false;
5352 unsigned Depth = 0;
5353 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) &&
 5354            Depth <= RecursionMaxDepth) {
 5355       FirstPointers.insert(P1);
5356 SecondPointers.insert(P2);
5357 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5358 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5359 ++Depth;
5360 }
5361 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5362 "Unable to find matching root.");
5363 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5364 };
5365 for (auto &Base : Bases) {
5366 for (auto &Vec : Base.second) {
5367 if (Vec.size() > 1) {
5368 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5369 const std::tuple<Value *, int, unsigned> &Y) {
5370 return std::get<1>(X) < std::get<1>(Y);
5371 });
5372 int InitialOffset = std::get<1>(Vec[0]);
5373 bool AnyConsecutive =
5374 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5375 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5376 });
 5377         // Fill the SortedIndices array only if it looks worthwhile to sort
 5378         // the ptrs.
5379 if (!AnyConsecutive)
5380 return false;
5381 }
5382 }
5383 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5384 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5385 });
5386 }
5387
5388 for (auto &T : Bases)
5389 for (const auto &Vec : T.second)
5390 for (const auto &P : Vec)
5391 SortedIndices.push_back(std::get<2>(P));
5392
5393 assert(SortedIndices.size() == VL.size() &&
5394 "Expected SortedIndices to be the size of VL");
5395 return true;
5396}
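// Illustrative: for eight loads from a, b, a+1, b+1, a+2, b+2, a+3, b+3 (two
// distinct bases a and b in the same block), the pointers cluster into two
// bases with consecutive offsets 0..3 each, so the function succeeds and
// SortedIndices groups one base after the other, e.g. {0, 2, 4, 6, 1, 3, 5, 7}.
// With too many distinct bases (more than half of the list) it bails out early.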
5397
5398std::optional<BoUpSLP::OrdersType>
5399BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5400 assert(TE.isGather() && "Expected gather node only.");
5401 Type *ScalarTy = TE.Scalars[0]->getType();
5402
 5403   SmallVector<Value *> Ptrs;
 5404   Ptrs.reserve(TE.Scalars.size());
 5405   SmallVector<BasicBlock *> BBs;
 5406   BBs.reserve(TE.Scalars.size());
5407 for (Value *V : TE.Scalars) {
5408 auto *L = dyn_cast<LoadInst>(V);
5409 if (!L || !L->isSimple())
5410 return std::nullopt;
5411 Ptrs.push_back(L->getPointerOperand());
5412 BBs.push_back(L->getParent());
5413 }
5414
5415 BoUpSLP::OrdersType Order;
5416 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5417 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5418 return std::move(Order);
5419 return std::nullopt;
5420}
5421
5422/// Check if two insertelement instructions are from the same buildvector.
 5423 static bool areTwoInsertFromSameBuildVector(
 5424     InsertElementInst *VU, InsertElementInst *V,
 5425     function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5426 // Instructions must be from the same basic blocks.
5427 if (VU->getParent() != V->getParent())
5428 return false;
5429 // Checks if 2 insertelements are from the same buildvector.
5430 if (VU->getType() != V->getType())
5431 return false;
5432 // Multiple used inserts are separate nodes.
5433 if (!VU->hasOneUse() && !V->hasOneUse())
5434 return false;
5435 auto *IE1 = VU;
5436 auto *IE2 = V;
5437 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5438 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5439 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5440 return false;
5441 // Go through the vector operand of insertelement instructions trying to find
5442 // either VU as the original vector for IE2 or V as the original vector for
5443 // IE1.
5444 SmallBitVector ReusedIdx(
5445 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5446 bool IsReusedIdx = false;
5447 do {
5448 if (IE2 == VU && !IE1)
5449 return VU->hasOneUse();
5450 if (IE1 == V && !IE2)
5451 return V->hasOneUse();
5452 if (IE1 && IE1 != V) {
5453 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5454 IsReusedIdx |= ReusedIdx.test(Idx1);
5455 ReusedIdx.set(Idx1);
5456 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5457 IE1 = nullptr;
5458 else
5459 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5460 }
5461 if (IE2 && IE2 != VU) {
5462 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5463 IsReusedIdx |= ReusedIdx.test(Idx2);
5464 ReusedIdx.set(Idx2);
5465 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5466 IE2 = nullptr;
5467 else
5468 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5469 }
5470 } while (!IsReusedIdx && (IE1 || IE2));
5471 return false;
5472}
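// Illustrative: for the single-use chain
//   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <2 x i32> %v0,    i32 %b, i32 1
// the walk from VU = %v1 reaches V = %v0 through GetBaseOperand without
// revisiting an insertion index, so the two inserts are recognized as parts of
// the same buildvector; reusing an index or leaving the single-use chain makes
// the function return false.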
5473
5474std::optional<BoUpSLP::OrdersType>
5475BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
 5476   // No need to reorder if we need to shuffle reuses; we still need to shuffle
 5477   // the node.
5478 if (!TE.ReuseShuffleIndices.empty()) {
5479 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5480 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5481 "Reshuffling scalars not yet supported for nodes with padding");
5482
5483 if (isSplat(TE.Scalars))
5484 return std::nullopt;
5485 // Check if reuse shuffle indices can be improved by reordering.
 5486     // For this, check that the reuse mask is "clustered", i.e. each scalar
 5487     // value is used once in each submask of size <number_of_scalars>.
5488 // Example: 4 scalar values.
5489 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5490 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5491 // element 3 is used twice in the second submask.
5492 unsigned Sz = TE.Scalars.size();
5493 if (TE.isGather()) {
5494 if (std::optional<OrdersType> CurrentOrder =
 5495             findReusedOrderedScalars(TE)) {
 5496       SmallVector<int> Mask;
5497 fixupOrderingIndices(*CurrentOrder);
5498 inversePermutation(*CurrentOrder, Mask);
5499 ::addMask(Mask, TE.ReuseShuffleIndices);
5500 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5501 unsigned Sz = TE.Scalars.size();
5502 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5503 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5504 if (Idx != PoisonMaskElem)
5505 Res[Idx + K * Sz] = I + K * Sz;
5506 }
5507 return std::move(Res);
5508 }
5509 }
5510 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5511 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5512 2 * TE.getVectorFactor())) == 1)
5513 return std::nullopt;
5514 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5515 Sz)) {
5516 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5517 if (TE.ReorderIndices.empty())
5518 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5519 else
5520 inversePermutation(TE.ReorderIndices, ReorderMask);
5521 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5522 unsigned VF = ReorderMask.size();
5523 OrdersType ResOrder(VF, VF);
5524 unsigned NumParts = divideCeil(VF, Sz);
5525 SmallBitVector UsedVals(NumParts);
5526 for (unsigned I = 0; I < VF; I += Sz) {
5527 int Val = PoisonMaskElem;
5528 unsigned UndefCnt = 0;
5529 unsigned Limit = std::min(Sz, VF - I);
5530 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5531 [&](int Idx) {
5532 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5533 Val = Idx;
5534 if (Idx == PoisonMaskElem)
5535 ++UndefCnt;
5536 return Idx != PoisonMaskElem && Idx != Val;
5537 }) ||
5538 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5539 UndefCnt > Sz / 2)
5540 return std::nullopt;
5541 UsedVals.set(Val);
5542 for (unsigned K = 0; K < NumParts; ++K) {
5543 unsigned Idx = Val + Sz * K;
5544 if (Idx < VF)
5545 ResOrder[Idx] = I + K;
5546 }
5547 }
5548 return std::move(ResOrder);
5549 }
5550 unsigned VF = TE.getVectorFactor();
5551 // Try build correct order for extractelement instructions.
5552 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5553 TE.ReuseShuffleIndices.end());
5554 if (TE.getOpcode() == Instruction::ExtractElement &&
5555 all_of(TE.Scalars, [Sz](Value *V) {
5556 if (isa<PoisonValue>(V))
5557 return true;
5558 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5559 return Idx && *Idx < Sz;
5560 })) {
5561 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5562 "by BinaryOperator and CastInst.");
5563 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5564 if (TE.ReorderIndices.empty())
5565 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5566 else
5567 inversePermutation(TE.ReorderIndices, ReorderMask);
5568 for (unsigned I = 0; I < VF; ++I) {
5569 int &Idx = ReusedMask[I];
5570 if (Idx == PoisonMaskElem)
5571 continue;
5572 Value *V = TE.Scalars[ReorderMask[Idx]];
5573 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5574 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5575 }
5576 }
 5577     // Build the order of the VF size; we need to reorder the reuses shuffles
 5578     // since they are always of VF size.
5579 OrdersType ResOrder(VF);
5580 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5581 auto *It = ResOrder.begin();
5582 for (unsigned K = 0; K < VF; K += Sz) {
5583 OrdersType CurrentOrder(TE.ReorderIndices);
5584 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5585 if (SubMask.front() == PoisonMaskElem)
5586 std::iota(SubMask.begin(), SubMask.end(), 0);
5587 reorderOrder(CurrentOrder, SubMask);
5588 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5589 std::advance(It, Sz);
5590 }
5591 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5592 return Data.index() == Data.value();
5593 }))
5594 return std::nullopt; // No need to reorder.
5595 return std::move(ResOrder);
5596 }
5597 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5598 any_of(TE.UserTreeIndices,
5599 [](const EdgeInfo &EI) {
5600 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5601 }) &&
5602 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5603 return std::nullopt;
5604 if ((TE.State == TreeEntry::Vectorize ||
5605 TE.State == TreeEntry::StridedVectorize) &&
5606 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5607 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5608 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5609 "BinaryOperator and CastInst.");
5610 return TE.ReorderIndices;
5611 }
5612 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5613 if (!TE.ReorderIndices.empty())
5614 return TE.ReorderIndices;
5615
5616 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5617 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5618 if (!V->hasNUsesOrMore(1))
5619 continue;
5620 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5621 if (!II)
5622 continue;
5623 Instruction *BVHead = nullptr;
5624 BasicBlock *BB = II->getParent();
5625 while (II && II->hasOneUse() && II->getParent() == BB) {
5626 BVHead = II;
5627 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5628 }
5629 I = BVHead;
5630 }
5631
5632 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5633 assert(BB1 != BB2 && "Expected different basic blocks.");
5634 auto *NodeA = DT->getNode(BB1);
5635 auto *NodeB = DT->getNode(BB2);
5636 assert(NodeA && "Should only process reachable instructions");
5637 assert(NodeB && "Should only process reachable instructions");
5638 assert((NodeA == NodeB) ==
5639 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5640 "Different nodes should have different DFS numbers");
5641 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5642 };
5643 auto PHICompare = [&](unsigned I1, unsigned I2) {
5644 Value *V1 = TE.Scalars[I1];
5645 Value *V2 = TE.Scalars[I2];
5646 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5647 return false;
5648 if (isa<PoisonValue>(V1))
5649 return true;
5650 if (isa<PoisonValue>(V2))
5651 return false;
5652 if (V1->getNumUses() < V2->getNumUses())
5653 return true;
5654 if (V1->getNumUses() > V2->getNumUses())
5655 return false;
5656 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5657 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5658 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5659 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5660 FirstUserOfPhi2->getParent());
5661 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5662 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5663 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5664 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5665 if (IE1 && !IE2)
5666 return true;
5667 if (!IE1 && IE2)
5668 return false;
5669 if (IE1 && IE2) {
5670 if (UserBVHead[I1] && !UserBVHead[I2])
5671 return true;
5672 if (!UserBVHead[I1])
5673 return false;
5674 if (UserBVHead[I1] == UserBVHead[I2])
5675 return getElementIndex(IE1) < getElementIndex(IE2);
5676 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5677 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5678 UserBVHead[I2]->getParent());
5679 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5680 }
5681 if (EE1 && !EE2)
5682 return true;
5683 if (!EE1 && EE2)
5684 return false;
5685 if (EE1 && EE2) {
5686 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5687 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5688 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5689 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5690 if (!Inst2 && !P2)
5691 return Inst1 || P1;
5692 if (EE1->getOperand(0) == EE2->getOperand(0))
5693 return getElementIndex(EE1) < getElementIndex(EE2);
5694 if (!Inst1 && Inst2)
5695 return false;
5696 if (Inst1 && Inst2) {
5697 if (Inst1->getParent() != Inst2->getParent())
5698 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5699 return Inst1->comesBefore(Inst2);
5700 }
5701 if (!P1 && P2)
5702 return false;
5703 assert(P1 && P2 &&
5704 "Expected either instructions or arguments vector operands.");
5705 return P1->getArgNo() < P2->getArgNo();
5706 }
5707 return false;
5708 };
5709 OrdersType Phis(TE.Scalars.size());
5710 std::iota(Phis.begin(), Phis.end(), 0);
5711 stable_sort(Phis, PHICompare);
5712 if (isIdentityOrder(Phis))
5713 return std::nullopt; // No need to reorder.
5714 return std::move(Phis);
5715 }
5716 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5717 // TODO: add analysis of other gather nodes with extractelement
5718 // instructions and other values/instructions, not only undefs.
5719 if ((TE.getOpcode() == Instruction::ExtractElement ||
5720 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5721 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5722 all_of(TE.Scalars, [](Value *V) {
5723 auto *EE = dyn_cast<ExtractElementInst>(V);
5724 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5725 })) {
5726 // Check that gather of extractelements can be represented as
5727 // just a shuffle of a single vector.
5728 OrdersType CurrentOrder;
5729 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5730 /*ResizeAllowed=*/true);
5731 if (Reuse || !CurrentOrder.empty())
5732 return std::move(CurrentOrder);
5733 }
5734 // If the gather node is <undef, v, .., poison> and
5735 // insertelement poison, v, 0 [+ permute]
5736 // is cheaper than
5737 // insertelement poison, v, n - try to reorder.
5738 // If rotating the whole graph, exclude the permute cost, the whole graph
5739 // might be transformed.
5740 int Sz = TE.Scalars.size();
5741 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5742 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5743 const auto *It =
5744 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5745 if (It == TE.Scalars.begin())
5746 return OrdersType();
5747 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5748 if (It != TE.Scalars.end()) {
5749 OrdersType Order(Sz, Sz);
5750 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5751 Order[Idx] = 0;
5752 fixupOrderingIndices(Order);
5753 SmallVector<int> Mask;
5754 inversePermutation(Order, Mask);
5755 InstructionCost PermuteCost =
5756 TopToBottom
5757 ? 0
 5758               : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
 5759       InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5760 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5761 PoisonValue::get(Ty), *It);
5762 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5763 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5764 PoisonValue::get(Ty), *It);
5765 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5766 OrdersType Order(Sz, Sz);
5767 Order[Idx] = 0;
5768 return std::move(Order);
5769 }
5770 }
5771 }
5772 if (isSplat(TE.Scalars))
5773 return std::nullopt;
5774 if (TE.Scalars.size() >= 3)
5775 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5776 return Order;
 5777   // Check if we can include the order of vectorized loads. For masked gathers
 5778   // we do extra analysis later, so include such nodes in a special list.
5779 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5780 SmallVector<Value *> PointerOps;
5781 OrdersType CurrentOrder;
5782 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5783 CurrentOrder, PointerOps);
 5784     if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
 5785       return std::move(CurrentOrder);
5786 }
5787 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
 5788   // has been audited for correctness with non-power-of-two vectors.
5789 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5790 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5791 return CurrentOrder;
5792 }
5793 return std::nullopt;
5794}
5795
5796/// Checks if the given mask is a "clustered" mask with the same clusters of
5797/// size \p Sz, which are not identity submasks.
 5798 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
 5799                                                unsigned Sz) {
5800 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5801 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5802 return false;
5803 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5804 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5805 if (Cluster != FirstCluster)
5806 return false;
5807 }
5808 return true;
5809}
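// Illustrative: with Sz == 4, the mask {1, 0, 3, 2, 1, 0, 3, 2} is a repeated
// non-identity cluster (returns true), while {0, 1, 2, 3, 0, 1, 2, 3} fails
// the non-identity requirement and {1, 0, 3, 2, 2, 3, 0, 1} fails the
// repetition requirement (both return false).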
5810
5811void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5812 // Reorder reuses mask.
5813 reorderReuses(TE.ReuseShuffleIndices, Mask);
5814 const unsigned Sz = TE.Scalars.size();
 5815   // For vectorized nodes and non-clustered reuses, no need to do anything else.
5816 if (!TE.isGather() ||
 5817       !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
 5818                                                    Sz) ||
5819 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5820 return;
5821 SmallVector<int> NewMask;
5822 inversePermutation(TE.ReorderIndices, NewMask);
5823 addMask(NewMask, TE.ReuseShuffleIndices);
5824 // Clear reorder since it is going to be applied to the new mask.
5825 TE.ReorderIndices.clear();
5826 // Try to improve gathered nodes with clustered reuses, if possible.
5827 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5828 SmallVector<unsigned> NewOrder(Slice);
5829 inversePermutation(NewOrder, NewMask);
5830 reorderScalars(TE.Scalars, NewMask);
5831 // Fill the reuses mask with the identity submasks.
5832 for (auto *It = TE.ReuseShuffleIndices.begin(),
5833 *End = TE.ReuseShuffleIndices.end();
5834 It != End; std::advance(It, Sz))
5835 std::iota(It, std::next(It, Sz), 0);
5836}
5837
 5838 static void combineOrders(MutableArrayRef<unsigned> Order,
 5839                           ArrayRef<unsigned> SecondaryOrder) {
5840 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5841 "Expected same size of orders");
5842 unsigned Sz = Order.size();
5843 SmallBitVector UsedIndices(Sz);
5844 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5845 if (Order[Idx] != Sz)
5846 UsedIndices.set(Order[Idx]);
5847 }
5848 if (SecondaryOrder.empty()) {
5849 for (unsigned Idx : seq<unsigned>(0, Sz))
5850 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5851 Order[Idx] = Idx;
5852 } else {
5853 for (unsigned Idx : seq<unsigned>(0, Sz))
5854 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5855 !UsedIndices.test(SecondaryOrder[Idx]))
5856 Order[Idx] = SecondaryOrder[Idx];
5857 }
5858}
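// Worked example (illustrative): for Sz == 4, Order = {2, 4, 1, 4} (where 4
// marks an unset slot) and an empty SecondaryOrder, slot 3 is filled with its
// own index (giving {2, 4, 1, 3}) while slot 1 stays unset because index 1 is
// already used by another slot.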
5859
 5860 void BoUpSLP::reorderTopToBottom() {
 5861   // Maps VF to the graph nodes.
5863 // ExtractElement gather nodes which can be vectorized and need to handle
5864 // their ordering.
 5865   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
 5866
5867 // Phi nodes can have preferred ordering based on their result users
 5868   DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
 5869
5870 // AltShuffles can also have a preferred ordering that leads to fewer
5871 // instructions, e.g., the addsub instruction in x86.
5872 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5873
5874 // Maps a TreeEntry to the reorder indices of external users.
 5875   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
 5876       ExternalUserReorderMap;
5877 // Find all reorderable nodes with the given VF.
 5878   // Currently these are vectorized stores, loads, extracts + some gathering
 5879   // of extracts.
5880 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5881 const std::unique_ptr<TreeEntry> &TE) {
5882 // Look for external users that will probably be vectorized.
5883 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5884 findExternalStoreUsersReorderIndices(TE.get());
5885 if (!ExternalUserReorderIndices.empty()) {
5886 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5887 ExternalUserReorderMap.try_emplace(TE.get(),
5888 std::move(ExternalUserReorderIndices));
5889 }
5890
5891 // Patterns like [fadd,fsub] can be combined into a single instruction in
5892 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5893 // to take into account their order when looking for the most used order.
5894 if (TE->isAltShuffle()) {
5895 VectorType *VecTy =
5896 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5897 unsigned Opcode0 = TE->getOpcode();
5898 unsigned Opcode1 = TE->getAltOpcode();
5899 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5900 // If this pattern is supported by the target then we consider the order.
5901 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5902 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5903 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5904 }
5905 // TODO: Check the reverse order too.
5906 }
5907
5908 if (std::optional<OrdersType> CurrentOrder =
5909 getReorderingData(*TE, /*TopToBottom=*/true)) {
 5910       // Do not include ordering for nodes used in the alt opcode
 5911       // vectorization; it is better to reorder them during the bottom-to-top
 5912       // stage. If we follow the order here, it causes reordering of the whole
 5913       // graph, though it is actually profitable just to reorder the subgraph
 5914       // that starts from the alternate opcode vectorization node. Such nodes
 5915       // already end up with a shuffle instruction, and it is enough to change
 5916       // this shuffle rather than rotate the scalars for the whole graph.
5917 unsigned Cnt = 0;
5918 const TreeEntry *UserTE = TE.get();
5919 while (UserTE && Cnt < RecursionMaxDepth) {
5920 if (UserTE->UserTreeIndices.size() != 1)
5921 break;
5922 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5923 return EI.UserTE->State == TreeEntry::Vectorize &&
5924 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5925 }))
5926 return;
5927 UserTE = UserTE->UserTreeIndices.back().UserTE;
5928 ++Cnt;
5929 }
5930 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5931 if (!(TE->State == TreeEntry::Vectorize ||
5932 TE->State == TreeEntry::StridedVectorize) ||
5933 !TE->ReuseShuffleIndices.empty())
5934 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5935 if (TE->State == TreeEntry::Vectorize &&
5936 TE->getOpcode() == Instruction::PHI)
5937 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5938 }
5939 });
5940
5941 // Reorder the graph nodes according to their vectorization factor.
5942 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5943 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5944 auto It = VFToOrderedEntries.find(VF);
5945 if (It == VFToOrderedEntries.end())
5946 continue;
 5947     // Try to find the most profitable order. We are just looking for the most
 5948     // used order and reorder the scalar elements in the nodes according to
 5949     // this most used order.
5950 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5951 // Delete VF entry upon exit.
5952 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5953
5954 // All operands are reordered and used only in this node - propagate the
5955 // most used order to the user node.
5958 OrdersUses;
5960 for (const TreeEntry *OpTE : OrderedEntries) {
 5961       // No need to reorder these nodes; we still need to extend and to use a
 5962       // shuffle, just merge the reordering shuffle and the reuse shuffle.
5963 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5964 continue;
5965 // Count number of orders uses.
5966 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5967 &PhisToOrders]() -> const OrdersType & {
5968 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5969 auto It = GathersToOrders.find(OpTE);
5970 if (It != GathersToOrders.end())
5971 return It->second;
5972 }
5973 if (OpTE->isAltShuffle()) {
5974 auto It = AltShufflesToOrders.find(OpTE);
5975 if (It != AltShufflesToOrders.end())
5976 return It->second;
5977 }
5978 if (OpTE->State == TreeEntry::Vectorize &&
5979 OpTE->getOpcode() == Instruction::PHI) {
5980 auto It = PhisToOrders.find(OpTE);
5981 if (It != PhisToOrders.end())
5982 return It->second;
5983 }
5984 return OpTE->ReorderIndices;
5985 }();
5986 // First consider the order of the external scalar users.
5987 auto It = ExternalUserReorderMap.find(OpTE);
5988 if (It != ExternalUserReorderMap.end()) {
5989 const auto &ExternalUserReorderIndices = It->second;
5990 // If the OpTE vector factor != number of scalars - use natural order,
5991 // it is an attempt to reorder node with reused scalars but with
5992 // external uses.
5993 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5994 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5995 ExternalUserReorderIndices.size();
5996 } else {
5997 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5998 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5999 }
6000 // No other useful reorder data in this entry.
6001 if (Order.empty())
6002 continue;
6003 }
6004 // Stores actually store the mask, not the order, need to invert.
6005 if (OpTE->State == TreeEntry::Vectorize &&
6006 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6007 assert(!OpTE->isAltShuffle() &&
6008 "Alternate instructions are only supported by BinaryOperator "
6009 "and CastInst.");
6010 SmallVector<int> Mask;
6011 inversePermutation(Order, Mask);
6012 unsigned E = Order.size();
6013 OrdersType CurrentOrder(E, E);
6014 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6015 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6016 });
6017 fixupOrderingIndices(CurrentOrder);
6018 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6019 } else {
6020 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6021 }
6022 }
6023 if (OrdersUses.empty())
6024 continue;
6025 // Choose the most used order.
6026 unsigned IdentityCnt = 0;
6027 unsigned FilledIdentityCnt = 0;
6028 OrdersType IdentityOrder(VF, VF);
6029 for (auto &Pair : OrdersUses) {
6030 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6031 if (!Pair.first.empty())
6032 FilledIdentityCnt += Pair.second;
6033 IdentityCnt += Pair.second;
6034 combineOrders(IdentityOrder, Pair.first);
6035 }
6036 }
6037 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6038 unsigned Cnt = IdentityCnt;
6039 for (auto &Pair : OrdersUses) {
6040 // Prefer the identity order. But if a filled identity (non-empty order)
6041 // was found with the same number of uses as the new candidate order, we
6042 // can choose this candidate order.
6043 if (Cnt < Pair.second ||
6044 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6045 Cnt == Pair.second && !BestOrder.empty() &&
6046 isIdentityOrder(BestOrder))) {
6047 combineOrders(Pair.first, BestOrder);
6048 BestOrder = Pair.first;
6049 Cnt = Pair.second;
6050 } else {
6051 combineOrders(BestOrder, Pair.first);
6052 }
6053 }
6054 // Set order of the user node.
6055 if (isIdentityOrder(BestOrder))
6056 continue;
6057 fixupOrderingIndices(BestOrder);
6058 SmallVector<int> Mask;
6059 inversePermutation(BestOrder, Mask);
6060 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6061 unsigned E = BestOrder.size();
6062 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6063 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6064 });
6065 // Do an actual reordering, if profitable.
6066 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6067 // Just do the reordering for the nodes with the given VF.
6068 if (TE->Scalars.size() != VF) {
6069 if (TE->ReuseShuffleIndices.size() == VF) {
6070 // Need to reorder the reuses masks of the operands with smaller VF to
6071 // be able to find the match between the graph nodes and scalar
6072 // operands of the given node during vectorization/cost estimation.
6073 assert(all_of(TE->UserTreeIndices,
6074 [VF, &TE](const EdgeInfo &EI) {
6075 return EI.UserTE->Scalars.size() == VF ||
6076 EI.UserTE->Scalars.size() ==
6077 TE->Scalars.size();
6078 }) &&
6079 "All users must be of VF size.");
6080 if (SLPReVec) {
6081 assert(SLPReVec && "Only supported by REVEC.");
6082 // ShuffleVectorInst does not do reorderOperands (and it should not
6083 // because ShuffleVectorInst supports only a limited set of
6084 // patterns). Only do reorderNodeWithReuses if all of the users are
6085 // not ShuffleVectorInst.
6086 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6087 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6088 }))
6089 continue;
6090 assert(none_of(TE->UserTreeIndices,
6091 [&](const EdgeInfo &EI) {
6092 return isa<ShuffleVectorInst>(
6093 EI.UserTE->getMainOp());
6094 }) &&
6095 "Does not know how to reorder.");
6096 }
6097 // Update ordering of the operands with the smaller VF than the given
6098 // one.
6099 reorderNodeWithReuses(*TE, Mask);
6100 }
6101 continue;
6102 }
6103 if ((TE->State == TreeEntry::Vectorize ||
6104 TE->State == TreeEntry::StridedVectorize) &&
6105 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
6106 InsertElementInst>(TE->getMainOp()) ||
6107 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6108 assert(!TE->isAltShuffle() &&
6109 "Alternate instructions are only supported by BinaryOperator "
6110 "and CastInst.");
6111 // Build correct orders for extract{element,value}, loads and
6112 // stores.
6113 reorderOrder(TE->ReorderIndices, Mask);
6114 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6115 TE->reorderOperands(Mask);
6116 } else {
6117 // Reorder the node and its operands.
6118 TE->reorderOperands(Mask);
6119 assert(TE->ReorderIndices.empty() &&
6120 "Expected empty reorder sequence.");
6121 reorderScalars(TE->Scalars, Mask);
6122 }
6123 if (!TE->ReuseShuffleIndices.empty()) {
6124 // Apply reversed order to keep the original ordering of the reused
6125 // elements to avoid extra reorder indices shuffling.
6126 OrdersType CurrentOrder;
6127 reorderOrder(CurrentOrder, MaskOrder);
6128 SmallVector<int> NewReuses;
6129 inversePermutation(CurrentOrder, NewReuses);
6130 addMask(NewReuses, TE->ReuseShuffleIndices);
6131 TE->ReuseShuffleIndices.swap(NewReuses);
6132 }
6133 }
6134 }
6135}
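// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// The reordering above repeatedly inverts a chosen order into a shuffle mask
// and then applies that mask to scalars, reuse indices and operands. Below is
// a minimal standalone sketch of that order/mask relationship, using plain
// STL types; `invertPermutation` and `applyScatterMask` are hypothetical
// helpers, not LLVM APIs, and the lane conventions are simplified.
#if 0
#include <cstddef>
#include <cstdio>
#include <vector>

// Invert a permutation: Inv[Perm[I]] = I.
static std::vector<int> invertPermutation(const std::vector<int> &Perm) {
  std::vector<int> Inv(Perm.size(), -1);
  for (int I = 0, E = static_cast<int>(Perm.size()); I < E; ++I)
    Inv[Perm[I]] = I;
  return Inv;
}

// Scatter-style application of a mask: Out[Mask[I]] = In[I].
static std::vector<int> applyScatterMask(const std::vector<int> &In,
                                         const std::vector<int> &Mask) {
  std::vector<int> Out(In.size(), -1);
  for (std::size_t I = 0; I < In.size(); ++I)
    Out[Mask[I]] = In[I];
  return Out;
}

int main() {
  std::vector<int> Order = {2, 0, 3, 1};
  std::vector<int> Mask = invertPermutation(Order); // {1, 3, 0, 2}
  std::vector<int> Scalars = {10, 11, 12, 13};
  std::vector<int> Out = applyScatterMask(Scalars, Mask);
  // Out[I] == Scalars[Order[I]]: scattering by the inverted order implements
  // a gather by the original order, here {12, 10, 13, 11}.
  for (int V : Out)
    std::printf("%d ", V);
  std::printf("\n");
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------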
6136
6137bool BoUpSLP::canReorderOperands(
6138 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6139 ArrayRef<TreeEntry *> ReorderableGathers,
6140 SmallVectorImpl<TreeEntry *> &GatherOps) {
6141 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6142 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6143 return OpData.first == I &&
6144 (OpData.second->State == TreeEntry::Vectorize ||
6145 OpData.second->State == TreeEntry::StridedVectorize);
6146 }))
6147 continue;
6148 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6149 // Do not reorder if operand node is used by many user nodes.
6150 if (any_of(TE->UserTreeIndices,
6151 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6152 return false;
6153 // Add the node to the list of the ordered nodes with the identity
6154 // order.
6155 Edges.emplace_back(I, TE);
6156 // Add ScatterVectorize nodes to the list of operands, where just
6157 // reordering of the scalars is required. Similar to the gathers, so
6158 // simply add to the list of gathered ops.
6159 // If there are reused scalars, process this node as a regular vectorize
6160 // node, just reorder reuses mask.
6161 if (TE->State != TreeEntry::Vectorize &&
6162 TE->State != TreeEntry::StridedVectorize &&
6163 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6164 GatherOps.push_back(TE);
6165 continue;
6166 }
6167 TreeEntry *Gather = nullptr;
6168 if (count_if(ReorderableGathers,
6169 [&Gather, UserTE, I](TreeEntry *TE) {
6170 assert(TE->State != TreeEntry::Vectorize &&
6171 TE->State != TreeEntry::StridedVectorize &&
6172 "Only non-vectorized nodes are expected.");
6173 if (any_of(TE->UserTreeIndices,
6174 [UserTE, I](const EdgeInfo &EI) {
6175 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6176 })) {
6177 assert(TE->isSame(UserTE->getOperand(I)) &&
6178 "Operand entry does not match operands.");
6179 Gather = TE;
6180 return true;
6181 }
6182 return false;
6183 }) > 1 &&
6184 !allConstant(UserTE->getOperand(I)))
6185 return false;
6186 if (Gather)
6187 GatherOps.push_back(Gather);
6188 }
6189 return true;
6190}
6191
6192void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6193 SetVector<TreeEntry *> OrderedEntries;
6194 DenseSet<const TreeEntry *> GathersToOrders;
6195 // Find all reorderable leaf nodes with the given VF.
6196 // Currently these are vectorized loads, extracts without alternate
6197 // operands, plus some gathering of extracts.
6198 SmallVector<TreeEntry *> NonVectorized;
6199 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6200 if (TE->State != TreeEntry::Vectorize &&
6201 TE->State != TreeEntry::StridedVectorize)
6202 NonVectorized.push_back(TE.get());
6203 if (std::optional<OrdersType> CurrentOrder =
6204 getReorderingData(*TE, /*TopToBottom=*/false)) {
6205 OrderedEntries.insert(TE.get());
6206 if (!(TE->State == TreeEntry::Vectorize ||
6207 TE->State == TreeEntry::StridedVectorize) ||
6208 !TE->ReuseShuffleIndices.empty())
6209 GathersToOrders.insert(TE.get());
6210 }
6211 }
6212
6213 // 1. Propagate order to the graph nodes, which use only reordered nodes.
6214 // I.e., if the node has operands that are reordered, try to keep at least
6215 // one operand in the natural order and reorder the others + reorder the
6216 // user node itself.
6217 SmallPtrSet<const TreeEntry *, 4> Visited;
6218 while (!OrderedEntries.empty()) {
6219 // 1. Filter out only reordered nodes.
6220 // 2. If the entry has multiple uses - skip it and jump to the next node.
6221 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
6222 SmallVector<TreeEntry *> Filtered;
6223 for (TreeEntry *TE : OrderedEntries) {
6224 if (!(TE->State == TreeEntry::Vectorize ||
6225 TE->State == TreeEntry::StridedVectorize ||
6226 (TE->isGather() && GathersToOrders.contains(TE))) ||
6227 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6228 !all_of(drop_begin(TE->UserTreeIndices),
6229 [TE](const EdgeInfo &EI) {
6230 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6231 }) ||
6232 !Visited.insert(TE).second) {
6233 Filtered.push_back(TE);
6234 continue;
6235 }
6236 // Build a map between user nodes and their operand order to speed up the
6237 // search. The graph currently does not provide this dependency directly.
6238 for (EdgeInfo &EI : TE->UserTreeIndices)
6239 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6240 }
6241 // Erase filtered entries.
6242 for (TreeEntry *TE : Filtered)
6243 OrderedEntries.remove(TE);
6244 SmallVector<
6245 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6246 UsersVec(Users.begin(), Users.end());
6247 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6248 return Data1.first->Idx > Data2.first->Idx;
6249 });
6250 for (auto &Data : UsersVec) {
6251 // Check that operands are used only in the User node.
6252 SmallVector<TreeEntry *> GatherOps;
6253 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6254 GatherOps)) {
6255 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6256 OrderedEntries.remove(Op.second);
6257 continue;
6258 }
6259 // All operands are reordered and used only in this node - propagate the
6260 // most used order to the user node.
6261 MapVector<OrdersType, unsigned,
6262 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6263 OrdersUses;
6264 // Do the analysis for each tree entry only once, otherwise the order of
6265 // the same node may be considered several times, even though it might
6266 // not be profitable.
6267 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6268 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6269 for (const auto &Op : Data.second) {
6270 TreeEntry *OpTE = Op.second;
6271 if (!VisitedOps.insert(OpTE).second)
6272 continue;
6273 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6274 continue;
6275 const auto Order = [&]() -> const OrdersType {
6276 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6277 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6278 .value_or(OrdersType(1));
6279 return OpTE->ReorderIndices;
6280 }();
6281 // The order is partially ordered, skip it in favor of fully non-ordered
6282 // orders.
6283 if (Order.size() == 1)
6284 continue;
6285 unsigned NumOps = count_if(
6286 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6287 return P.second == OpTE;
6288 });
6289 // Stores actually store the mask, not the order; we need to invert it.
6290 if (OpTE->State == TreeEntry::Vectorize &&
6291 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6292 assert(!OpTE->isAltShuffle() &&
6293 "Alternate instructions are only supported by BinaryOperator "
6294 "and CastInst.");
6295 SmallVector<int> Mask;
6296 inversePermutation(Order, Mask);
6297 unsigned E = Order.size();
6298 OrdersType CurrentOrder(E, E);
6299 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6300 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6301 });
6302 fixupOrderingIndices(CurrentOrder);
6303 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6304 NumOps;
6305 } else {
6306 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6307 }
6308 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6309 const auto AllowsReordering = [&](const TreeEntry *TE) {
6310 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6311 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6312 (IgnoreReorder && TE->Idx == 0))
6313 return true;
6314 if (TE->isGather()) {
6315 if (GathersToOrders.contains(TE))
6316 return !getReorderingData(*TE, /*TopToBottom=*/false)
6317 .value_or(OrdersType(1))
6318 .empty();
6319 return true;
6320 }
6321 return false;
6322 };
6323 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6324 TreeEntry *UserTE = EI.UserTE;
6325 if (!VisitedUsers.insert(UserTE).second)
6326 continue;
6327 // May reorder user node if it requires reordering, has reused
6328 // scalars, is an alternate op vectorize node or its op nodes require
6329 // reordering.
6330 if (AllowsReordering(UserTE))
6331 continue;
6332 // Check if users allow reordering.
6333 // Currently look up just 1 level of operands to avoid increase of
6334 // the compile time.
6335 // It is profitable to reorder if definitely more operands allow
6336 // reordering than operands with the natural order.
6337 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
6338 if (static_cast<unsigned>(count_if(
6339 Ops, [UserTE, &AllowsReordering](
6340 const std::pair<unsigned, TreeEntry *> &Op) {
6341 return AllowsReordering(Op.second) &&
6342 all_of(Op.second->UserTreeIndices,
6343 [UserTE](const EdgeInfo &EI) {
6344 return EI.UserTE == UserTE;
6345 });
6346 })) <= Ops.size() / 2)
6347 ++Res.first->second;
6348 }
6349 }
6350 if (OrdersUses.empty()) {
6351 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6352 OrderedEntries.remove(Op.second);
6353 continue;
6354 }
6355 // Choose the most used order.
6356 unsigned IdentityCnt = 0;
6357 unsigned VF = Data.second.front().second->getVectorFactor();
6358 OrdersType IdentityOrder(VF, VF);
6359 for (auto &Pair : OrdersUses) {
6360 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6361 IdentityCnt += Pair.second;
6362 combineOrders(IdentityOrder, Pair.first);
6363 }
6364 }
6365 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6366 unsigned Cnt = IdentityCnt;
6367 for (auto &Pair : OrdersUses) {
6368 // Prefer the identity order. But if a filled identity (non-empty
6369 // order) was found with the same number of uses as the new candidate
6370 // order, we can choose this candidate order.
6371 if (Cnt < Pair.second) {
6372 combineOrders(Pair.first, BestOrder);
6373 BestOrder = Pair.first;
6374 Cnt = Pair.second;
6375 } else {
6376 combineOrders(BestOrder, Pair.first);
6377 }
6378 }
6379 // Set order of the user node.
6380 if (isIdentityOrder(BestOrder)) {
6381 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6382 OrderedEntries.remove(Op.second);
6383 continue;
6384 }
6385 fixupOrderingIndices(BestOrder);
6386 // Erase operands from OrderedEntries list and adjust their orders.
6387 VisitedOps.clear();
6388 SmallVector<int> Mask;
6389 inversePermutation(BestOrder, Mask);
6390 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6391 unsigned E = BestOrder.size();
6392 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6393 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6394 });
6395 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6396 TreeEntry *TE = Op.second;
6397 OrderedEntries.remove(TE);
6398 if (!VisitedOps.insert(TE).second)
6399 continue;
6400 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6401 reorderNodeWithReuses(*TE, Mask);
6402 continue;
6403 }
6404 // Gathers are processed separately.
6405 if (TE->State != TreeEntry::Vectorize &&
6406 TE->State != TreeEntry::StridedVectorize &&
6407 (TE->State != TreeEntry::ScatterVectorize ||
6408 TE->ReorderIndices.empty()))
6409 continue;
6410 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6411 TE->ReorderIndices.empty()) &&
6412 "Non-matching sizes of user/operand entries.");
6413 reorderOrder(TE->ReorderIndices, Mask);
6414 if (IgnoreReorder && TE == VectorizableTree.front().get())
6415 IgnoreReorder = false;
6416 }
6417 // For gathers, we just need to reorder their scalars.
6418 for (TreeEntry *Gather : GatherOps) {
6419 assert(Gather->ReorderIndices.empty() &&
6420 "Unexpected reordering of gathers.");
6421 if (!Gather->ReuseShuffleIndices.empty()) {
6422 // Just reorder reuses indices.
6423 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6424 continue;
6425 }
6426 reorderScalars(Gather->Scalars, Mask);
6427 OrderedEntries.remove(Gather);
6428 }
6429 // Reorder operands of the user node and set the ordering for the user
6430 // node itself.
6431 if (Data.first->State != TreeEntry::Vectorize ||
6432 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6433 Data.first->getMainOp()) ||
6434 Data.first->isAltShuffle())
6435 Data.first->reorderOperands(Mask);
6436 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6437 Data.first->isAltShuffle() ||
6438 Data.first->State == TreeEntry::StridedVectorize) {
6439 reorderScalars(Data.first->Scalars, Mask);
6440 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6441 /*BottomOrder=*/true);
6442 if (Data.first->ReuseShuffleIndices.empty() &&
6443 !Data.first->ReorderIndices.empty() &&
6444 !Data.first->isAltShuffle()) {
6445 // Insert user node to the list to try to sink reordering deeper in
6446 // the graph.
6447 OrderedEntries.insert(Data.first);
6448 }
6449 } else {
6450 reorderOrder(Data.first->ReorderIndices, Mask);
6451 }
6452 }
6453 }
6454 // If the reordering is unnecessary, just remove the reorder.
6455 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6456 VectorizableTree.front()->ReuseShuffleIndices.empty())
6457 VectorizableTree.front()->ReorderIndices.clear();
6458}
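// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// Both reordering passes above count how many operand nodes "vote" for each
// candidate order and keep the most used one, treating the empty/identity
// order as the preferred default. A hedged, simplified sketch of that voting
// with STL containers only (no tie-breaking on filled identities here):
#if 0
#include <cstdio>
#include <map>
#include <vector>

using OrderTy = std::vector<unsigned>;

static bool isIdentity(const OrderTy &Order) {
  for (unsigned I = 0; I < Order.size(); ++I)
    if (Order[I] != I)
      return false;
  return true;
}

int main() {
  // Each operand node contributes one vote for its preferred order; an empty
  // order stands for "no preference / identity".
  std::vector<OrderTy> Votes = {
      {1, 0, 3, 2}, {1, 0, 3, 2}, {1, 0, 3, 2}, {}, {0, 1, 2, 3}};

  std::map<OrderTy, unsigned> OrdersUses;
  unsigned IdentityCnt = 0;
  for (const OrderTy &O : Votes) {
    if (O.empty() || isIdentity(O)) {
      ++IdentityCnt;
      continue;
    }
    ++OrdersUses[O];
  }

  // Prefer identity unless some explicit order is strictly more popular.
  OrderTy Best; // empty == keep identity
  unsigned BestCnt = IdentityCnt;
  for (const auto &P : OrdersUses) {
    if (P.second > BestCnt) {
      Best = P.first;
      BestCnt = P.second;
    }
  }

  if (Best.empty()) {
    std::printf("keep identity order (%u votes)\n", BestCnt);
  } else {
    std::printf("reorder with {");
    for (unsigned I : Best)
      std::printf(" %u", I);
    std::printf(" } (%u votes)\n", BestCnt); // { 1 0 3 2 } (3 votes)
  }
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------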
6459
6460Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6461 if ((Entry.getOpcode() == Instruction::Store ||
6462 Entry.getOpcode() == Instruction::Load) &&
6463 Entry.State == TreeEntry::StridedVectorize &&
6464 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6465 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6466 return dyn_cast<Instruction>(Entry.Scalars.front());
6467}
6468
6469 void BoUpSLP::buildExternalUses(
6470 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6471 DenseMap<Value *, unsigned> ScalarToExtUses;
6472 // Collect the values that we need to extract from the tree.
6473 for (auto &TEPtr : VectorizableTree) {
6474 TreeEntry *Entry = TEPtr.get();
6475
6476 // No need to handle users of gathered values.
6477 if (Entry->isGather())
6478 continue;
6479
6480 // For each lane:
6481 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6482 Value *Scalar = Entry->Scalars[Lane];
6483 if (!isa<Instruction>(Scalar))
6484 continue;
6485 // All uses must be replaced already? No need to do it again.
6486 auto It = ScalarToExtUses.find(Scalar);
6487 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6488 continue;
6489
6490 // Check if the scalar is externally used as an extra arg.
6491 const auto ExtI = ExternallyUsedValues.find(Scalar);
6492 if (ExtI != ExternallyUsedValues.end()) {
6493 int FoundLane = Entry->findLaneForValue(Scalar);
6494 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6495 << FoundLane << " from " << *Scalar << ".\n");
6496 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6497 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6498 continue;
6499 }
6500 for (User *U : Scalar->users()) {
6501 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6502
6503 Instruction *UserInst = dyn_cast<Instruction>(U);
6504 if (!UserInst || isDeleted(UserInst))
6505 continue;
6506
6507 // Ignore users in the user ignore list.
6508 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6509 continue;
6510
6511 // Skip in-tree scalars that become vectors
6512 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6513 // Some in-tree scalars will remain as scalar in vectorized
6514 // instructions. If that is the case, the one in FoundLane will
6515 // be used.
6516 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6517 !doesInTreeUserNeedToExtract(
6518 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6519 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6520 << ".\n");
6521 assert(!UseEntry->isGather() && "Bad state");
6522 continue;
6523 }
6524 U = nullptr;
6525 if (It != ScalarToExtUses.end()) {
6526 ExternalUses[It->second].User = nullptr;
6527 break;
6528 }
6529 }
6530
6531 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6532 U = nullptr;
6533 int FoundLane = Entry->findLaneForValue(Scalar);
6534 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6535 << " from lane " << FoundLane << " from " << *Scalar
6536 << ".\n");
6537 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6538 ExternalUses.emplace_back(Scalar, U, FoundLane);
6539 if (!U)
6540 break;
6541 }
6542 }
6543 }
6544}
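// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// buildExternalUses() walks the users of every vectorized scalar and records
// (scalar, user, lane) triples for users outside the tree, which later become
// extractelement instructions. A toy model of that bookkeeping; values are
// plain strings and the maps below are hypothetical stand-ins, not LLVM data
// structures.
#if 0
#include <cstdio>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

int main() {
  // Vectorized scalars and their lane in the vector.
  std::unordered_map<std::string, int> TreeLane = {
      {"a", 0}, {"b", 1}, {"c", 2}, {"d", 3}};
  // Users of each scalar.
  std::unordered_map<std::string, std::vector<std::string>> Uses = {
      {"a", {"b", "x"}}, // "x" is outside the tree -> needs an extract
      {"b", {"c"}},      // purely internal use
      {"c", {"y", "z"}}, // two external users
      {"d", {}}};

  std::vector<std::tuple<std::string, std::string, int>> ExternalUses;
  for (const auto &P : TreeLane)
    for (const std::string &User : Uses[P.first])
      if (!TreeLane.count(User))
        ExternalUses.emplace_back(P.first, User, P.second);

  for (const auto &[Scalar, User, Lane] : ExternalUses)
    std::printf("extract %s (lane %d) for user %s\n", Scalar.c_str(), Lane,
                User.c_str());
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------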
6545
6546 SmallVector<SmallVector<StoreInst *>>
6547 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6548 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6549 SmallVector<StoreInst *>>
6550 PtrToStoresMap;
6551 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6552 Value *V = TE->Scalars[Lane];
6553 // Don't iterate over the users of constant data.
6554 if (!isa<Instruction>(V))
6555 continue;
6556 // To save compilation time we don't visit if we have too many users.
6557 if (V->hasNUsesOrMore(UsesLimit))
6558 break;
6559
6560 // Collect stores per pointer object.
6561 for (User *U : V->users()) {
6562 auto *SI = dyn_cast<StoreInst>(U);
6563 // Test whether we can handle the store. V might be a global, which could
6564 // be used in a different function.
6565 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6566 !isValidElementType(SI->getValueOperand()->getType()))
6567 continue;
6568 // Skip the store if it is already vectorized (has a tree entry).
6569 if (getTreeEntry(U))
6570 continue;
6571
6572 Value *Ptr =
6573 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6574 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6575 SI->getValueOperand()->getType(), Ptr}];
6576 // For now just keep one store per pointer object per lane.
6577 // TODO: Extend this to support multiple stores per pointer per lane
6578 if (StoresVec.size() > Lane)
6579 continue;
6580 if (!StoresVec.empty()) {
6581 std::optional<int> Diff = getPointersDiff(
6582 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6583 SI->getValueOperand()->getType(),
6584 StoresVec.front()->getPointerOperand(), *DL, *SE,
6585 /*StrictCheck=*/true);
6586 // We failed to compare the pointers so just abandon this store.
6587 if (!Diff)
6588 continue;
6589 }
6590 StoresVec.push_back(SI);
6591 }
6592 }
6593 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6594 unsigned I = 0;
6595 for (auto &P : PtrToStoresMap) {
6596 Res[I].swap(P.second);
6597 ++I;
6598 }
6599 return Res;
6600}
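// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// collectUserStores() groups candidate stores by (basic block, value type,
// underlying base object) and keeps at most one store per group per lane.
// A simplified model with a tuple key; the Store struct and string "names"
// are hypothetical, only the grouping/one-per-lane rule is mirrored.
#if 0
#include <cstdio>
#include <map>
#include <string>
#include <tuple>
#include <vector>

int main() {
  struct Store {
    std::string Block, Type, Base;
    unsigned Lane;
  };
  std::vector<Store> Stores = {{"bb1", "i32", "A", 0}, {"bb1", "i32", "A", 1},
                               {"bb1", "i32", "A", 1}, // duplicate lane 1
                               {"bb1", "i32", "B", 0}, {"bb2", "i32", "A", 0}};

  using Key = std::tuple<std::string, std::string, std::string>;
  std::map<Key, std::vector<unsigned>> Groups; // key -> lanes collected
  for (const Store &S : Stores) {
    std::vector<unsigned> &Lanes = Groups[{S.Block, S.Type, S.Base}];
    if (Lanes.size() > S.Lane) // already have a store for this lane
      continue;
    Lanes.push_back(S.Lane);
  }

  for (const auto &G : Groups)
    std::printf("group (%s, %s, %s): %zu store(s)\n",
                std::get<0>(G.first).c_str(), std::get<1>(G.first).c_str(),
                std::get<2>(G.first).c_str(), G.second.size());
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------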
6601
6602bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6603 OrdersType &ReorderIndices) const {
6604 // We check whether the stores in StoresVec can form a vector by sorting
6605 // them and checking whether they are consecutive.
6606
6607 // To avoid calling getPointersDiff() while sorting we create a vector of
6608 // pairs {store, offset from first} and sort this instead.
6609 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6610 StoreInst *S0 = StoresVec[0];
6611 StoreOffsetVec.emplace_back(0, 0);
6612 Type *S0Ty = S0->getValueOperand()->getType();
6613 Value *S0Ptr = S0->getPointerOperand();
6614 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6615 StoreInst *SI = StoresVec[Idx];
6616 std::optional<int> Diff =
6617 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6618 SI->getPointerOperand(), *DL, *SE,
6619 /*StrictCheck=*/true);
6620 StoreOffsetVec.emplace_back(*Diff, Idx);
6621 }
6622
6623 // Check if the stores are consecutive by checking if their difference is 1.
6624 if (StoreOffsetVec.size() != StoresVec.size())
6625 return false;
6626 sort(StoreOffsetVec,
6627 [](const std::pair<int, unsigned> &L,
6628 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6629 unsigned Idx = 0;
6630 int PrevDist = 0;
6631 for (const auto &P : StoreOffsetVec) {
6632 if (Idx > 0 && P.first != PrevDist + 1)
6633 return false;
6634 PrevDist = P.first;
6635 ++Idx;
6636 }
6637
6638 // Calculate the shuffle indices according to their offset against the sorted
6639 // StoreOffsetVec.
6640 ReorderIndices.assign(StoresVec.size(), 0);
6641 bool IsIdentity = true;
6642 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6643 ReorderIndices[P.second] = I;
6644 IsIdentity &= P.second == I;
6645 }
6646 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6647 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6648 // same convention here.
6649 if (IsIdentity)
6650 ReorderIndices.clear();
6651
6652 return true;
6653}
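// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// canFormVector() sorts (offset, original index) pairs, requires the offsets
// to form one consecutive run, and then records where each store lands in the
// sorted order; an identity result is modeled as an empty order. A standalone
// sketch of that logic; `formsConsecutiveRun` is a hypothetical helper and
// the offsets are assumed to be already computed element distances.
#if 0
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

static bool formsConsecutiveRun(const std::vector<int> &Offsets,
                                std::vector<unsigned> &ReorderIndices) {
  std::vector<std::pair<int, unsigned>> OffIdx;
  for (unsigned I = 0; I < Offsets.size(); ++I)
    OffIdx.emplace_back(Offsets[I], I);
  std::sort(OffIdx.begin(), OffIdx.end());

  for (unsigned I = 1; I < OffIdx.size(); ++I)
    if (OffIdx[I].first != OffIdx[I - 1].first + 1)
      return false; // gap or duplicate offset -> not consecutive

  ReorderIndices.assign(Offsets.size(), 0);
  bool IsIdentity = true;
  for (unsigned SortedPos = 0; SortedPos < OffIdx.size(); ++SortedPos) {
    ReorderIndices[OffIdx[SortedPos].second] = SortedPos;
    IsIdentity &= OffIdx[SortedPos].second == SortedPos;
  }
  if (IsIdentity)
    ReorderIndices.clear(); // identity order is represented as "empty"
  return true;
}

int main() {
  std::vector<unsigned> Order;
  // Stores found in IR order but writing to offsets 2, 0, 3, 1.
  if (formsConsecutiveRun({2, 0, 3, 1}, Order)) {
    std::printf("consecutive, reorder = {");
    for (unsigned I : Order)
      std::printf(" %u", I);
    std::printf(" }\n"); // { 2 0 3 1 }
  }
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------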
6654
6655#ifndef NDEBUG
6656 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6657 for (unsigned Idx : Order)
6658 dbgs() << Idx << ", ";
6659 dbgs() << "\n";
6660}
6661#endif
6662
6663 SmallVector<BoUpSLP::OrdersType, 1>
6664 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6665 unsigned NumLanes = TE->Scalars.size();
6666
6667 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6668
6669 // Holds the reorder indices for each candidate store vector that is a user of
6670 // the current TreeEntry.
6671 SmallVector<OrdersType, 1> ExternalReorderIndices;
6672
6673 // Now inspect the stores collected per pointer and look for vectorization
6674 // candidates. For each candidate calculate the reorder index vector and push
6675 // it into `ExternalReorderIndices`
6676 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6677 // If we have fewer than NumLanes stores, then we can't form a vector.
6678 if (StoresVec.size() != NumLanes)
6679 continue;
6680
6681 // If the stores are not consecutive then abandon this StoresVec.
6682 OrdersType ReorderIndices;
6683 if (!canFormVector(StoresVec, ReorderIndices))
6684 continue;
6685
6686 // We now know that the scalars in StoresVec can form a vector instruction,
6687 // so set the reorder indices.
6688 ExternalReorderIndices.push_back(ReorderIndices);
6689 }
6690 return ExternalReorderIndices;
6691}
6692
6693 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6694 const SmallDenseSet<Value *> &UserIgnoreLst) {
6695 deleteTree();
6696 UserIgnoreList = &UserIgnoreLst;
6697 if (!allSameType(Roots))
6698 return;
6699 buildTree_rec(Roots, 0, EdgeInfo());
6700}
6701
6702 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6703 deleteTree();
6704 if (!allSameType(Roots))
6705 return;
6706 buildTree_rec(Roots, 0, EdgeInfo());
6707}
6708
6709 /// Tries to find a subvector of loads and builds a new vector of loads
6710 /// only, if it can be profitable.
6711 static void gatherPossiblyVectorizableLoads(
6712 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6713 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6714 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6715 bool AddNew = true) {
6716 if (VL.empty())
6717 return;
6718 Type *ScalarTy = getValueType(VL.front());
6719 if (!isValidElementType(ScalarTy))
6720 return;
6721 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6722 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6723 for (Value *V : VL) {
6724 auto *LI = dyn_cast<LoadInst>(V);
6725 if (!LI)
6726 continue;
6727 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6728 continue;
6729 bool IsFound = false;
6730 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6731 assert(LI->getParent() == Data.front().first->getParent() &&
6732 LI->getType() == Data.front().first->getType() &&
6733 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6734 getUnderlyingObject(Data.front().first->getPointerOperand(),
6735 RecursionMaxDepth) &&
6736 "Expected loads with the same type, same parent and same "
6737 "underlying pointer.");
6738 std::optional<int> Dist = getPointersDiff(
6739 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6740 Data.front().first->getPointerOperand(), DL, SE,
6741 /*StrictCheck=*/true);
6742 if (!Dist)
6743 continue;
6744 auto It = Map.find(*Dist);
6745 if (It != Map.end() && It->second != LI)
6746 continue;
6747 if (It == Map.end()) {
6748 Data.emplace_back(LI, *Dist);
6749 Map.try_emplace(*Dist, LI);
6750 }
6751 IsFound = true;
6752 break;
6753 }
6754 if (!IsFound) {
6755 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6756 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6757 }
6758 }
6759 auto FindMatchingLoads =
6760 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6761 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6762 &GatheredLoads,
6763 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6764 int &Offset, unsigned &Start) {
6765 if (Loads.empty())
6766 return GatheredLoads.end();
6768 LoadInst *LI = Loads.front().first;
6769 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6770 if (Idx < Start)
6771 continue;
6772 ToAdd.clear();
6773 if (LI->getParent() != Data.front().first->getParent() ||
6774 LI->getType() != Data.front().first->getType())
6775 continue;
6776 std::optional<int> Dist =
6777 getPointersDiff(LI->getType(), LI->getPointerOperand(),
6778 Data.front().first->getType(),
6779 Data.front().first->getPointerOperand(), DL, SE,
6780 /*StrictCheck=*/true);
6781 if (!Dist)
6782 continue;
6783 SmallSet<int, 4> DataDists;
6784 SmallPtrSet<LoadInst *, 4> DataLoads;
6785 for (std::pair<LoadInst *, int> P : Data) {
6786 DataDists.insert(P.second);
6787 DataLoads.insert(P.first);
6788 }
6789 // Found matching gathered loads - check if all loads are unique or
6790 // can be effectively vectorized.
6791 unsigned NumUniques = 0;
6792 for (auto [Cnt, Pair] : enumerate(Loads)) {
6793 bool Used = DataLoads.contains(Pair.first);
6794 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6795 ++NumUniques;
6796 ToAdd.insert(Cnt);
6797 } else if (Used) {
6798 Repeated.insert(Cnt);
6799 }
6800 }
6801 if (NumUniques > 0 &&
6802 (Loads.size() == NumUniques ||
6803 (Loads.size() - NumUniques >= 2 &&
6804 Loads.size() - NumUniques >= Loads.size() / 2 &&
6805 (has_single_bit(Data.size() + NumUniques) ||
6806 bit_ceil(Data.size()) <
6807 bit_ceil(Data.size() + NumUniques))))) {
6808 Offset = *Dist;
6809 Start = Idx + 1;
6810 return std::next(GatheredLoads.begin(), Idx);
6811 }
6812 }
6813 ToAdd.clear();
6814 return GatheredLoads.end();
6815 };
6816 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6817 unsigned Start = 0;
6818 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6819 int Offset = 0;
6820 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6821 Offset, Start);
6822 while (It != GatheredLoads.end()) {
6823 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6824 for (unsigned Idx : LocalToAdd)
6825 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6826 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6827 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6828 Start);
6829 }
6830 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6831 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6832 })) {
6833 auto AddNewLoads =
6834 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6835 for (unsigned Idx : seq<unsigned>(Data.size())) {
6836 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6837 continue;
6838 Loads.push_back(Data[Idx]);
6839 }
6840 };
6841 if (!AddNew) {
6842 LoadInst *LI = Data.front().first;
6843 It = find_if(
6844 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6845 return PD.front().first->getParent() == LI->getParent() &&
6846 PD.front().first->getType() == LI->getType();
6847 });
6848 while (It != GatheredLoads.end()) {
6849 AddNewLoads(*It);
6850 It = std::find_if(
6851 std::next(It), GatheredLoads.end(),
6852 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6853 return PD.front().first->getParent() == LI->getParent() &&
6854 PD.front().first->getType() == LI->getType();
6855 });
6856 }
6857 }
6858 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6859 AddNewLoads(GatheredLoads.emplace_back());
6860 }
6861 }
6862}
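// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// gatherPossiblyVectorizableLoads() clusters loads that share a base pointer
// by their constant distance from the cluster's first load and keeps one load
// per distance. A toy model where "loads" are integer addresses and the
// "unknown distance" case is faked with a 64-byte cutoff; none of this uses
// real pointer analysis.
#if 0
#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<int> LoadAddrs = {100, 104, 96, 104, 200, 204};

  // Each cluster: distance-from-reference -> address (one load per distance).
  std::vector<std::map<int, int>> Clusters;
  for (int Addr : LoadAddrs) {
    bool Found = false;
    for (std::map<int, int> &C : Clusters) {
      // Distance expressed against the same reference as the existing keys.
      int Dist = Addr - C.begin()->second + C.begin()->first;
      // Pretend addresses further than 64 apart have "unknown" distance.
      if (Dist < -64 || Dist > 64)
        continue;
      C.emplace(Dist, Addr); // no-op if this distance is already taken
      Found = true;
      break;
    }
    if (!Found)
      Clusters.push_back(std::map<int, int>{{0, Addr}});
  }

  for (std::size_t I = 0; I < Clusters.size(); ++I) {
    std::printf("cluster %zu:", I);
    for (const auto &P : Clusters[I])
      std::printf(" (dist %d, addr %d)", P.first, P.second);
    std::printf("\n");
  }
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------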
6863
6864void BoUpSLP::tryToVectorizeGatheredLoads(
6865 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6866 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6867 8> &GatheredLoads) {
6868 GatheredLoadsEntriesFirst = VectorizableTree.size();
6869
6870 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6871 LoadEntriesToVectorize.size());
6872 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6873 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6874 VectorizableTree[Idx]->Scalars.end());
6875
6876 // Sort loads by distance.
6877 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6878 const std::pair<LoadInst *, int> &L2) {
6879 return L1.second > L2.second;
6880 };
6881
6882 auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
6883 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6884 Loads.size());
6885 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6886 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6887 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6888 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6889 };
6890
6891 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6892 BoUpSLP::ValueSet &VectorizedLoads,
6893 SmallVectorImpl<LoadInst *> &NonVectorized,
6894 bool Final, unsigned MaxVF) {
6895 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
6896 unsigned StartIdx = 0;
6897 SmallVector<int> CandidateVFs;
6898 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6899 CandidateVFs.push_back(MaxVF);
6900 for (int NumElts = getFloorFullVectorNumberOfElements(
6901 *TTI, Loads.front()->getType(), MaxVF);
6902 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6903 *TTI, Loads.front()->getType(), NumElts - 1)) {
6904 CandidateVFs.push_back(NumElts);
6905 if (VectorizeNonPowerOf2 && NumElts > 2)
6906 CandidateVFs.push_back(NumElts - 1);
6907 }
6908
6909 if (Final && CandidateVFs.empty())
6910 return Results;
6911
6912 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6913 for (unsigned NumElts : CandidateVFs) {
6914 if (Final && NumElts > BestVF)
6915 continue;
6916 SmallVector<unsigned> MaskedGatherVectorized;
6917 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6918 ++Cnt) {
6919 ArrayRef<LoadInst *> Slice =
6920 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6921 if (VectorizedLoads.count(Slice.front()) ||
6922 VectorizedLoads.count(Slice.back()) ||
6923 areKnownNonVectorizableLoads(Slice))
6924 continue;
6925 // Check if it is profitable to try vectorizing gathered loads. It is
6926 // profitable if we have more than 3 consecutive loads or if we have
6927 // fewer but all of their users are vectorized or deleted.
6928 bool AllowToVectorize = false;
6929 // Check if it is profitable to vectorize 2-elements loads.
6930 if (NumElts == 2) {
6931 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6932 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6933 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6934 for (LoadInst *LI : Slice) {
6935 // If single use/user - allow to vectorize.
6936 if (LI->hasOneUse())
6937 continue;
6938 // 1. Check if number of uses equals number of users.
6939 // 2. All users are deleted.
6940 // 3. The load broadcasts are not allowed or the load is not
6941 // broadcasted.
6942 if (static_cast<unsigned int>(std::distance(
6943 LI->user_begin(), LI->user_end())) != LI->getNumUses())
6944 return false;
6945 if (!IsLegalBroadcastLoad)
6946 continue;
6947 if (LI->hasNUsesOrMore(UsesLimit))
6948 return false;
6949 for (User *U : LI->users()) {
6950 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
6951 continue;
6952 if (const TreeEntry *UTE = getTreeEntry(U)) {
6953 for (int I : seq<int>(UTE->getNumOperands())) {
6954 if (all_of(UTE->getOperand(I),
6955 [LI](Value *V) { return V == LI; }))
6956 // Found legal broadcast - do not vectorize.
6957 return false;
6958 }
6959 }
6960 }
6961 }
6962 return true;
6963 };
6964 AllowToVectorize = CheckIfAllowed(Slice);
6965 } else {
6966 AllowToVectorize =
6967 (NumElts >= 3 ||
6968 any_of(ValueToGatherNodes.at(Slice.front()),
6969 [=](const TreeEntry *TE) {
6970 return TE->Scalars.size() == 2 &&
6971 ((TE->Scalars.front() == Slice.front() &&
6972 TE->Scalars.back() == Slice.back()) ||
6973 (TE->Scalars.front() == Slice.back() &&
6974 TE->Scalars.back() == Slice.front()));
6975 })) &&
6976 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
6977 Slice.size());
6978 }
6979 if (AllowToVectorize) {
6980 SmallVector<Value *> PointerOps;
6981 OrdersType CurrentOrder;
6982 // Try to build vector load.
6983 ArrayRef<Value *> Values(
6984 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
6985 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
6986 PointerOps, &BestVF);
6987 if (LS != LoadsState::Gather ||
6988 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
6989 if (LS == LoadsState::ScatterVectorize) {
6990 if (MaskedGatherVectorized.empty() ||
6991 Cnt >= MaskedGatherVectorized.back() + NumElts)
6992 MaskedGatherVectorized.push_back(Cnt);
6993 continue;
6994 }
6995 if (LS != LoadsState::Gather) {
6996 Results.emplace_back(Values, LS);
6997 VectorizedLoads.insert(Slice.begin(), Slice.end());
6998 // If we vectorized initial block, no need to try to vectorize it
6999 // again.
7000 if (Cnt == StartIdx)
7001 StartIdx += NumElts;
7002 }
7003 // Check if the whole array was vectorized already - exit.
7004 if (StartIdx >= Loads.size())
7005 break;
7006 // Erase last masked gather candidate, if another candidate within
7007 // the range is found to be better.
7008 if (!MaskedGatherVectorized.empty() &&
7009 Cnt < MaskedGatherVectorized.back() + NumElts)
7010 MaskedGatherVectorized.pop_back();
7011 Cnt += NumElts - 1;
7012 continue;
7013 }
7014 }
7015 if (!AllowToVectorize || BestVF == 0)
7016 registerNonVectorizableLoads(Slice);
7017 }
7018 // Mark masked gathers candidates as vectorized, if any.
7019 for (unsigned Cnt : MaskedGatherVectorized) {
7020 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7021 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7022 ArrayRef<Value *> Values(
7023 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7024 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7025 VectorizedLoads.insert(Slice.begin(), Slice.end());
7026 // If we vectorized initial block, no need to try to vectorize it again.
7027 if (Cnt == StartIdx)
7028 StartIdx += NumElts;
7029 }
7030 }
7031 for (LoadInst *LI : Loads) {
7032 if (!VectorizedLoads.contains(LI))
7033 NonVectorized.push_back(LI);
7034 }
7035 return Results;
7036 };
7037 auto ProcessGatheredLoads =
7038 [&, &TTI = *TTI](
7039 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7040 bool Final = false) {
7041 SmallVector<LoadInst *> NonVectorized;
7042 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7043 if (LoadsDists.size() <= 1) {
7044 NonVectorized.push_back(LoadsDists.back().first);
7045 continue;
7046 }
7047 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7048 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7049 transform(
7050 LoadsDists, OriginalLoads.begin(),
7051 [](const std::pair<LoadInst *, int> &L) { return L.first; });
7052 stable_sort(LocalLoadsDists, LoadSorter);
7053 SmallVector<LoadInst *> Loads;
7054 unsigned MaxConsecutiveDistance = 0;
7055 unsigned CurrentConsecutiveDist = 1;
7056 int LastDist = LocalLoadsDists.front().second;
7057 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7058 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7059 if (getTreeEntry(L.first))
7060 continue;
7061 assert(LastDist >= L.second &&
7062 "Expected first distance always not less than second");
7063 if (static_cast<unsigned>(LastDist - L.second) ==
7064 CurrentConsecutiveDist) {
7065 ++CurrentConsecutiveDist;
7066 MaxConsecutiveDistance =
7067 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7068 Loads.push_back(L.first);
7069 continue;
7070 }
7071 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7072 !Loads.empty())
7073 Loads.pop_back();
7074 CurrentConsecutiveDist = 1;
7075 LastDist = L.second;
7076 Loads.push_back(L.first);
7077 }
7078 if (Loads.size() <= 1)
7079 continue;
7080 if (AllowMaskedGather)
7081 MaxConsecutiveDistance = Loads.size();
7082 else if (MaxConsecutiveDistance < 2)
7083 continue;
7084 BoUpSLP::ValueSet VectorizedLoads;
7085 SmallVector<LoadInst *> SortedNonVectorized;
7086 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
7087 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7088 Final, MaxConsecutiveDistance);
7089 if (!Results.empty() && !SortedNonVectorized.empty() &&
7090 OriginalLoads.size() == Loads.size() &&
7091 MaxConsecutiveDistance == Loads.size() &&
7092 any_of(Results,
7093 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7094 return P.second == LoadsState::ScatterVectorize;
7095 })) {
7096 VectorizedLoads.clear();
7097 SmallVector<LoadInst *> UnsortedNonVectorized;
7098 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
7099 UnsortedResults =
7100 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7101 UnsortedNonVectorized, Final,
7102 OriginalLoads.size());
7103 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7104 SortedNonVectorized.swap(UnsortedNonVectorized);
7105 Results.swap(UnsortedResults);
7106 }
7107 }
7108 for (auto [Slice, _] : Results) {
7109 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7110 << Slice.size() << ")\n");
7111 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7112 for (Value *L : Slice)
7113 if (!getTreeEntry(L))
7114 SortedNonVectorized.push_back(cast<LoadInst>(L));
7115 continue;
7116 }
7117
7118 // Select maximum VF as a maximum of user gathered nodes and
7119 // distance between scalar loads in these nodes.
7120 unsigned MaxVF = Slice.size();
7121 unsigned UserMaxVF = 0;
7122 unsigned InterleaveFactor = 0;
7123 if (MaxVF == 2) {
7124 UserMaxVF = MaxVF;
7125 } else {
7126 // Found distance between segments of the interleaved loads.
7127 std::optional<unsigned> InterleavedLoadsDistance = 0;
7128 unsigned Order = 0;
7129 std::optional<unsigned> CommonVF = 0;
7130 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7131 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7132 for (auto [Idx, V] : enumerate(Slice)) {
7133 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7134 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7135 unsigned Pos =
7136 EntryToPosition.try_emplace(E, Idx).first->second;
7137 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7138 if (CommonVF) {
7139 if (*CommonVF == 0) {
7140 CommonVF = E->Scalars.size();
7141 continue;
7142 }
7143 if (*CommonVF != E->Scalars.size())
7144 CommonVF.reset();
7145 }
7146 // Check if the load is part of an interleaved load.
7147 if (Pos != Idx && InterleavedLoadsDistance) {
7148 if (!DeinterleavedNodes.contains(E) &&
7149 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7150 if (isa<Constant>(V))
7151 return false;
7152 if (getTreeEntry(V))
7153 return true;
7154 const auto &Nodes = ValueToGatherNodes.at(V);
7155 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7156 !is_contained(Slice, V);
7157 })) {
7158 InterleavedLoadsDistance.reset();
7159 continue;
7160 }
7161 DeinterleavedNodes.insert(E);
7162 if (*InterleavedLoadsDistance == 0) {
7163 InterleavedLoadsDistance = Idx - Pos;
7164 continue;
7165 }
7166 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7167 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7168 InterleavedLoadsDistance.reset();
7169 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7170 }
7171 }
7172 }
7173 DeinterleavedNodes.clear();
7174 // Check if the large load represents an interleaved load operation.
7175 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7176 CommonVF.value_or(0) != 0) {
7177 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7178 unsigned VF = *CommonVF;
7179 OrdersType Order;
7180 SmallVector<Value *> PointerOps;
7181 // Segmented load detected - vectorize at maximum vector factor.
7182 if (InterleaveFactor <= Slice.size() &&
7183 TTI.isLegalInterleavedAccessType(
7184 getWidenedType(Slice.front()->getType(), VF),
7185 InterleaveFactor,
7186 cast<LoadInst>(Slice.front())->getAlign(),
7187 cast<LoadInst>(Slice.front())
7188 ->getPointerAddressSpace()) &&
7189 canVectorizeLoads(Slice, Slice.front(), Order,
7190 PointerOps) == LoadsState::Vectorize) {
7191 UserMaxVF = InterleaveFactor * VF;
7192 } else {
7193 InterleaveFactor = 0;
7194 }
7195 }
7196 // Cannot represent the loads as consecutive vectorizable nodes -
7197 // just exit.
7198 unsigned ConsecutiveNodesSize = 0;
7199 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7200 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7201 [&, Slice = Slice](const auto &P) {
7202 const auto *It = find_if(Slice, [&](Value *V) {
7203 return std::get<1>(P).contains(V);
7204 });
7205 if (It == Slice.end())
7206 return false;
7207 ArrayRef<Value *> VL =
7208 VectorizableTree[std::get<0>(P)]->Scalars;
7209 ConsecutiveNodesSize += VL.size();
7210 unsigned Start = std::distance(Slice.begin(), It);
7211 unsigned Sz = Slice.size() - Start;
7212 return Sz < VL.size() ||
7213 Slice.slice(std::distance(Slice.begin(), It),
7214 VL.size()) != VL;
7215 }))
7216 continue;
7217 // Try to build long masked gather loads.
7218 UserMaxVF = bit_ceil(UserMaxVF);
7219 if (InterleaveFactor == 0 &&
7220 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7221 [&, Slice = Slice](unsigned Idx) {
7222 OrdersType Order;
7223 SmallVector<Value *> PointerOps;
7224 return canVectorizeLoads(
7225 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7226 Slice[Idx * UserMaxVF], Order,
7227 PointerOps) ==
7228 LoadsState::ScatterVectorize;
7229 }))
7230 UserMaxVF = MaxVF;
7231 if (Slice.size() != ConsecutiveNodesSize)
7232 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7233 }
7234 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7235 bool IsVectorized = true;
7236 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7237 ArrayRef<Value *> SubSlice =
7238 Slice.slice(I, std::min(VF, E - I));
7239 if (getTreeEntry(SubSlice.front()))
7240 continue;
7241 // Check if the subslice is a to-be-vectorized entry which is not
7242 // equal to the current entry.
7243 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7244 [&](const auto &P) {
7245 return !SubSlice.equals(
7246 VectorizableTree[std::get<0>(P)]
7247 ->Scalars) &&
7248 set_is_subset(SubSlice, std::get<1>(P));
7249 }))
7250 continue;
7251 unsigned Sz = VectorizableTree.size();
7252 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7253 if (Sz == VectorizableTree.size()) {
7254 IsVectorized = false;
7255 // Try non-interleaved vectorization with smaller vector
7256 // factor.
7257 if (InterleaveFactor > 0) {
7258 VF = 2 * (MaxVF / InterleaveFactor);
7259 InterleaveFactor = 0;
7260 }
7261 continue;
7262 }
7263 }
7264 if (IsVectorized)
7265 break;
7266 }
7267 }
7268 NonVectorized.append(SortedNonVectorized);
7269 }
7270 return NonVectorized;
7271 };
7272 for (const auto &GLs : GatheredLoads) {
7273 const auto &Ref = GLs.second;
7274 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7275 if (!Ref.empty() && !NonVectorized.empty() &&
7276 std::accumulate(
7277 Ref.begin(), Ref.end(), 0u,
7278 [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
7279 return S + LoadsDists.size();
7280 }) != NonVectorized.size() &&
7281 IsMaskedGatherSupported(NonVectorized)) {
7282 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7283 for (LoadInst *LI : NonVectorized) {
7284 // Reinsert non-vectorized loads into another list of loads with the
7285 // same base pointers.
7286 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7287 FinalGatheredLoads,
7288 /*AddNew=*/false);
7289 }
7290 // Final attempt to vectorize non-vectorized loads.
7291 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7292 }
7293 }
7294 // Try to vectorize postponed load entries, previously marked as gathered.
7295 for (unsigned Idx : LoadEntriesToVectorize) {
7296 const TreeEntry &E = *VectorizableTree[Idx];
7297 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7298 // Avoid reordering, if possible.
7299 if (!E.ReorderIndices.empty()) {
7300 // Build a mask out of the reorder indices and reorder scalars per this
7301 // mask.
7302 SmallVector<int> ReorderMask;
7303 inversePermutation(E.ReorderIndices, ReorderMask);
7304 reorderScalars(GatheredScalars, ReorderMask);
7305 }
7306 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7307 }
7308 // If no new entries were created, there are no gathered-load entries that
7309 // must be handled.
7310 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7311 VectorizableTree.size())
7312 GatheredLoadsEntriesFirst.reset();
7313}
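// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// The final loop above retries each gathered-load slice with halving vector
// factors until some VF produces new tree entries for every sub-slice. A
// hedged sketch of that retry structure; `tryVectorizeSlice` is a hypothetical
// stand-in for "buildTree_rec created a new entry", and the real pass also
// skips sub-slices that are already vectorized.
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

static bool tryVectorizeSlice(const std::vector<int> &Slice, unsigned Begin,
                              unsigned Len) {
  (void)Slice;
  (void)Begin;
  return Len <= 4; // pretend only sub-slices of <= 4 loads vectorize
}

int main() {
  std::vector<int> Loads(12);
  unsigned MaxVF = 8;
  for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
    bool AllVectorized = true;
    for (unsigned I = 0, E = static_cast<unsigned>(Loads.size()); I < E;
         I += VF) {
      unsigned Len = std::min<unsigned>(VF, E - I);
      if (!tryVectorizeSlice(Loads, I, Len))
        AllVectorized = false;
      else
        std::printf("VF=%u: vectorized [%u, %u)\n", VF, I, I + Len);
    }
    if (AllVectorized)
      break; // stop at the largest VF that covered every sub-slice
  }
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------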
7314
7315/// \return true if the specified list of values has only one instruction that
7316/// requires scheduling, false otherwise.
7317#ifndef NDEBUG
7318 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7319 Value *NeedsScheduling = nullptr;
7320 for (Value *V : VL) {
7321 if (doesNotNeedToBeScheduled(V))
7322 continue;
7323 if (!NeedsScheduling) {
7324 NeedsScheduling = V;
7325 continue;
7326 }
7327 return false;
7328 }
7329 return NeedsScheduling;
7330}
7331#endif
7332
7333 /// Generates a key/subkey pair for the given value to provide effective
7334 /// sorting of the values and better detection of vectorizable value
7335 /// sequences. The keys/subkeys can be used for better sorting of the values
7336 /// themselves (keys) and of value subgroups (subkeys).
7337static std::pair<size_t, size_t> generateKeySubkey(
7338 Value *V, const TargetLibraryInfo *TLI,
7339 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7340 bool AllowAlternate) {
7341 hash_code Key = hash_value(V->getValueID() + 2);
7342 hash_code SubKey = hash_value(0);
7343 // Sort the loads by the distance between the pointers.
7344 if (auto *LI = dyn_cast<LoadInst>(V)) {
7345 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7346 if (LI->isSimple())
7347 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7348 else
7349 Key = SubKey = hash_value(LI);
7350 } else if (isVectorLikeInstWithConstOps(V)) {
7351 // Sort extracts by the vector operands.
7352 if (isa<ExtractElementInst, UndefValue>(V))
7353 Key = hash_value(Value::UndefValueVal + 1);
7354 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7355 if (!isUndefVector(EI->getVectorOperand()).all() &&
7356 !isa<UndefValue>(EI->getIndexOperand()))
7357 SubKey = hash_value(EI->getVectorOperand());
7358 }
7359 } else if (auto *I = dyn_cast<Instruction>(V)) {
7360 // Sort other instructions just by the opcodes except for CMPInst.
7361 // For CMP also sort by the predicate kind.
7362 if ((isa<BinaryOperator, CastInst>(I)) &&
7363 isValidForAlternation(I->getOpcode())) {
7364 if (AllowAlternate)
7365 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7366 else
7367 Key = hash_combine(hash_value(I->getOpcode()), Key);
7368 SubKey = hash_combine(
7369 hash_value(I->getOpcode()), hash_value(I->getType()),
7370 hash_value(isa<BinaryOperator>(I)
7371 ? I->getType()
7372 : cast<CastInst>(I)->getOperand(0)->getType()));
7373 // For casts, look through the only operand to improve compile time.
7374 if (isa<CastInst>(I)) {
7375 std::pair<size_t, size_t> OpVals =
7376 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7377 /*AllowAlternate=*/true);
7378 Key = hash_combine(OpVals.first, Key);
7379 SubKey = hash_combine(OpVals.first, SubKey);
7380 }
7381 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7382 CmpInst::Predicate Pred = CI->getPredicate();
7383 if (CI->isCommutative())
7384 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7385 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
7386 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7387 hash_value(SwapPred),
7388 hash_value(CI->getOperand(0)->getType()));
7389 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7390 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
7391 if (isTriviallyVectorizable(ID)) {
7392 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7393 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7394 SubKey = hash_combine(hash_value(I->getOpcode()),
7395 hash_value(Call->getCalledFunction()));
7396 } else {
7397 Key = hash_combine(hash_value(Call), Key);
7398 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7399 }
7400 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7401 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7402 hash_value(Op.Tag), SubKey);
7403 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7404 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7405 SubKey = hash_value(Gep->getPointerOperand());
7406 else
7407 SubKey = hash_value(Gep);
7408 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7409 !isa<ConstantInt>(I->getOperand(1))) {
7410 // Do not try to vectorize instructions with potentially high cost.
7411 SubKey = hash_value(I);
7412 } else {
7413 SubKey = hash_value(I->getOpcode());
7414 }
7415 Key = hash_combine(hash_value(I->getParent()), Key);
7416 }
7417 return std::make_pair(Key, SubKey);
7418}
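// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// generateKeySubkey() produces a coarse key and a finer subkey so candidate
// values can be bucketed twice: first by key, then by subkey within each key
// group. A simplified sketch with strings standing in for instructions;
// `keySubkey` is a hypothetical stand-in, not the function above.
#if 0
#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <utility>
#include <vector>

static std::pair<std::size_t, std::size_t>
keySubkey(const std::string &Kind, const std::string &Detail) {
  std::hash<std::string> H;
  // Key is the instruction "kind"; subkey refines it (e.g. by operand type).
  return {H(Kind), H(Kind + "/" + Detail)};
}

int main() {
  std::vector<std::pair<std::string, std::string>> Values = {
      {"add", "i32"}, {"add", "i32"}, {"add", "i64"}, {"load", "i32"}};

  // Two-level bucketing: equal keys sort values next to each other; equal
  // subkeys form the groups that are actually considered for vectorization.
  std::map<std::size_t, std::map<std::size_t, unsigned>> Buckets;
  for (const auto &V : Values) {
    auto [Key, SubKey] = keySubkey(V.first, V.second);
    ++Buckets[Key][SubKey];
  }

  for (const auto &KeyGroup : Buckets)
    for (const auto &SubGroup : KeyGroup.second)
      std::printf("key %zx subkey %zx -> %u value(s)\n", KeyGroup.first,
                  SubGroup.first, SubGroup.second);
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------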
7419
7420/// Checks if the specified instruction \p I is an alternate operation for
7421/// the given \p MainOp and \p AltOp instructions.
7422static bool isAlternateInstruction(const Instruction *I,
7423 const Instruction *MainOp,
7424 const Instruction *AltOp,
7425 const TargetLibraryInfo &TLI);
7426
7427bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7428 ArrayRef<Value *> VL) const {
7429 unsigned Opcode0 = S.getOpcode();
7430 unsigned Opcode1 = S.getAltOpcode();
7431 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7432 // If this pattern is supported by the target then consider it profitable.
7433 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7434 Opcode0, Opcode1, OpcodeMask))
7435 return true;
7436 SmallVector<SmallVector<Value *>> Operands;
7437 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7438 Operands.emplace_back();
7439 // Prepare the operand vector.
7440 for (Value *V : VL) {
7441 if (isa<PoisonValue>(V)) {
7442 Operands.back().push_back(
7443 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7444 continue;
7445 }
7446 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7447 }
7448 }
7449 if (Operands.size() == 2) {
7450 // Try to find the best operand candidates.
7451 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7452 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7453 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7454 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7455 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7456 std::optional<int> Res = findBestRootPair(Candidates);
7457 switch (Res.value_or(0)) {
7458 case 0:
7459 break;
7460 case 1:
7461 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7462 break;
7463 case 2:
7464 std::swap(Operands[0][I], Operands[1][I]);
7465 break;
7466 default:
7467 llvm_unreachable("Unexpected index.");
7468 }
7469 }
7470 }
7471 DenseSet<unsigned> UniqueOpcodes;
7472 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7473 unsigned NonInstCnt = 0;
7474 // Estimate the number of instructions required for the vectorized node
7475 // and for the buildvector node.
7476 unsigned UndefCnt = 0;
7477 // Count the number of extra shuffles, required for vector nodes.
7478 unsigned ExtraShuffleInsts = 0;
7479 // Check that operands do not contain same values and create either perfect
7480 // diamond match or shuffled match.
7481 if (Operands.size() == 2) {
7482 // Do not count same operands twice.
7483 if (Operands.front() == Operands.back()) {
7484 Operands.erase(Operands.begin());
7485 } else if (!allConstant(Operands.front()) &&
7486 all_of(Operands.front(), [&](Value *V) {
7487 return is_contained(Operands.back(), V);
7488 })) {
7489 Operands.erase(Operands.begin());
7490 ++ExtraShuffleInsts;
7491 }
7492 }
7493 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7494 // Vectorize node, if:
7495 // 1. At least a single operand is constant or splat.
7496 // 2. Operands have many loop invariants (the instructions are not loop
7497 // invariants).
7498 // 3. At least a single unique operand is supposed to be vectorized.
7499 return none_of(Operands,
7500 [&](ArrayRef<Value *> Op) {
7501 if (allConstant(Op) ||
7502 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7503 getSameOpcode(Op, *TLI).getMainOp()))
7504 return false;
7505 DenseMap<Value *, unsigned> Uniques;
7506 for (Value *V : Op) {
7507 if (isa<Constant, ExtractElementInst>(V) ||
7508 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7509 if (isa<UndefValue>(V))
7510 ++UndefCnt;
7511 continue;
7512 }
7513 auto Res = Uniques.try_emplace(V, 0);
7514 // Found first duplicate - need to add shuffle.
7515 if (!Res.second && Res.first->second == 1)
7516 ++ExtraShuffleInsts;
7517 ++Res.first->getSecond();
7518 if (auto *I = dyn_cast<Instruction>(V))
7519 UniqueOpcodes.insert(I->getOpcode());
7520 else if (Res.second)
7521 ++NonInstCnt;
7522 }
7523 return none_of(Uniques, [&](const auto &P) {
7524 return P.first->hasNUsesOrMore(P.second + 1) &&
7525 none_of(P.first->users(), [&](User *U) {
7526 return getTreeEntry(U) || Uniques.contains(U);
7527 });
7528 });
7529 }) ||
7530 // Do not vectorize node, if estimated number of vector instructions is
7531 // more than estimated number of buildvector instructions. Number of
7532 // vector operands is number of vector instructions + number of vector
7533 // instructions for operands (buildvectors). Number of buildvector
7534 // instructions is just number_of_operands * number_of_scalars.
7535 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7536 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7537 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7538}
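// --- Editor's illustrative sketch (not part of SLPVectorizer.cpp) ----------
// areAltOperandsProfitable() first builds a per-lane mask marking which lanes
// use the alternate opcode before asking the target whether the (main, alt)
// pair is legal as a single vector instruction. A minimal sketch of building
// such a mask; `buildAltMask` and the numeric opcode ids are hypothetical.
#if 0
#include <cstdio>
#include <vector>

static std::vector<bool> buildAltMask(const std::vector<unsigned> &LaneOpcodes,
                                      unsigned MainOpcode,
                                      unsigned AltOpcode) {
  std::vector<bool> Mask(LaneOpcodes.size(), false);
  for (std::size_t I = 0; I < LaneOpcodes.size(); ++I)
    // In the real pass every lane carries either the main or the alt opcode.
    Mask[I] = LaneOpcodes[I] == AltOpcode && AltOpcode != MainOpcode;
  return Mask;
}

int main() {
  // Lanes alternating between two opcode ids (values are arbitrary here).
  std::vector<unsigned> LaneOpcodes = {13, 15, 13, 15};
  std::vector<bool> Mask =
      buildAltMask(LaneOpcodes, /*MainOpcode=*/13, /*AltOpcode=*/15);
  for (std::size_t I = 0; I < Mask.size(); ++I)
    std::printf("lane %zu: %s\n", I, Mask[I] ? "alt" : "main");
  return 0;
}
#endif
// --- End of editor's sketch -------------------------------------------------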
7539
7540BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7541 const InstructionsState &S, ArrayRef<Value *> VL,
7542 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7543 SmallVectorImpl<Value *> &PointerOps) {
7544 assert(S.getMainOp() &&
7545 "Expected instructions with same/alternate opcodes only.");
7546
7547 unsigned ShuffleOrOp =
7548 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7549 Instruction *VL0 = S.getMainOp();
7550 switch (ShuffleOrOp) {
7551 case Instruction::PHI: {
7552 // Too many operands - gather, most probably won't be vectorized.
7553 if (VL0->getNumOperands() > MaxPHINumOperands)
7554 return TreeEntry::NeedToGather;
7555 // Check for terminator values (e.g. invoke).
7556 for (Value *V : VL) {
7557 auto *PHI = dyn_cast<PHINode>(V);
7558 if (!PHI)
7559 continue;
7560 for (Value *Incoming : PHI->incoming_values()) {
7561 Instruction *Term = dyn_cast<Instruction>(Incoming);
7562 if (Term && Term->isTerminator()) {
7563 LLVM_DEBUG(dbgs()
7564 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7565 return TreeEntry::NeedToGather;
7566 }
7567 }
7568 }
7569
7570 return TreeEntry::Vectorize;
7571 }
7572 case Instruction::ExtractValue:
7573 case Instruction::ExtractElement: {
7574 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7575 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7576 if (!has_single_bit(VL.size()))
7577 return TreeEntry::NeedToGather;
7578 if (Reuse || !CurrentOrder.empty())
7579 return TreeEntry::Vectorize;
7580 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7581 return TreeEntry::NeedToGather;
7582 }
7583 case Instruction::InsertElement: {
7584 // Check that we have a buildvector and not a shuffle of 2 or more
7585 // different vectors.
7586 ValueSet SourceVectors;
7587 for (Value *V : VL) {
7588 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7589 assert(getElementIndex(V) != std::nullopt &&
7590 "Non-constant or undef index?");
7591 }
7592
7593 if (count_if(VL, [&SourceVectors](Value *V) {
7594 return !SourceVectors.contains(V);
7595 }) >= 2) {
7596 // Found 2nd source vector - cancel.
7597 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7598 "different source vectors.\n");
7599 return TreeEntry::NeedToGather;
7600 }
7601
7602 if (any_of(VL, [&SourceVectors](Value *V) {
7603 // The last InsertElement can have multiple uses.
7604 return SourceVectors.contains(V) && !V->hasOneUse();
7605 })) {
7606 assert(SLPReVec && "Only supported by REVEC.");
7607 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7608 "multiple uses.\n");
7609 return TreeEntry::NeedToGather;
7610 }
7611
7612 return TreeEntry::Vectorize;
7613 }
7614 case Instruction::Load: {
7615 // Check that a vectorized load would load the same memory as a scalar
7616 // load. For example, we don't want to vectorize loads that are smaller
7617 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7618 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7619 // from such a struct, we read/write packed bits disagreeing with the
7620 // unvectorized version.
7621 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7622 case LoadsState::Vectorize:
7623 return TreeEntry::Vectorize;
7624 case LoadsState::ScatterVectorize:
7625 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7626 // Delay slow vectorized nodes for better vectorization attempts.
7627 LoadEntriesToVectorize.insert(VectorizableTree.size());
7628 return TreeEntry::NeedToGather;
7629 }
7630 return TreeEntry::ScatterVectorize;
7631 case LoadsState::StridedVectorize:
7632 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7633 // Delay slow vectorized nodes for better vectorization attempts.
7634 LoadEntriesToVectorize.insert(VectorizableTree.size());
7635 return TreeEntry::NeedToGather;
7636 }
7637 return TreeEntry::StridedVectorize;
7638 case LoadsState::Gather:
7639#ifndef NDEBUG
7640 Type *ScalarTy = VL0->getType();
7641 if (DL->getTypeSizeInBits(ScalarTy) !=
7642 DL->getTypeAllocSizeInBits(ScalarTy))
7643 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7644 else if (any_of(VL, [](Value *V) {
7645 auto *LI = dyn_cast<LoadInst>(V);
7646 return !LI || !LI->isSimple();
7647 }))
7648 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7649 else
7650 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7651#endif // NDEBUG
7652 registerNonVectorizableLoads(VL);
7653 return TreeEntry::NeedToGather;
7654 }
7655 llvm_unreachable("Unexpected state of loads");
7656 }
7657 case Instruction::ZExt:
7658 case Instruction::SExt:
7659 case Instruction::FPToUI:
7660 case Instruction::FPToSI:
7661 case Instruction::FPExt:
7662 case Instruction::PtrToInt:
7663 case Instruction::IntToPtr:
7664 case Instruction::SIToFP:
7665 case Instruction::UIToFP:
7666 case Instruction::Trunc:
7667 case Instruction::FPTrunc:
7668 case Instruction::BitCast: {
7669 Type *SrcTy = VL0->getOperand(0)->getType();
7670 for (Value *V : VL) {
7671 if (isa<PoisonValue>(V))
7672 continue;
7673 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7674 if (Ty != SrcTy || !isValidElementType(Ty)) {
7675 LLVM_DEBUG(
7676 dbgs() << "SLP: Gathering casts with different src types.\n");
7677 return TreeEntry::NeedToGather;
7678 }
7679 }
7680 return TreeEntry::Vectorize;
7681 }
7682 case Instruction::ICmp:
7683 case Instruction::FCmp: {
7684 // Check that all of the compares have the same predicate.
7685 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7686 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7687 Type *ComparedTy = VL0->getOperand(0)->getType();
7688 for (Value *V : VL) {
7689 if (isa<PoisonValue>(V))
7690 continue;
7691 auto *Cmp = cast<CmpInst>(V);
7692 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7693 Cmp->getOperand(0)->getType() != ComparedTy) {
7694 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7695 return TreeEntry::NeedToGather;
7696 }
7697 }
7698 return TreeEntry::Vectorize;
7699 }
7700 case Instruction::Select:
7701 case Instruction::FNeg:
7702 case Instruction::Add:
7703 case Instruction::FAdd:
7704 case Instruction::Sub:
7705 case Instruction::FSub:
7706 case Instruction::Mul:
7707 case Instruction::FMul:
7708 case Instruction::UDiv:
7709 case Instruction::SDiv:
7710 case Instruction::FDiv:
7711 case Instruction::URem:
7712 case Instruction::SRem:
7713 case Instruction::FRem:
7714 case Instruction::Shl:
7715 case Instruction::LShr:
7716 case Instruction::AShr:
7717 case Instruction::And:
7718 case Instruction::Or:
7719 case Instruction::Xor:
7720 case Instruction::Freeze:
7721 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7722 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7723 auto *I = dyn_cast<Instruction>(V);
7724 return I && I->isBinaryOp() && !I->isFast();
7725 }))
7726 return TreeEntry::NeedToGather;
7727 return TreeEntry::Vectorize;
7728 case Instruction::GetElementPtr: {
7729 // We don't combine GEPs with complicated (nested) indexing.
7730 for (Value *V : VL) {
7731 auto *I = dyn_cast<GetElementPtrInst>(V);
7732 if (!I)
7733 continue;
7734 if (I->getNumOperands() != 2) {
7735 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7736 return TreeEntry::NeedToGather;
7737 }
7738 }
7739
7740 // We can't combine several GEPs into one vector if they operate on
7741 // different types.
7742 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7743 for (Value *V : VL) {
7744 auto *GEP = dyn_cast<GEPOperator>(V);
7745 if (!GEP)
7746 continue;
7747 Type *CurTy = GEP->getSourceElementType();
7748 if (Ty0 != CurTy) {
7749 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7750 return TreeEntry::NeedToGather;
7751 }
7752 }
7753
7754 // We don't combine GEPs with non-constant indexes.
7755 Type *Ty1 = VL0->getOperand(1)->getType();
7756 for (Value *V : VL) {
7757 auto *I = dyn_cast<GetElementPtrInst>(V);
7758 if (!I)
7759 continue;
7760 auto *Op = I->getOperand(1);
7761 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7762 (Op->getType() != Ty1 &&
7763 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7764 Op->getType()->getScalarSizeInBits() >
7765 DL->getIndexSizeInBits(
7766 V->getType()->getPointerAddressSpace())))) {
7767 LLVM_DEBUG(
7768 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7769 return TreeEntry::NeedToGather;
7770 }
7771 }
7772
7773 return TreeEntry::Vectorize;
7774 }
7775 case Instruction::Store: {
7776 // Check if the stores are consecutive or if we need to swizzle them.
7777 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7778 // Avoid types that are padded when being allocated as scalars, while
7779 // being packed together in a vector (such as i1).
7780 if (DL->getTypeSizeInBits(ScalarTy) !=
7781 DL->getTypeAllocSizeInBits(ScalarTy)) {
7782 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7783 return TreeEntry::NeedToGather;
7784 }
7785 // Make sure all stores in the bundle are simple - we can't vectorize
7786 // atomic or volatile stores.
7787 for (Value *V : VL) {
7788 auto *SI = cast<StoreInst>(V);
7789 if (!SI->isSimple()) {
7790 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7791 return TreeEntry::NeedToGather;
7792 }
7793 PointerOps.push_back(SI->getPointerOperand());
7794 }
7795
7796 // Check the order of pointer operands.
7797 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7798 Value *Ptr0;
7799 Value *PtrN;
7800 if (CurrentOrder.empty()) {
7801 Ptr0 = PointerOps.front();
7802 PtrN = PointerOps.back();
7803 } else {
7804 Ptr0 = PointerOps[CurrentOrder.front()];
7805 PtrN = PointerOps[CurrentOrder.back()];
7806 }
7807 std::optional<int> Dist =
7808 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7809 // Check that the sorted pointer operands are consecutive.
7810 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7811 return TreeEntry::Vectorize;
7812 }
7813
7814 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7815 return TreeEntry::NeedToGather;
7816 }
7817 case Instruction::Call: {
7818 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7819 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7820 auto *I = dyn_cast<Instruction>(V);
7821 return I && !I->isFast();
7822 }))
7823 return TreeEntry::NeedToGather;
7824 // Check if the calls are all to the same vectorizable intrinsic or
7825 // library function.
7826 CallInst *CI = cast<CallInst>(VL0);
7827 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7828
7829 VFShape Shape = VFShape::get(
7830 CI->getFunctionType(),
7831 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7832 false /*HasGlobalPred*/);
7833 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7834
7835 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7836 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7837 return TreeEntry::NeedToGather;
7838 }
7839 Function *F = CI->getCalledFunction();
7840 unsigned NumArgs = CI->arg_size();
7841 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7842 for (unsigned J = 0; J != NumArgs; ++J)
7843 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
7844 ScalarArgs[J] = CI->getArgOperand(J);
7845 for (Value *V : VL) {
7846 CallInst *CI2 = dyn_cast<CallInst>(V);
7847 if (!CI2 || CI2->getCalledFunction() != F ||
7848 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7849 (VecFunc &&
7850 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7851 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7852 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7853 << "\n");
7854 return TreeEntry::NeedToGather;
7855 }
7856 // Some intrinsics have scalar arguments, and these must be the same in
7857 // order for the calls to be vectorized.
7858 for (unsigned J = 0; J != NumArgs; ++J) {
7859 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
7860 Value *A1J = CI2->getArgOperand(J);
7861 if (ScalarArgs[J] != A1J) {
7862 LLVM_DEBUG(dbgs()
7863 << "SLP: mismatched arguments in call:" << *CI
7864 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7865 return TreeEntry::NeedToGather;
7866 }
7867 }
7868 }
7869 // Verify that the bundle operands are identical between the two calls.
7870 if (CI->hasOperandBundles() &&
7871 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7872 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7873 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7874 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7875 << "!=" << *V << '\n');
7876 return TreeEntry::NeedToGather;
7877 }
7878 }
7879
7880 return TreeEntry::Vectorize;
7881 }
7882 case Instruction::ShuffleVector: {
7883 if (!S.isAltShuffle()) {
7884 // REVEC can support non alternate shuffle.
7885 if (SLPReVec && getShufflevectorNumGroups(VL))
7886 return TreeEntry::Vectorize;
7887 // If this is not an alternate sequence of opcodes like add-sub,
7888 // then do not vectorize this instruction.
7889 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7890 return TreeEntry::NeedToGather;
7891 }
7892 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7893 LLVM_DEBUG(
7894 dbgs()
7895 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7896 "the whole alt sequence is not profitable.\n");
7897 return TreeEntry::NeedToGather;
7898 }
7899
7900 return TreeEntry::Vectorize;
7901 }
7902 default:
7903 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7904 return TreeEntry::NeedToGather;
7905 }
7906}
7907
7908namespace {
7909/// Correctly handles operands of PHI nodes, based on the \p Main PHINode's
7910/// order of incoming basic blocks/values.
7911class PHIHandler {
7912 DominatorTree &DT;
7913 PHINode *Main = nullptr;
7914 SmallVector<Value *> Phis;
7915 SmallVector<SmallVector<Value *>> Operands;
7916
7917public:
7918 PHIHandler() = delete;
7919 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7920 : DT(DT), Main(Main), Phis(Phis),
7921 Operands(Main->getNumIncomingValues(),
7922 SmallVector<Value *>(Phis.size(), nullptr)) {}
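 // Fills Operands so that Operands[I] holds, for every PHI in Phis, the
 // incoming value that corresponds to Main's I-th incoming block. A direct
 // per-block lookup is used for up to FastLimit incoming values; larger PHIs
 // go through a block-to-index map instead.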
7923 void buildOperands() {
7924 constexpr unsigned FastLimit = 4;
7925 if (Main->getNumIncomingValues() <= FastLimit) {
7926 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7927 BasicBlock *InBB = Main->getIncomingBlock(I);
7928 if (!DT.isReachableFromEntry(InBB)) {
7929 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7930 continue;
7931 }
7932 // Prepare the operand vector.
7933 for (auto [Idx, V] : enumerate(Phis)) {
7934 auto *P = dyn_cast<PHINode>(V);
7935 if (!P) {
7936 assert(isa<PoisonValue>(V) &&
7937 "Expected isa instruction or poison value.");
7938 Operands[I][Idx] = V;
7939 continue;
7940 }
7941 if (P->getIncomingBlock(I) == InBB)
7942 Operands[I][Idx] = P->getIncomingValue(I);
7943 else
7944 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
7945 }
7946 }
7947 return;
7948 }
7949 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
7950 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7951 BasicBlock *InBB = Main->getIncomingBlock(I);
7952 if (!DT.isReachableFromEntry(InBB)) {
7953 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7954 continue;
7955 }
7956 Blocks.try_emplace(InBB).first->second.push_back(I);
7957 }
7958 for (auto [Idx, V] : enumerate(Phis)) {
7959 if (isa<PoisonValue>(V)) {
7960 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
7961 Operands[I][Idx] = V;
7962 continue;
7963 }
7964 auto *P = cast<PHINode>(V);
7965 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
7966 BasicBlock *InBB = P->getIncomingBlock(I);
7967 if (InBB == Main->getIncomingBlock(I)) {
7968 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
7969 continue;
7970 Operands[I][Idx] = P->getIncomingValue(I);
7971 continue;
7972 }
7973 auto It = Blocks.find(InBB);
7974 if (It == Blocks.end())
7975 continue;
7976 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
7977 }
7978 }
7979 for (const auto &P : Blocks) {
7980 if (P.getSecond().size() <= 1)
7981 continue;
7982 unsigned BasicI = P.getSecond().front();
7983 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
7984 assert(all_of(enumerate(Operands[I]),
7985 [&](const auto &Data) {
7986 return !Data.value() ||
7987 Data.value() == Operands[BasicI][Data.index()];
7988 }) &&
7989 "Expected empty operands list.");
7990 Operands[I] = Operands[BasicI];
7991 }
7992 }
7993 }
7994 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
7995};
7996} // namespace
7997
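/// Recursively builds the vectorization tree for the bundle \p VL at the given
/// \p Depth: the bundle is deduplicated, checked for schedulability and
/// per-opcode legality, a tree entry is created, and buildTree_rec is invoked
/// on the operand bundles of vectorizable entries.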
7998void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7999 const EdgeInfo &UserTreeIdx,
8000 unsigned InterleaveFactor) {
8001 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8002
8003 SmallVector<int> ReuseShuffleIndices;
8004 SmallVector<Value *> UniqueValues;
8005 SmallVector<Value *> NonUniqueValueVL;
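 // The lambda below deduplicates VL into UniqueValues and records the reuse
 // mask in ReuseShuffleIndices; when the bundle cannot be handled with reused
 // scalars it creates a gather tree entry and returns false.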
8006 auto TryToFindDuplicates = [&](const InstructionsState &S,
8007 bool DoNotFail = false) {
8008 // Check that every instruction appears once in this bundle.
8009 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8010 for (Value *V : VL) {
8011 if (isConstant(V)) {
8012 ReuseShuffleIndices.emplace_back(
8013 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8014 UniqueValues.emplace_back(V);
8015 continue;
8016 }
8017 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8018 ReuseShuffleIndices.emplace_back(Res.first->second);
8019 if (Res.second)
8020 UniqueValues.emplace_back(V);
8021 }
8022 size_t NumUniqueScalarValues = UniqueValues.size();
8023 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8024 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8025 if (NumUniqueScalarValues == VL.size() &&
8026 (VectorizeNonPowerOf2 || IsFullVectors)) {
8027 ReuseShuffleIndices.clear();
8028 } else {
8029 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8030 if ((UserTreeIdx.UserTE &&
8031 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8032 !has_single_bit(VL.size())) {
8033 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8034 "for nodes with padding.\n");
8035 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8036 return false;
8037 }
8038 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8039 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8040 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8041 return isa<UndefValue>(V) || !isConstant(V);
8042 }))) {
8043 if (DoNotFail && UniquePositions.size() > 1 &&
8044 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8045 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8046 // Find the number of elements, which forms full vectors.
8047 unsigned PWSz = getFullVectorNumberOfElements(
8048 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8049 if (PWSz == VL.size()) {
8050 ReuseShuffleIndices.clear();
8051 } else {
8052 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8053 NonUniqueValueVL.append(
8054 PWSz - UniqueValues.size(),
8055 PoisonValue::get(UniqueValues.front()->getType()));
8056 VL = NonUniqueValueVL;
8057 }
8058 return true;
8059 }
8060 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8061 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8062 return false;
8063 }
8064 VL = UniqueValues;
8065 }
8066 return true;
8067 };
8068
8069 InstructionsState S = getSameOpcode(VL, *TLI);
8070
8071 // Don't go into catchswitch blocks, which can happen with PHIs.
8072 // Such blocks can only have PHIs and the catchswitch. There is no
8073 // place to insert a shuffle if we need to, so just avoid that issue.
8074 if (S.getMainOp() &&
8075 isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8076 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8077 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8078 return;
8079 }
8080
8081 // Check if this is a duplicate of another entry.
8082 if (S.getOpcode()) {
8083 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8084 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8085 << ".\n");
8086 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8087 auto It = MultiNodeScalars.find(S.getMainOp());
8088 if (It != MultiNodeScalars.end()) {
8089 auto *TEIt = find_if(It->getSecond(),
8090 [&](TreeEntry *ME) { return ME->isSame(VL); });
8091 if (TEIt != It->getSecond().end())
8092 E = *TEIt;
8093 else
8094 E = nullptr;
8095 } else {
8096 E = nullptr;
8097 }
8098 }
8099 if (!E) {
8100 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8101 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8102 if (TryToFindDuplicates(S))
8103 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8104 ReuseShuffleIndices);
8105 return;
8106 }
8107 SmallPtrSet<const TreeEntry *, 4> Nodes;
8108 Nodes.insert(getTreeEntry(S.getMainOp()));
8109 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8110 Nodes.insert(E);
8111 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8112 if (any_of(Nodes, [&](const TreeEntry *E) {
8113 if (all_of(E->Scalars,
8114 [&](Value *V) { return Values.contains(V); }))
8115 return true;
8116 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8117 E->Scalars.end());
8118 return (
8119 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8120 })) {
8121 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8122 if (TryToFindDuplicates(S))
8123 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8124 ReuseShuffleIndices);
8125 return;
8126 }
8127 } else {
8128 // Record the reuse of the tree node. FIXME, currently this is only
8129 // used to properly draw the graph rather than for the actual
8130 // vectorization.
8131 E->UserTreeIndices.push_back(UserTreeIdx);
8132 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8133 << ".\n");
8134 return;
8135 }
8136 }
8137 }
8138
8139 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8140 // a load), in which case peek through to include it in the tree, without
8141 // ballooning over-budget.
8142 if (Depth >= RecursionMaxDepth &&
8143 !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
8144 (match(S.getMainOp(), m_Load(m_Value())) ||
8145 all_of(VL, [&S](const Value *I) {
8146 return match(I,
8147 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8148 cast<Instruction>(I)->getOpcode() ==
8149 S.getMainOp()->getOpcode();
8150 })))) {
8151 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8152 if (TryToFindDuplicates(S))
8153 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8154 ReuseShuffleIndices);
8155 return;
8156 }
8157
8158 // Don't handle scalable vectors
8159 if (S.getOpcode() == Instruction::ExtractElement &&
8160 isa<ScalableVectorType>(
8161 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8162 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8163 if (TryToFindDuplicates(S))
8164 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8165 ReuseShuffleIndices);
8166 return;
8167 }
8168
8169 // Don't handle vectors.
8170 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8171 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8172 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8173 return;
8174 }
8175
8176 // If all of the operands are identical or constant we have a simple solution.
8177 // If we deal with insert/extract instructions, they all must have constant
8178 // indices, otherwise we should gather them, not try to vectorize.
8179 // If alternate op node with 2 elements with gathered operands - do not
8180 // vectorize.
8181 auto &&NotProfitableForVectorization = [&S, this,
8182 Depth](ArrayRef<Value *> VL) {
8183 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
8184 return false;
8185 if (VectorizableTree.size() < MinTreeSize)
8186 return false;
8187 if (Depth >= RecursionMaxDepth - 1)
8188 return true;
8189 // Check if all operands are extracts, part of vector node or can build a
8190 // regular vectorize node.
8191 SmallVector<unsigned, 8> InstsCount;
8192 for (Value *V : VL) {
8193 auto *I = cast<Instruction>(V);
8194 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8195 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8196 }));
8197 }
8198 bool IsCommutative =
8199 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8200 if ((IsCommutative &&
8201 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8202 (!IsCommutative &&
8203 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8204 return true;
8205 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8206 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8207 auto *I1 = cast<Instruction>(VL.front());
8208 auto *I2 = cast<Instruction>(VL.back());
8209 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8210 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8211 I2->getOperand(Op));
8212 if (static_cast<unsigned>(count_if(
8213 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8214 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8215 })) >= S.getMainOp()->getNumOperands() / 2)
8216 return false;
8217 if (S.getMainOp()->getNumOperands() > 2)
8218 return true;
8219 if (IsCommutative) {
8220 // Check permuted operands.
8221 Candidates.clear();
8222 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8223 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8224 I2->getOperand((Op + 1) % E));
8225 if (any_of(
8226 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8227 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8228 }))
8229 return false;
8230 }
8231 return true;
8232 };
8233 SmallVector<unsigned> SortedIndices;
8234 BasicBlock *BB = nullptr;
8235 bool IsScatterVectorizeUserTE =
8236 UserTreeIdx.UserTE &&
8237 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8238 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
8239 bool AreScatterAllGEPSameBlock =
8240 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8241 VL.size() > 2 &&
8242 all_of(VL,
8243 [&BB](Value *V) {
8244 auto *I = dyn_cast<GetElementPtrInst>(V);
8245 if (!I)
8246 return doesNotNeedToBeScheduled(V);
8247 if (!BB)
8248 BB = I->getParent();
8249 return BB == I->getParent() && I->getNumOperands() == 2;
8250 }) &&
8251 BB &&
8252 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8253 SortedIndices));
8254 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8255 if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
8256 (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8257 S.getMainOp()) &&
8258 !all_of(VL, isVectorLikeInstWithConstOps)) ||
8259 NotProfitableForVectorization(VL)) {
8260 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8261 if (TryToFindDuplicates(S))
8262 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8263 ReuseShuffleIndices);
8264 return;
8265 }
8266
8267 // Don't vectorize ephemeral values.
8268 if (S.getOpcode() && !EphValues.empty()) {
8269 for (Value *V : VL) {
8270 if (EphValues.count(V)) {
8271 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8272 << ") is ephemeral.\n");
8273 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8274 return;
8275 }
8276 }
8277 }
8278
8279 // We now know that this is a vector of instructions of the same type from
8280 // the same block.
8281
8282 // Check that none of the instructions in the bundle are already in the tree.
8283 for (Value *V : VL) {
8284 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8285 doesNotNeedToBeScheduled(V))
8286 continue;
8287 if (getTreeEntry(V)) {
8288 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8289 << ") is already in tree.\n");
8290 if (TryToFindDuplicates(S))
8291 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8292 ReuseShuffleIndices);
8293 return;
8294 }
8295 }
8296
8297 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8298 if (UserIgnoreList && !UserIgnoreList->empty()) {
8299 for (Value *V : VL) {
8300 if (UserIgnoreList->contains(V)) {
8301 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8302 if (TryToFindDuplicates(S))
8303 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8304 ReuseShuffleIndices);
8305 return;
8306 }
8307 }
8308 }
8309
8310 // Special processing for sorted pointers for ScatterVectorize node with
8311 // constant indices only.
8312 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8313 assert(VL.front()->getType()->isPointerTy() &&
8314 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8315 "Expected pointers only.");
8316 // Reset S to make it GetElementPtr kind of node.
8317 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8318 assert(It != VL.end() && "Expected at least one GEP.");
8319 S = getSameOpcode(*It, *TLI);
8320 }
8321
8322 // Check that all of the users of the scalars that we want to vectorize are
8323 // schedulable.
8324 Instruction *VL0 = S.getMainOp();
8325 BB = VL0->getParent();
8326
8327 if (S.getMainOp() &&
8328 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8329 !DT->isReachableFromEntry(BB))) {
8330 // Don't go into unreachable blocks. They may contain instructions with
8331 // dependency cycles which confuse the final scheduling.
8332 // Do not vectorize EH and non-returning blocks, not profitable in most
8333 // cases.
8334 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8335 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8336 return;
8337 }
8338
8339 // Check that every instruction appears once in this bundle.
8340 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8341 return;
8342
8343 // Perform specific checks for each particular instruction kind.
8344 OrdersType CurrentOrder;
8345 SmallVector<Value *> PointerOps;
8346 TreeEntry::EntryState State = getScalarsVectorizationState(
8347 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8348 if (State == TreeEntry::NeedToGather) {
8349 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8350 ReuseShuffleIndices);
8351 return;
8352 }
8353
8354 auto &BSRef = BlocksSchedules[BB];
8355 if (!BSRef)
8356 BSRef = std::make_unique<BlockScheduling>(BB);
8357
8358 BlockScheduling &BS = *BSRef;
8359
8360 std::optional<ScheduleData *> Bundle =
8361 BS.tryScheduleBundle(UniqueValues, this, S);
8362#ifdef EXPENSIVE_CHECKS
8363 // Make sure we didn't break any internal invariants
8364 BS.verify();
8365#endif
8366 if (!Bundle) {
8367 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8368 assert((!BS.getScheduleData(VL0) ||
8369 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8370 "tryScheduleBundle should cancelScheduling on failure");
8371 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8372 ReuseShuffleIndices);
8373 NonScheduledFirst.insert(VL.front());
8374 if (S.getOpcode() == Instruction::Load &&
8375 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8376 registerNonVectorizableLoads(VL);
8377 return;
8378 }
8379 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8380
8381 unsigned ShuffleOrOp = S.isAltShuffle() ?
8382 (unsigned) Instruction::ShuffleVector : S.getOpcode();
8383 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8384 // Postpone PHI nodes creation
8385 SmallVector<unsigned> PHIOps;
8386 for (unsigned I : seq<unsigned>(Operands.size())) {
8387 ArrayRef<Value *> Op = Operands[I];
8388 if (Op.empty())
8389 continue;
8390 InstructionsState S = getSameOpcode(Op, *TLI);
8391 if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
8392 buildTree_rec(Op, Depth + 1, {TE, I});
8393 else
8394 PHIOps.push_back(I);
8395 }
8396 for (unsigned I : PHIOps)
8397 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8398 };
8399 switch (ShuffleOrOp) {
8400 case Instruction::PHI: {
8401 auto *PH = cast<PHINode>(VL0);
8402
8403 TreeEntry *TE =
8404 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8405 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8406 TE->dump());
8407
8408 // Keeps the reordered operands to avoid code duplication.
8409 PHIHandler Handler(*DT, PH, VL);
8410 Handler.buildOperands();
8411 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8412 TE->setOperand(I, Handler.getOperands(I));
8413 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8414 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8415 Operands[I] = Handler.getOperands(I);
8416 CreateOperandNodes(TE, Operands);
8417 return;
8418 }
8419 case Instruction::ExtractValue:
8420 case Instruction::ExtractElement: {
8421 if (CurrentOrder.empty()) {
8422 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8423 } else {
8424 LLVM_DEBUG({
8425 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8426 "with order";
8427 for (unsigned Idx : CurrentOrder)
8428 dbgs() << " " << Idx;
8429 dbgs() << "\n";
8430 });
8431 fixupOrderingIndices(CurrentOrder);
8432 }
8433 // Insert new order with initial value 0, if it does not exist,
8434 // otherwise return the iterator to the existing one.
8435 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8436 ReuseShuffleIndices, CurrentOrder);
8437 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8438 "(ExtractValueInst/ExtractElementInst).\n";
8439 TE->dump());
8440 // This is a special case, as it does not gather, but at the same time
8441 // we are not extending buildTree_rec() towards the operands.
8442 TE->setOperand(*this);
8443 return;
8444 }
8445 case Instruction::InsertElement: {
8446 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8447
8448 auto OrdCompare = [](const std::pair<int, int> &P1,
8449 const std::pair<int, int> &P2) {
8450 return P1.first > P2.first;
8451 };
8452 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8453 decltype(OrdCompare)>
8454 Indices(OrdCompare);
8455 for (int I = 0, E = VL.size(); I < E; ++I) {
8456 unsigned Idx = *getElementIndex(VL[I]);
8457 Indices.emplace(Idx, I);
8458 }
8459 OrdersType CurrentOrder(VL.size(), VL.size());
8460 bool IsIdentity = true;
8461 for (int I = 0, E = VL.size(); I < E; ++I) {
8462 CurrentOrder[Indices.top().second] = I;
8463 IsIdentity &= Indices.top().second == I;
8464 Indices.pop();
8465 }
8466 if (IsIdentity)
8467 CurrentOrder.clear();
8468 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8469 {}, CurrentOrder);
8470 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8471 TE->dump());
8472
8473 TE->setOperand(*this);
8474 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8475 return;
8476 }
8477 case Instruction::Load: {
8478 // Check that a vectorized load would load the same memory as a scalar
8479 // load. For example, we don't want to vectorize loads that are smaller
8480 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8481 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8482 // from such a struct, we read/write packed bits disagreeing with the
8483 // unvectorized version.
8484 TreeEntry *TE = nullptr;
8485 fixupOrderingIndices(CurrentOrder);
8486 switch (State) {
8487 case TreeEntry::Vectorize:
8488 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8489 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8490 if (CurrentOrder.empty())
8491 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8492 TE->dump());
8493 else
8494 LLVM_DEBUG(dbgs()
8495 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8496 TE->dump());
8497 break;
8498 case TreeEntry::StridedVectorize:
8499 // Vectorizing non-consecutive loads as strided loads.
8500 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8501 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8502 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8503 TE->dump());
8504 break;
8505 case TreeEntry::ScatterVectorize:
8506 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8507 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8508 UserTreeIdx, ReuseShuffleIndices);
8509 LLVM_DEBUG(
8510 dbgs()
8511 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8512 TE->dump());
8513 break;
8514 case TreeEntry::CombinedVectorize:
8515 case TreeEntry::NeedToGather:
8516 llvm_unreachable("Unexpected loads state.");
8517 }
8518 TE->setOperand(*this);
8519 if (State == TreeEntry::ScatterVectorize)
8520 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8521 return;
8522 }
8523 case Instruction::ZExt:
8524 case Instruction::SExt:
8525 case Instruction::FPToUI:
8526 case Instruction::FPToSI:
8527 case Instruction::FPExt:
8528 case Instruction::PtrToInt:
8529 case Instruction::IntToPtr:
8530 case Instruction::SIToFP:
8531 case Instruction::UIToFP:
8532 case Instruction::Trunc:
8533 case Instruction::FPTrunc:
8534 case Instruction::BitCast: {
8535 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8536 std::make_pair(std::numeric_limits<unsigned>::min(),
8537 std::numeric_limits<unsigned>::max()));
8538 if (ShuffleOrOp == Instruction::ZExt ||
8539 ShuffleOrOp == Instruction::SExt) {
8540 CastMaxMinBWSizes = std::make_pair(
8541 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8542 PrevMaxBW),
8543 std::min<unsigned>(
8544 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8545 PrevMinBW));
8546 } else if (ShuffleOrOp == Instruction::Trunc) {
8547 CastMaxMinBWSizes = std::make_pair(
8548 std::max<unsigned>(
8549 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8550 PrevMaxBW),
8551 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8552 PrevMinBW));
8553 }
8554 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8555 ReuseShuffleIndices);
8556 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8557 TE->dump());
8558
8559 TE->setOperand(*this);
8560 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8561 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8562 if (ShuffleOrOp == Instruction::Trunc) {
8563 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8564 } else if (ShuffleOrOp == Instruction::SIToFP ||
8565 ShuffleOrOp == Instruction::UIToFP) {
8566 unsigned NumSignBits =
8567 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8568 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8569 APInt Mask = DB->getDemandedBits(OpI);
8570 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8571 }
8572 if (NumSignBits * 2 >=
8573 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8574 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8575 }
8576 return;
8577 }
8578 case Instruction::ICmp:
8579 case Instruction::FCmp: {
8580 // Check that all of the compares have the same predicate.
8581 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8582 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8583 ReuseShuffleIndices);
8584 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8585 TE->dump());
8586
8587 ValueList Left, Right;
8588 VLOperands Ops(VL, VL0, *this);
8589 if (cast<CmpInst>(VL0)->isCommutative()) {
8590 // Commutative predicate - collect + sort operands of the instructions
8591 // so that each side is more likely to have the same opcode.
8592 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8593 "Commutative Predicate mismatch");
8594 Ops.reorder();
8595 Left = Ops.getVL(0);
8596 Right = Ops.getVL(1);
8597 } else {
8598 // Collect operands - commute if it uses the swapped predicate.
8599 for (Value *V : VL) {
8600 if (isa<PoisonValue>(V)) {
8601 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8602 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8603 continue;
8604 }
8605 auto *Cmp = cast<CmpInst>(V);
8606 Value *LHS = Cmp->getOperand(0);
8607 Value *RHS = Cmp->getOperand(1);
8608 if (Cmp->getPredicate() != P0)
8609 std::swap(LHS, RHS);
8610 Left.push_back(LHS);
8611 Right.push_back(RHS);
8612 }
8613 }
8614 TE->setOperand(0, Left);
8615 TE->setOperand(1, Right);
8616 buildTree_rec(Left, Depth + 1, {TE, 0});
8617 buildTree_rec(Right, Depth + 1, {TE, 1});
8618 if (ShuffleOrOp == Instruction::ICmp) {
8619 unsigned NumSignBits0 =
8620 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8621 if (NumSignBits0 * 2 >=
8622 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8623 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8624 unsigned NumSignBits1 =
8625 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8626 if (NumSignBits1 * 2 >=
8627 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8628 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8629 }
8630 return;
8631 }
8632 case Instruction::Select:
8633 case Instruction::FNeg:
8634 case Instruction::Add:
8635 case Instruction::FAdd:
8636 case Instruction::Sub:
8637 case Instruction::FSub:
8638 case Instruction::Mul:
8639 case Instruction::FMul:
8640 case Instruction::UDiv:
8641 case Instruction::SDiv:
8642 case Instruction::FDiv:
8643 case Instruction::URem:
8644 case Instruction::SRem:
8645 case Instruction::FRem:
8646 case Instruction::Shl:
8647 case Instruction::LShr:
8648 case Instruction::AShr:
8649 case Instruction::And:
8650 case Instruction::Or:
8651 case Instruction::Xor:
8652 case Instruction::Freeze: {
8653 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8654 ReuseShuffleIndices);
8655 LLVM_DEBUG(
8656 dbgs() << "SLP: added a new TreeEntry "
8657 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8658 TE->dump());
8659
8660 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8661 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8662 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8663 return;
8664 }
8665 case Instruction::GetElementPtr: {
8666 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8667 ReuseShuffleIndices);
8668 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8669 TE->dump());
8670 SmallVector<ValueList, 2> Operands(2);
8671 // Prepare the operand vector for pointer operands.
8672 for (Value *V : VL) {
8673 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8674 if (!GEP) {
8675 Operands.front().push_back(V);
8676 continue;
8677 }
8678 Operands.front().push_back(GEP->getPointerOperand());
8679 }
8680 TE->setOperand(0, Operands.front());
8681 // Need to cast all indices to the same type before vectorization to
8682 // avoid crash.
8683 // Required to be able to find correct matches between different gather
8684 // nodes and reuse the vectorized values rather than trying to gather them
8685 // again.
8686 int IndexIdx = 1;
8687 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8688 Type *Ty = all_of(VL,
8689 [VL0Ty, IndexIdx](Value *V) {
8690 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8691 if (!GEP)
8692 return true;
8693 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8694 })
8695 ? VL0Ty
8696 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8697 ->getPointerOperandType()
8698 ->getScalarType());
8699 // Prepare the operand vector.
8700 for (Value *V : VL) {
8701 auto *I = dyn_cast<GetElementPtrInst>(V);
8702 if (!I) {
8703 Operands.back().push_back(
8704 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8705 continue;
8706 }
8707 auto *Op = I->getOperand(IndexIdx);
8708 auto *CI = dyn_cast<ConstantInt>(Op);
8709 if (!CI)
8710 Operands.back().push_back(Op);
8711 else
8712 Operands.back().push_back(ConstantFoldIntegerCast(
8713 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8714 }
8715 TE->setOperand(IndexIdx, Operands.back());
8716
8717 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8718 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8719 return;
8720 }
8721 case Instruction::Store: {
8722 bool Consecutive = CurrentOrder.empty();
8723 if (!Consecutive)
8724 fixupOrderingIndices(CurrentOrder);
8725 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8726 ReuseShuffleIndices, CurrentOrder);
8727 if (Consecutive)
8728 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8729 TE->dump());
8730 else
8731 LLVM_DEBUG(
8732 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8733 TE->dump());
8734 TE->setOperand(*this);
8735 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8736 return;
8737 }
8738 case Instruction::Call: {
8739 // Check if the calls are all to the same vectorizable intrinsic or
8740 // library function.
8741 CallInst *CI = cast<CallInst>(VL0);
8742 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8743
8744 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8745 ReuseShuffleIndices);
8746 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8747 TE->dump());
8748 TE->setOperand(*this, isCommutative(VL0));
8749 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8750 // For scalar operands there is no need to create an entry, since they do
8751 // not need to be vectorized.
8752 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
8753 continue;
8754 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8755 }
8756 return;
8757 }
8758 case Instruction::ShuffleVector: {
8759 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8760 ReuseShuffleIndices);
8761 if (S.isAltShuffle()) {
8762 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8763 TE->dump());
8764 } else {
8765 assert(SLPReVec && "Only supported by REVEC.");
8766 LLVM_DEBUG(
8767 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8768 TE->dump());
8769 }
8770
8771 // Reorder operands if reordering would enable vectorization.
8772 auto *CI = dyn_cast<CmpInst>(VL0);
8773 if (CI && any_of(VL, [](Value *V) {
8774 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8775 })) {
8776 auto *MainCI = cast<CmpInst>(S.getMainOp());
8777 auto *AltCI = cast<CmpInst>(S.getAltOp());
8778 CmpInst::Predicate MainP = MainCI->getPredicate();
8779 CmpInst::Predicate AltP = AltCI->getPredicate();
8780 assert(MainP != AltP &&
8781 "Expected different main/alternate predicates.");
8782 ValueList Left, Right;
8783 // Collect operands - commute if it uses the swapped predicate or
8784 // alternate operation.
8785 for (Value *V : VL) {
8786 if (isa<PoisonValue>(V)) {
8787 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8788 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8789 continue;
8790 }
8791 auto *Cmp = cast<CmpInst>(V);
8792 Value *LHS = Cmp->getOperand(0);
8793 Value *RHS = Cmp->getOperand(1);
8794
8795 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8796 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8797 std::swap(LHS, RHS);
8798 } else {
8799 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8800 std::swap(LHS, RHS);
8801 }
8802 Left.push_back(LHS);
8803 Right.push_back(RHS);
8804 }
8805 TE->setOperand(0, Left);
8806 TE->setOperand(1, Right);
8807 buildTree_rec(Left, Depth + 1, {TE, 0});
8808 buildTree_rec(Right, Depth + 1, {TE, 1});
8809 return;
8810 }
8811
8812 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8813 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8814 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8815 return;
8816 }
8817 default:
8818 break;
8819 }
8820 llvm_unreachable("Unexpected vectorization of the instructions.");
8821}
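/// Returns the number of scalar elements that the homogeneous aggregate or
/// vector type \p T maps to when treated as a flat vector of a valid element
/// type whose total store size fits the vector register limits and matches the
/// store size of \p T; returns 0 otherwise.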
8822
8823unsigned BoUpSLP::canMapToVector(Type *T) const {
8824 unsigned N = 1;
8825 Type *EltTy = T;
8826
8827 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8828 if (EltTy->isEmptyTy())
8829 return 0;
8830 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8831 // Check that struct is homogeneous.
8832 for (const auto *Ty : ST->elements())
8833 if (Ty != *ST->element_begin())
8834 return 0;
8835 N *= ST->getNumElements();
8836 EltTy = *ST->element_begin();
8837 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8838 N *= AT->getNumElements();
8839 EltTy = AT->getElementType();
8840 } else {
8841 auto *VT = cast<FixedVectorType>(EltTy);
8842 N *= VT->getNumElements();
8843 EltTy = VT->getElementType();
8844 }
8845 }
8846
8847 if (!isValidElementType(EltTy))
8848 return 0;
8849 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8850 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8851 VTSize != DL->getTypeStoreSizeInBits(T))
8852 return 0;
8853 return N;
8854}
8855
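/// Checks whether all extractelement/extractvalue instructions in \p VL read
/// from the same source vector (or vector-mappable aggregate) and computes the
/// extraction order in \p CurrentOrder; returns true when the extracts already
/// appear in identity order, so the source vector can be reused directly.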
8856bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
8857 SmallVectorImpl<unsigned> &CurrentOrder,
8858 bool ResizeAllowed) const {
8859 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8860 assert(It != VL.end() && "Expected at least one extract instruction.");
8861 auto *E0 = cast<Instruction>(*It);
8862 assert(
8863 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8864 "Invalid opcode");
8865 // Check if all of the extracts come from the same vector and from the
8866 // correct offset.
8867 Value *Vec = E0->getOperand(0);
8868
8869 CurrentOrder.clear();
8870
8871 // We have to extract from a vector/aggregate with the same number of elements.
8872 unsigned NElts;
8873 if (E0->getOpcode() == Instruction::ExtractValue) {
8874 NElts = canMapToVector(Vec->getType());
8875 if (!NElts)
8876 return false;
8877 // Check if load can be rewritten as load of vector.
8878 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8879 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8880 return false;
8881 } else {
8882 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8883 }
8884
8885 unsigned E = VL.size();
8886 if (!ResizeAllowed && NElts != E)
8887 return false;
8888 SmallVector<int> Indices(E, PoisonMaskElem);
8889 unsigned MinIdx = NElts, MaxIdx = 0;
8890 for (auto [I, V] : enumerate(VL)) {
8891 auto *Inst = dyn_cast<Instruction>(V);
8892 if (!Inst)
8893 continue;
8894 if (Inst->getOperand(0) != Vec)
8895 return false;
8896 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8897 if (isa<UndefValue>(EE->getIndexOperand()))
8898 continue;
8899 std::optional<unsigned> Idx = getExtractIndex(Inst);
8900 if (!Idx)
8901 return false;
8902 const unsigned ExtIdx = *Idx;
8903 if (ExtIdx >= NElts)
8904 continue;
8905 Indices[I] = ExtIdx;
8906 if (MinIdx > ExtIdx)
8907 MinIdx = ExtIdx;
8908 if (MaxIdx < ExtIdx)
8909 MaxIdx = ExtIdx;
8910 }
8911 if (MaxIdx - MinIdx + 1 > E)
8912 return false;
8913 if (MaxIdx + 1 <= E)
8914 MinIdx = 0;
8915
8916 // Check that all of the indices extract from the correct offset.
8917 bool ShouldKeepOrder = true;
8918 // Assign to all items the initial value E so we can check if the extract
8919 // instruction index was used already.
8920 // Also, later we can check that all the indices are used and we have a
8921 // consecutive access in the extract instructions, by checking that no
8922 // element of CurrentOrder still has the value E.
8923 CurrentOrder.assign(E, E);
8924 for (unsigned I = 0; I < E; ++I) {
8925 if (Indices[I] == PoisonMaskElem)
8926 continue;
8927 const unsigned ExtIdx = Indices[I] - MinIdx;
8928 if (CurrentOrder[ExtIdx] != E) {
8929 CurrentOrder.clear();
8930 return false;
8931 }
8932 ShouldKeepOrder &= ExtIdx == I;
8933 CurrentOrder[ExtIdx] = I;
8934 }
8935 if (ShouldKeepOrder)
8936 CurrentOrder.clear();
8937
8938 return ShouldKeepOrder;
8939}
8940
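/// Returns true if the scalar \p I does not need to be kept alive after
/// vectorization: either it has a single use and is listed in
/// \p VectorizedVals, or every user is a tree entry, a vector-like instruction
/// with constant operands, or an extractelement already known to be gathered.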
8941bool BoUpSLP::areAllUsersVectorized(
8942 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
8943 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
8944 all_of(I->users(), [this](User *U) {
8945 return ScalarToTreeEntry.contains(U) ||
8946 isVectorLikeInstWithConstOps(U) ||
8947 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8948 });
8949}
8950
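/// Computes the cost of executing the call \p CI as a vector intrinsic of type
/// \p VecTy and, when a matching vector library function is available, as a
/// vector library call; returns the pair {intrinsic cost, library cost}.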
8951static std::pair<InstructionCost, InstructionCost>
8952getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
8953 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8954 ArrayRef<Type *> ArgTys) {
8955 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8956
8957 // Calculate the cost of the scalar and vector calls.
8958 FastMathFlags FMF;
8959 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
8960 FMF = FPCI->getFastMathFlags();
8961 SmallVector<const Value *> Arguments(CI->args());
8962 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
8963 dyn_cast<IntrinsicInst>(CI));
8964 auto IntrinsicCost =
8965 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
8966
8967 auto Shape = VFShape::get(CI->getFunctionType(),
8968 ElementCount::getFixed(VecTy->getNumElements()),
8969 false /*HasGlobalPred*/);
8970 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
8971 auto LibCost = IntrinsicCost;
8972 if (!CI->isNoBuiltin() && VecFunc) {
8973 // Calculate the cost of the vector library call.
8974 // If the corresponding vector call is cheaper, return its cost.
8975 LibCost =
8976 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
8977 }
8978 return {IntrinsicCost, LibCost};
8979}
8980
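/// Builds the shuffle mask that blends the two vectorized halves of an
/// alternate-opcode node: lanes for which \p IsAltOp returns true select from
/// the alternate vector (index Sz + Idx), the remaining lanes select from the
/// main vector, honoring ReorderIndices and ReuseShuffleIndices. Optionally
/// collects the main and alternate scalars into \p OpScalars and \p AltScalars.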
8981void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8982 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
8983 SmallVectorImpl<Value *> *OpScalars,
8984 SmallVectorImpl<Value *> *AltScalars) const {
8985 unsigned Sz = Scalars.size();
8986 Mask.assign(Sz, PoisonMaskElem);
8987 SmallVector<int> OrderMask;
8988 if (!ReorderIndices.empty())
8989 inversePermutation(ReorderIndices, OrderMask);
8990 for (unsigned I = 0; I < Sz; ++I) {
8991 unsigned Idx = I;
8992 if (!ReorderIndices.empty())
8993 Idx = OrderMask[I];
8994 if (isa<PoisonValue>(Scalars[Idx]))
8995 continue;
8996 auto *OpInst = cast<Instruction>(Scalars[Idx]);
8997 if (IsAltOp(OpInst)) {
8998 Mask[I] = Sz + Idx;
8999 if (AltScalars)
9000 AltScalars->push_back(OpInst);
9001 } else {
9002 Mask[I] = Idx;
9003 if (OpScalars)
9004 OpScalars->push_back(OpInst);
9005 }
9006 }
9007 if (!ReuseShuffleIndices.empty()) {
9008 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9009 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9010 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9011 });
9012 Mask.swap(NewMask);
9013 }
9014}
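/// Returns true if \p I should be treated as the alternate operation (matching
/// \p AltOp) rather than the main operation \p MainOp of an alternate-opcode
/// bundle; for compares, swapped predicates are taken into account via \p TLI.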
9015
9016static bool isAlternateInstruction(const Instruction *I,
9017 const Instruction *MainOp,
9018 const Instruction *AltOp,
9019 const TargetLibraryInfo &TLI) {
9020 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9021 auto *AltCI = cast<CmpInst>(AltOp);
9022 CmpInst::Predicate MainP = MainCI->getPredicate();
9023 CmpInst::Predicate AltP = AltCI->getPredicate();
9024 assert(MainP != AltP && "Expected different main/alternate predicates.");
9025 auto *CI = cast<CmpInst>(I);
9026 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9027 return false;
9028 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9029 return true;
9030 CmpInst::Predicate P = CI->getPredicate();
9031 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9032
9033 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9034 "CmpInst expected to match either main or alternate predicate or "
9035 "their swap.");
9036 (void)AltP;
9037 return MainP != P && MainP != SwappedP;
9038 }
9039 return I->getOpcode() == AltOp->getOpcode();
9040}
9041
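/// Summarizes the operand bundle \p Ops for the TTI cost model: whether the
/// operands form a uniform and/or constant value, and whether constant
/// operands are powers of two or negated powers of two.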
9042TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9043 assert(!Ops.empty());
9044 const auto *Op0 = Ops.front();
9045
9046 const bool IsConstant = all_of(Ops, [](Value *V) {
9047 // TODO: We should allow undef elements here
9048 return isConstant(V) && !isa<UndefValue>(V);
9049 });
9050 const bool IsUniform = all_of(Ops, [=](Value *V) {
9051 // TODO: We should allow undef elements here
9052 return V == Op0;
9053 });
9054 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9055 // TODO: We should allow undef elements here
9056 if (auto *CI = dyn_cast<ConstantInt>(V))
9057 return CI->getValue().isPowerOf2();
9058 return false;
9059 });
9060 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9061 // TODO: We should allow undef elements here
9062 if (auto *CI = dyn_cast<ConstantInt>(V))
9063 return CI->getValue().isNegatedPowerOf2();
9064 return false;
9065 });
9066
9067 TTI::OperandValueKind VK = TTI::OK_AnyValue;
9068 if (IsConstant && IsUniform)
9069 VK = TTI::OK_UniformConstantValue;
9070 else if (IsConstant)
9071 VK = TTI::OK_NonUniformConstantValue;
9072 else if (IsUniform)
9073 VK = TTI::OK_UniformValue;
9074
9075 TTI::OperandValueProperties VP = TTI::OP_None;
9076 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9077 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9078
9079 return {VK, VP};
9080}
9081
9082namespace {
9083/// The base class for shuffle instruction emission and shuffle cost estimation.
9084class BaseShuffleAnalysis {
9085protected:
9086 Type *ScalarTy = nullptr;
9087
9088 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9089
9090 /// V is expected to be a vectorized value.
9091 /// When REVEC is disabled, there is no difference between VF and
9092 /// VNumElements.
9093 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9094 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9095 /// of 8.
9096 unsigned getVF(Value *V) const {
9097 assert(V && "V cannot be nullptr");
9098 assert(isa<FixedVectorType>(V->getType()) &&
9099 "V does not have FixedVectorType");
9100 assert(ScalarTy && "ScalarTy cannot be nullptr");
9101 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9102 unsigned VNumElements =
9103 cast<FixedVectorType>(V->getType())->getNumElements();
9104 assert(VNumElements > ScalarTyNumElements &&
9105 "the number of elements of V is not large enough");
9106 assert(VNumElements % ScalarTyNumElements == 0 &&
9107 "the number of elements of V is not a vectorized value");
9108 return VNumElements / ScalarTyNumElements;
9109 }
9110
9111 /// Checks if the mask is an identity mask.
9112 /// \param IsStrict if is true the function returns false if mask size does
9113 /// not match vector size.
9114 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9115 bool IsStrict) {
9116 int Limit = Mask.size();
9117 int VF = VecTy->getNumElements();
9118 int Index = -1;
9119 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9120 return true;
9121 if (!IsStrict) {
9122 // Consider extract subvector starting from index 0.
9123 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9124 Index == 0)
9125 return true;
9126 // All VF-size submasks are identity (e.g.
9127 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9128 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9129 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9130 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9131 ShuffleVectorInst::isIdentityMask(Slice, VF);
9132 }))
9133 return true;
9134 }
9135 return false;
9136 }
9137
9138 /// Tries to combine 2 different masks into single one.
9139 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9140 /// change the size of the vector, \p LocalVF is the original size of the
9141 /// shuffled vector.
9142 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9143 ArrayRef<int> ExtMask) {
9144 unsigned VF = Mask.size();
9145 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9146 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9147 if (ExtMask[I] == PoisonMaskElem)
9148 continue;
9149 int MaskedIdx = Mask[ExtMask[I] % VF];
9150 NewMask[I] =
9151 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9152 }
9153 Mask.swap(NewMask);
9154 }
9155
9156 /// Looks through shuffles trying to reduce the final number of shuffles in
9157 /// the code. The function looks through the previously emitted shuffle
9158 /// instructions and properly marks indices in the mask as undef.
9159 /// For example, given the code
9160 /// \code
9161 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9162 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9163 /// \endcode
9164 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9165 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9166 /// <0, 1, 2, 3> for the shuffle.
9167 /// If the 2 operands are of different sizes, the smaller one will be resized and
9168 /// the mask recalculated properly.
9169 /// For example, given the code
9170 /// \code
9171 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9172 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9173 /// \endcode
9174 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9175 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9176 /// <0, 1, 2, 3> for the shuffle.
9177 /// So, it tries to transform permutations to simple vector merge, if
9178 /// possible.
9179 /// \param V The input vector which must be shuffled using the given \p Mask.
9180 /// If the better candidate is found, \p V is set to this best candidate
9181 /// vector.
9182 /// \param Mask The input mask for the shuffle. If the best candidate is found
9183 /// during looking-through-shuffles attempt, it is updated accordingly.
9184 /// \param SinglePermute true if the shuffle operation is originally a
9185 /// single-value-permutation. In this case the look-through-shuffles procedure
9186 /// may look for resizing shuffles as the best candidates.
9187 /// \return true if the shuffle results in a non-resizing identity shuffle
9188 /// (and thus can be ignored), false otherwise.
9189 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9190 bool SinglePermute) {
9191 Value *Op = V;
9192 ShuffleVectorInst *IdentityOp = nullptr;
9193 SmallVector<int> IdentityMask;
9194 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9195 // Exit if not a fixed vector type or changing size shuffle.
9196 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9197 if (!SVTy)
9198 break;
9199 // Remember the identity or broadcast mask, if it is not a resizing
9200 // shuffle. If no better candidates are found, this Op and Mask will be
9201 // used in the final shuffle.
9202 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9203 if (!IdentityOp || !SinglePermute ||
9204 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9205 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9206 IdentityMask.size()))) {
9207 IdentityOp = SV;
9208 // Store the current mask in IdentityMask so that we do not lose this
9209 // info if IdentityOp is selected as the best candidate for the
9210 // permutation.
9211 IdentityMask.assign(Mask);
9212 }
9213 }
9214 // Remember the broadcast mask. If no better candidates are found, this Op
9215 // and Mask will be used in the final shuffle.
9216 // Zero splat can be used as identity too, since it might be used with
9217 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9218 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9219 // expensive, and the analysis finds out that the source vector is just a
9220 // broadcast, this original mask can be transformed to the identity mask <0,
9221 // 1, 2, 3>.
9222 // \code
9223 // %0 = shuffle %v, poison, zeroinitializer
9224 // %res = shuffle %0, poison, <3, 1, 2, 0>
9225 // \endcode
9226 // may be transformed to
9227 // \code
9228 // %0 = shuffle %v, poison, zeroinitializer
9229 // %res = shuffle %0, poison, <0, 1, 2, 3>
9230 // \endcode
9231 if (SV->isZeroEltSplat()) {
9232 IdentityOp = SV;
9233 IdentityMask.assign(Mask);
9234 }
9235 int LocalVF = Mask.size();
9236 if (auto *SVOpTy =
9237 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9238 LocalVF = SVOpTy->getNumElements();
9239 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9240 for (auto [Idx, I] : enumerate(Mask)) {
9241 if (I == PoisonMaskElem ||
9242 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9243 continue;
9244 ExtMask[Idx] = SV->getMaskValue(I);
9245 }
9246 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9247 SV->getOperand(0),
9248 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9249 .all();
9250 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9251 SV->getOperand(1),
9252 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9253 .all();
9254 if (!IsOp1Undef && !IsOp2Undef) {
9255 // Update mask and mark undef elems.
9256 for (int &I : Mask) {
9257 if (I == PoisonMaskElem)
9258 continue;
9259 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9260 PoisonMaskElem)
9261 I = PoisonMaskElem;
9262 }
9263 break;
9264 }
9265 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9266 combineMasks(LocalVF, ShuffleMask, Mask);
9267 Mask.swap(ShuffleMask);
9268 if (IsOp2Undef)
9269 Op = SV->getOperand(0);
9270 else
9271 Op = SV->getOperand(1);
9272 }
9273 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9274 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9275 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9276 if (IdentityOp) {
9277 V = IdentityOp;
9278 assert(Mask.size() == IdentityMask.size() &&
9279 "Expected masks of same sizes.");
9280 // Clear known poison elements.
9281 for (auto [I, Idx] : enumerate(Mask))
9282 if (Idx == PoisonMaskElem)
9283 IdentityMask[I] = PoisonMaskElem;
9284 Mask.swap(IdentityMask);
9285 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9286 return SinglePermute &&
9287 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9288 /*IsStrict=*/true) ||
9289 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9290 Shuffle->isZeroEltSplat() &&
9291 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9292 }
9293 V = Op;
9294 return false;
9295 }
9296 V = Op;
9297 return true;
9298 }
9299
9300 /// Smart shuffle instruction emission, walks through shuffle trees and
9301 /// tries to find the best matching vector for the actual shuffle
9302 /// instruction.
9303 template <typename T, typename ShuffleBuilderTy>
9304 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9305 ShuffleBuilderTy &Builder) {
9306 assert(V1 && "Expected at least one vector value.");
9307 if (V2)
9308 Builder.resizeToMatch(V1, V2);
9309 int VF = Mask.size();
9310 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9311 VF = FTy->getNumElements();
9312 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9313 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9314 .all()) {
9315 // Peek through shuffles.
9316 Value *Op1 = V1;
9317 Value *Op2 = V2;
9318 int VF =
9319 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9320 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9321 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9322 for (int I = 0, E = Mask.size(); I < E; ++I) {
9323 if (Mask[I] < VF)
9324 CombinedMask1[I] = Mask[I];
9325 else
9326 CombinedMask2[I] = Mask[I] - VF;
9327 }
9328 Value *PrevOp1;
9329 Value *PrevOp2;
9330 do {
9331 PrevOp1 = Op1;
9332 PrevOp2 = Op2;
9333 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9334 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9335 // Check if we have 2 resizing shuffles - need to peek through operands
9336 // again.
9337 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9338 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9339 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9340 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9341 if (I == PoisonMaskElem)
9342 continue;
9343 ExtMask1[Idx] = SV1->getMaskValue(I);
9344 }
9345 SmallBitVector UseMask1 = buildUseMask(
9346 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9347 ->getNumElements(),
9348 ExtMask1, UseMask::SecondArg);
9349 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9350 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9351 if (I == PoisonMaskElem)
9352 continue;
9353 ExtMask2[Idx] = SV2->getMaskValue(I);
9354 }
9355 SmallBitVector UseMask2 = buildUseMask(
9356 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9357 ->getNumElements(),
9358 ExtMask2, UseMask::SecondArg);
9359 if (SV1->getOperand(0)->getType() ==
9360 SV2->getOperand(0)->getType() &&
9361 SV1->getOperand(0)->getType() != SV1->getType() &&
9362 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9363 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9364 Op1 = SV1->getOperand(0);
9365 Op2 = SV2->getOperand(0);
9366 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9367 int LocalVF = ShuffleMask1.size();
9368 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9369 LocalVF = FTy->getNumElements();
9370 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9371 CombinedMask1.swap(ShuffleMask1);
9372 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9373 LocalVF = ShuffleMask2.size();
9374 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9375 LocalVF = FTy->getNumElements();
9376 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9377 CombinedMask2.swap(ShuffleMask2);
9378 }
9379 }
9380 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9381 Builder.resizeToMatch(Op1, Op2);
9382 VF = std::max(cast<VectorType>(Op1->getType())
9383 ->getElementCount()
9384 .getKnownMinValue(),
9385 cast<VectorType>(Op2->getType())
9386 ->getElementCount()
9387 .getKnownMinValue());
9388 for (int I = 0, E = Mask.size(); I < E; ++I) {
9389 if (CombinedMask2[I] != PoisonMaskElem) {
9390 assert(CombinedMask1[I] == PoisonMaskElem &&
9391 "Expected undefined mask element");
9392 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9393 }
9394 }
9395 if (Op1 == Op2 &&
9396 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9397 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9398 isa<ShuffleVectorInst>(Op1) &&
9399 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9400 ArrayRef(CombinedMask1))))
9401 return Builder.createIdentity(Op1);
9402 return Builder.createShuffleVector(
9403 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9404 CombinedMask1);
9405 }
9406 if (isa<PoisonValue>(V1))
9407 return Builder.createPoison(
9408 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9409 SmallVector<int> NewMask(Mask);
9410 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9411 assert(V1 && "Expected non-null value after looking through shuffles.");
9412
9413 if (!IsIdentity)
9414 return Builder.createShuffleVector(V1, NewMask);
9415 return Builder.createIdentity(V1);
9416 }
9417};
9418} // namespace
9419
9420/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9421static std::pair<InstructionCost, InstructionCost>
9422 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9423 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9424 Type *ScalarTy, VectorType *VecTy) {
9425 InstructionCost ScalarCost = 0;
9426 InstructionCost VecCost = 0;
9427 // Here we differentiate two cases: (1) when Ptrs represent a regular
9428 // vectorization tree node (as they are pointer arguments of scattered
9429 // loads) or (2) when Ptrs are the arguments of loads or stores being
9430 // vectorized as plain wide unit-stride load/store since all the
9431 // loads/stores are known to be from/to adjacent locations.
9432 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9433 // Case 2: estimate pointer-related costs when vectorizing to
9434 // a wide load/store.
9435 // Scalar cost is estimated as a set of pointers with known relationship
9436 // between them.
9437 // For vector code we will use BasePtr as argument for the wide load/store
9438 // but we also need to account all the instructions which are going to
9439 // stay in vectorized code due to uses outside of these scalar
9440 // loads/stores.
9441 ScalarCost = TTI.getPointersChainCost(
9442 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9443 CostKind);
9444
9445 SmallVector<const Value *> PtrsRetainedInVecCode;
9446 for (Value *V : Ptrs) {
9447 if (V == BasePtr) {
9448 PtrsRetainedInVecCode.push_back(V);
9449 continue;
9450 }
9451 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9452 // For simplicity, assume Ptr stays in vectorized code if it's not a
9453 // GEP instruction. We don't care since its cost is considered free.
9454 // TODO: We should check for any uses outside of vectorizable tree
9455 // rather than just single use.
9456 if (!Ptr || !Ptr->hasOneUse())
9457 PtrsRetainedInVecCode.push_back(V);
9458 }
9459
9460 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9461 // If all pointers stay in vectorized code then we don't have
9462 // any savings on that.
9463 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9464 }
9465 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9466 TTI::PointersChainInfo::getKnownStride(),
9467 VecTy, CostKind);
9468 } else {
9469 // Case 1: Ptrs are the arguments of loads that we are going to transform
9470 // into masked gather load intrinsic.
9471 // All the scalar GEPs will be removed as a result of vectorization.
9472 // For any external uses of some lanes extract element instructions will
9473 // be generated (which cost is estimated separately).
9474 TTI::PointersChainInfo PtrsInfo =
9475 all_of(Ptrs,
9476 [](const Value *V) {
9477 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9478 return Ptr && !Ptr->hasAllConstantIndices();
9479 })
9480 ? TTI::PointersChainInfo::getUnknownStride()
9481 : TTI::PointersChainInfo::getKnownStride();
9482
9483 ScalarCost =
9484 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9485 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9486 if (!BaseGEP) {
9487 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9488 if (It != Ptrs.end())
9489 BaseGEP = cast<GEPOperator>(*It);
9490 }
9491 if (BaseGEP) {
9492 SmallVector<const Value *> Indices(BaseGEP->indices());
9493 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9494 BaseGEP->getPointerOperand(), Indices, VecTy,
9495 CostKind);
9496 }
9497 }
9498
9499 return std::make_pair(ScalarCost, VecCost);
9500}
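// Illustrative note: for four consecutive loads from p, p+1, p+2, p+3
// vectorized as one wide load (case 2), the scalar cost covers the whole
// unit-stride pointer chain, while the vector cost only keeps the GEPs that
// still have users outside the vectorized tree; for a masked-gather candidate
// (case 1) all scalar GEPs are expected to disappear and roughly only the
// cost of forming the gather's base pointer remains.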
9501
9502void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9503 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9504 "Expected gather node without reordering.");
9506 SmallSet<size_t, 2> LoadKeyUsed;
9507
9508 // Do not reorder nodes if they are small (just 2 elements), all-constant, or
9509 // if all instructions already have the same opcode.
9510 if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
9511 all_of(TE.Scalars, isConstant))
9512 return;
9513
9514 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9515 return VectorizableTree[Idx]->isSame(TE.Scalars);
9516 }))
9517 return;
9518
9519 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9520 Key = hash_combine(hash_value(LI->getParent()), Key);
9521 Value *Ptr =
9522 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9523 if (LoadKeyUsed.contains(Key)) {
9524 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9525 if (LIt != LoadsMap.end()) {
9526 for (LoadInst *RLI : LIt->second) {
9527 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9528 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9529 /*StrictCheck=*/true))
9530 return hash_value(RLI->getPointerOperand());
9531 }
9532 for (LoadInst *RLI : LIt->second) {
9533 if (arePointersCompatible(RLI->getPointerOperand(),
9534 LI->getPointerOperand(), *TLI)) {
9535 hash_code SubKey = hash_value(RLI->getPointerOperand());
9536 return SubKey;
9537 }
9538 }
9539 if (LIt->second.size() > 2) {
9540 hash_code SubKey =
9541 hash_value(LIt->second.back()->getPointerOperand());
9542 return SubKey;
9543 }
9544 }
9545 }
9546 LoadKeyUsed.insert(Key);
9547 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9548 return hash_value(LI->getPointerOperand());
9549 };
9552 bool IsOrdered = true;
9553 unsigned NumInstructions = 0;
9554 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9555 // nodes.
9556 for (auto [I, V] : enumerate(TE.Scalars)) {
9557 size_t Key = 1, Idx = 1;
9558 if (auto *Inst = dyn_cast<Instruction>(V);
9559 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9560 !isDeleted(Inst) && !isVectorized(V)) {
9561 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9562 /*AllowAlternate=*/false);
9563 ++NumInstructions;
9564 }
9565 auto &Container = SortedValues[Key];
9566 if (IsOrdered && !KeyToIndex.contains(V) &&
9567 !(isa<Constant, ExtractElementInst>(V) ||
9569 ((Container.contains(Idx) &&
9570 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9571 (!Container.empty() && !Container.contains(Idx) &&
9572 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9573 IsOrdered = false;
9574 auto &KTI = KeyToIndex[V];
9575 if (KTI.empty())
9576 Container[Idx].push_back(V);
9577 KTI.push_back(I);
9578 }
9579 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9580 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9581 if (!IsOrdered && NumInstructions > 1) {
9582 unsigned Cnt = 0;
9583 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9584 for (const auto &D : SortedValues) {
9585 for (const auto &P : D.second) {
9586 unsigned Sz = 0;
9587 for (Value *V : P.second) {
9588 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9589 for (auto [K, Idx] : enumerate(Indices)) {
9590 TE.ReorderIndices[Cnt + K] = Idx;
9591 TE.Scalars[Cnt + K] = V;
9592 }
9593 Sz += Indices.size();
9594 Cnt += Indices.size();
9595 }
9596 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9597 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9598 *TTI, TE.Scalars.front()->getType(), Sz);
9599 SubVectors.emplace_back(Cnt - Sz, SubVF);
9600 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9601 DemandedElts.clearBit(I);
9602 } else if (!P.second.empty() && isConstant(P.second.front())) {
9603 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9604 DemandedElts.clearBit(I);
9605 }
9606 }
9607 }
9608 }
9609 // Reuses always require shuffles, so consider it as profitable.
9610 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9611 return;
9612 // Do simple cost estimation.
9613 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9614 InstructionCost Cost = 0;
9615 auto *ScalarTy = TE.Scalars.front()->getType();
9616 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9617 for (auto [Idx, Sz] : SubVectors) {
9618 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9619 Idx, getWidenedType(ScalarTy, Sz));
9620 }
9621 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9622 assert(SLPReVec && "Only supported by REVEC.");
9623 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9624 // of CreateInsertElement.
9625 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9626 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9627 if (DemandedElts[I])
9628 Cost +=
9629 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9630 CostKind, I * ScalarTyNumElements, FTy);
9631 } else {
9632 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9633 /*Extract=*/false, CostKind);
9634 }
9635 int Sz = TE.Scalars.size();
9636 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9637 TE.ReorderIndices.end());
9638 for (unsigned I : seq<unsigned>(Sz)) {
9639 Value *V = TE.getOrdered(I);
9640 if (isa<PoisonValue>(V)) {
9641 ReorderMask[I] = PoisonMaskElem;
9642 } else if (isConstant(V) || DemandedElts[I]) {
9643 ReorderMask[I] = I + TE.ReorderIndices.size();
9644 }
9645 }
9646 InstructionCost BVCost = ::getShuffleCost(*TTI,
9647 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9648 ? TTI::SK_PermuteTwoSrc
9649 : TTI::SK_PermuteSingleSrc,
9650 VecTy, ReorderMask);
9651 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9652 ReorderMask.assign(Sz, PoisonMaskElem);
9653 for (unsigned I : seq<unsigned>(Sz)) {
9654 Value *V = TE.getOrdered(I);
9655 if (isConstant(V)) {
9656 DemandedElts.clearBit(I);
9657 if (!isa<PoisonValue>(V))
9658 ReorderMask[I] = I;
9659 } else {
9660 ReorderMask[I] = I + Sz;
9661 }
9662 }
9663 BVCost += TTI->getScalarizationOverhead(
9664 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9665 if (!DemandedElts.isAllOnes())
9666 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9667 if (Cost >= BVCost) {
9668 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9669 reorderScalars(TE.Scalars, Mask);
9670 TE.ReorderIndices.clear();
9671 }
9672}
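// Illustrative note: for a gather node mixing loads, other instructions and
// constants, the clustering above places scalars with the same key (e.g.
// loads from the same underlying pointer) next to each other so they can
// form extra vectorizable sub-nodes; the new order is kept only if its
// estimated build cost is lower than the cost of building the vector in the
// original order.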
9673
9674 void BoUpSLP::transformNodes() {
9675 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9676 BaseGraphSize = VectorizableTree.size();
9677 // Turn graph transforming mode on, and turn it off when done.
9678 class GraphTransformModeRAAI {
9679 bool &SavedIsGraphTransformMode;
9680
9681 public:
9682 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9683 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9684 IsGraphTransformMode = true;
9685 }
9686 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9687 } TransformContext(IsGraphTransformMode);
9688 // Operands are profitable if they are:
9689 // 1. At least one constant
9690 // or
9691 // 2. Splats
9692 // or
9693 // 3. Results in good vectorization opportunity, i.e. may generate vector
9694 // nodes and reduce cost of the graph.
9695 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9696 const InstructionsState &S) {
9698 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9699 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9700 I2->getOperand(Op));
9701 return all_of(
9702 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9703 return all_of(Cand,
9704 [](const std::pair<Value *, Value *> &P) {
9705 return isa<Constant>(P.first) ||
9706 isa<Constant>(P.second) || P.first == P.second;
9707 }) ||
9709 });
9710 };
9711
9712 // Try to reorder gather nodes for better vectorization opportunities.
9713 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9714 TreeEntry &E = *VectorizableTree[Idx];
9715 if (E.isGather())
9716 reorderGatherNode(E);
9717 }
9718
9719 // The tree may grow here, so iterate over the nodes built before.
9720 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9721 TreeEntry &E = *VectorizableTree[Idx];
9722 if (E.isGather()) {
9723 ArrayRef<Value *> VL = E.Scalars;
9724 const unsigned Sz = getVectorElementSize(VL.front());
9725 unsigned MinVF = getMinVF(2 * Sz);
9726 // Do not try partial vectorization for small nodes (<= 2), nodes with the
9727 // same opcode and same parent block, or all-constant nodes.
9728 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9729 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9730 E.isAltShuffle() || !allSameBlock(VL)) ||
9731 allConstant(VL) || isSplat(VL))
9732 continue;
9733 // Try to find vectorizable sequences and transform them into a series of
9734 // insertvector instructions.
9735 unsigned StartIdx = 0;
9736 unsigned End = VL.size();
9737 for (unsigned VF = getFloorFullVectorNumberOfElements(
9738 *TTI, VL.front()->getType(), VL.size() - 1);
9739 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9740 *TTI, VL.front()->getType(), VF - 1)) {
9741 if (StartIdx + VF > End)
9742 continue;
9743 SmallVector<std::pair<unsigned, unsigned>> Slices;
9744 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9745 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9746 // If any instruction is vectorized already - do not try again.
9747 // Reuse the existing node, if it fully matches the slice.
9748 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9749 SE || getTreeEntry(Slice.back())) {
9750 if (!SE)
9751 continue;
9752 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9753 continue;
9754 }
9755 // Constant already handled effectively - skip.
9756 if (allConstant(Slice))
9757 continue;
9758 // Do not try to vectorize small splats (smaller than a vector register and
9759 // with only a single non-undef element).
9760 bool IsSplat = isSplat(Slice);
9761 if (Slices.empty() || !IsSplat ||
9762 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9763 Slice.front()->getType(), VF)),
9764 1U, VF - 1) !=
9766 Slice.front()->getType(), 2 * VF)),
9767 1U, 2 * VF)) ||
9768 count(Slice, Slice.front()) ==
9769 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9770 : 1)) {
9771 if (IsSplat)
9772 continue;
9773 InstructionsState S = getSameOpcode(Slice, *TLI);
9774 if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
9775 (S.getOpcode() == Instruction::Load &&
9777 (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9778 continue;
9779 if (VF == 2) {
9780 // Try to vectorize reduced values or if all users are vectorized.
9781 // For expensive instructions extra extracts might be profitable.
9782 if ((!UserIgnoreList || E.Idx != 0) &&
9783 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9784 TTI::TCC_Expensive &&
9785 !all_of(Slice, [&](Value *V) {
9786 if (isa<PoisonValue>(V))
9787 return true;
9788 return areAllUsersVectorized(cast<Instruction>(V),
9789 UserIgnoreList);
9790 }))
9791 continue;
9792 if (S.getOpcode() == Instruction::Load) {
9793 OrdersType Order;
9794 SmallVector<Value *> PointerOps;
9795 LoadsState Res =
9796 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9797 // Do not vectorize gathers.
9798 if (Res == LoadsState::ScatterVectorize ||
9799 Res == LoadsState::Gather) {
9800 if (Res == LoadsState::Gather) {
9802 // If reductions and the scalars from the root node are
9803 // analyzed - mark as non-vectorizable reduction.
9804 if (UserIgnoreList && E.Idx == 0)
9805 analyzedReductionVals(Slice);
9806 }
9807 continue;
9808 }
9809 } else if (S.getOpcode() == Instruction::ExtractElement ||
9810 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9811 TTI::TCC_Expensive &&
9812 !CheckOperandsProfitability(
9813 S.getMainOp(),
9814 cast<Instruction>(*find_if(reverse(Slice),
9815 IsaPred<Instruction>)),
9816 S))) {
9817 // Do not vectorize extractelements (handled effectively
9818 // already). Do not vectorize non-profitable instructions (with
9819 // low cost and non-vectorizable operands).
9820 continue;
9821 }
9822 }
9823 }
9824 Slices.emplace_back(Cnt, Slice.size());
9825 }
9826 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9827 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9828 if (StartIdx == Cnt)
9829 StartIdx = Cnt + Sz;
9830 if (End == Cnt + Sz)
9831 End = Cnt;
9832 };
9833 for (auto [Cnt, Sz] : Slices) {
9834 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9835 // If any instruction is vectorized already - do not try again.
9836 if (TreeEntry *SE = getTreeEntry(Slice.front());
9837 SE || getTreeEntry(Slice.back())) {
9838 if (!SE)
9839 continue;
9840 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9841 continue;
9842 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9843 AddCombinedNode(SE->Idx, Cnt, Sz);
9844 continue;
9845 }
9846 unsigned PrevSize = VectorizableTree.size();
9847 [[maybe_unused]] unsigned PrevEntriesSize =
9848 LoadEntriesToVectorize.size();
9849 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9850 if (PrevSize + 1 == VectorizableTree.size() &&
9851 VectorizableTree[PrevSize]->isGather() &&
9852 VectorizableTree[PrevSize]->getOpcode() !=
9853 Instruction::ExtractElement &&
9854 !isSplat(Slice)) {
9855 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9856 analyzedReductionVals(Slice);
9857 VectorizableTree.pop_back();
9858 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9859 "LoadEntriesToVectorize expected to remain the same");
9860 continue;
9861 }
9862 AddCombinedNode(PrevSize, Cnt, Sz);
9863 }
9864 }
9865 // Restore ordering, if no extra vectorization happened.
9866 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9867 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9868 reorderScalars(E.Scalars, Mask);
9869 E.ReorderIndices.clear();
9870 }
9871 }
9872 switch (E.getOpcode()) {
9873 case Instruction::Load: {
9874 // No need to reorder masked gather loads, just reorder the scalar
9875 // operands.
9876 if (E.State != TreeEntry::Vectorize)
9877 break;
9878 Type *ScalarTy = E.getMainOp()->getType();
9879 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9880 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9881 // Check if profitable to represent consecutive load + reverse as strided
9882 // load with stride -1.
9883 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9884 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9885 SmallVector<int> Mask;
9886 inversePermutation(E.ReorderIndices, Mask);
9887 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9888 InstructionCost OriginalVecCost =
9889 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9890 BaseLI->getPointerAddressSpace(), CostKind,
9891 TTI::OperandValueInfo()) +
9892 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9893 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9894 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9895 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9896 if (StridedCost < OriginalVecCost)
9897 // Strided load is more profitable than consecutive load + reverse -
9898 // transform the node to strided load.
9899 E.State = TreeEntry::StridedVectorize;
9900 }
9901 break;
9902 }
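// Illustrative sketch of the load case above: a node gathering a[3], a[2],
// a[1], a[0] is a consecutive load plus a reverse shuffle; when the target
// reports isLegalStridedLoadStore and the strided cost is lower, the node is
// switched to StridedVectorize so codegen can emit a single strided load with
// stride -1 instead of a wide load followed by a reverse shuffle.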
9903 case Instruction::Store: {
9904 Type *ScalarTy =
9905 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9906 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9907 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9908 // Check if profitable to represent consecutive store + reverse as strided
9909 // store with stride -1.
9910 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9911 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9912 SmallVector<int> Mask;
9913 inversePermutation(E.ReorderIndices, Mask);
9914 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9915 InstructionCost OriginalVecCost =
9916 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9917 BaseSI->getPointerAddressSpace(), CostKind,
9918 TTI::OperandValueInfo()) +
9919 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9920 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9921 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9922 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
9923 if (StridedCost < OriginalVecCost)
9924 // Strided store is more profitable than reverse + consecutive store -
9925 // transform the node to strided store.
9926 E.State = TreeEntry::StridedVectorize;
9927 } else if (!E.ReorderIndices.empty()) {
9928 // Check for interleaved stores.
9929 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
9930 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9931 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
9932 if (Mask.size() < 4)
9933 return 0u;
9934 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9935 if (ShuffleVectorInst::isInterleaveMask(
9936 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
9937 TTI.isLegalInterleavedAccessType(
9938 VecTy, Factor, BaseSI->getAlign(),
9939 BaseSI->getPointerAddressSpace()))
9940 return Factor;
9941 }
9942
9943 return 0u;
9944 };
9945 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9946 unsigned InterleaveFactor = IsInterleaveMask(Mask);
9947 if (InterleaveFactor != 0)
9948 E.setInterleave(InterleaveFactor);
9949 }
9950 break;
9951 }
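// Illustrative example for the interleave check above: storing two logical
// rows <a0, a1> and <b0, b1> in memory order a0, b0, a1, b1 corresponds to
// the shuffle mask <0, 2, 1, 3>, which isInterleaveMask recognizes with
// Factor == 2, so the node is marked as an interleaved store when the target
// reports it as legal.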
9952 case Instruction::Select: {
9953 if (E.State != TreeEntry::Vectorize)
9954 break;
9955 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
9956 if (MinMaxID == Intrinsic::not_intrinsic)
9957 break;
9958 // This node is a minmax node.
9959 E.CombinedOp = TreeEntry::MinMax;
9960 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
9961 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9962 CondEntry->State == TreeEntry::Vectorize) {
9963 // The condition node is part of the combined minmax node.
9964 CondEntry->State = TreeEntry::CombinedVectorize;
9965 }
9966 break;
9967 }
9968 default:
9969 break;
9970 }
9971 }
9972
9973 if (LoadEntriesToVectorize.empty()) {
9974 // Single load node - exit.
9975 if (VectorizableTree.size() <= 1 &&
9976 VectorizableTree.front()->getOpcode() == Instruction::Load)
9977 return;
9978 // Small graph with small VF - exit.
9979 constexpr unsigned SmallTree = 3;
9980 constexpr unsigned SmallVF = 2;
9981 if ((VectorizableTree.size() <= SmallTree &&
9982 VectorizableTree.front()->Scalars.size() == SmallVF) ||
9983 (VectorizableTree.size() <= 2 && UserIgnoreList))
9984 return;
9985
9986 if (VectorizableTree.front()->isNonPowOf2Vec() &&
9987 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
9988 getCanonicalGraphSize() <= SmallTree &&
9989 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
9990 [](const std::unique_ptr<TreeEntry> &TE) {
9991 return TE->isGather() &&
9992 TE->getOpcode() == Instruction::Load &&
9993 !allSameBlock(TE->Scalars);
9994 }) == 1)
9995 return;
9996 }
9997
9998 // A list of loads to be gathered during the vectorization process. We can
9999 // try to vectorize them at the end, if profitable.
10002 GatheredLoads;
10003
10004 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10005 TreeEntry &E = *TE;
10006 if (E.isGather() &&
10007 (E.getOpcode() == Instruction::Load ||
10008 (!E.getOpcode() && any_of(E.Scalars,
10009 [&](Value *V) {
10010 return isa<LoadInst>(V) &&
10011 !isVectorized(V) &&
10012 !isDeleted(cast<Instruction>(V));
10013 }))) &&
10014 !isSplat(E.Scalars)) {
10015 for (Value *V : E.Scalars) {
10016 auto *LI = dyn_cast<LoadInst>(V);
10017 if (!LI)
10018 continue;
10019 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10020 continue;
10022 *this, V, *DL, *SE, *TTI,
10023 GatheredLoads[std::make_tuple(
10024 LI->getParent(),
10026 LI->getType())]);
10027 }
10028 }
10029 }
10030 // Try to vectorize gathered loads if this is not just a gather of loads.
10031 if (!GatheredLoads.empty())
10032 tryToVectorizeGatheredLoads(GatheredLoads);
10033}
10034
10035/// Merges shuffle masks and emits final shuffle instruction, if required. It
10036/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
10037/// when the actual shuffle instruction is generated only if this is actually
10038/// required. Otherwise, the shuffle instruction emission is delayed till the
10039/// end of the process, to reduce the number of emitted instructions and further
10040/// analysis/transformations.
10041class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10042 bool IsFinalized = false;
10043 SmallVector<int> CommonMask;
10044 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10045 const TargetTransformInfo &TTI;
10046 InstructionCost Cost = 0;
10047 SmallDenseSet<Value *> VectorizedVals;
10048 BoUpSLP &R;
10049 SmallPtrSetImpl<Value *> &CheckedExtracts;
10050 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10051 /// While set, we are still trying to estimate the cost for the same nodes and
10052 /// can delay actual cost estimation (virtual shuffle instruction emission).
10053 /// May help better estimate the cost if the same nodes must be permuted, and
10054 /// allows moving most of the long shuffle cost estimation to TTI.
10055 bool SameNodesEstimated = true;
10056
10057 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10058 if (Ty->getScalarType()->isPointerTy()) {
10062 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10063 Ty->getScalarType());
10064 if (auto *VTy = dyn_cast<VectorType>(Ty))
10065 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10066 return Res;
10067 }
10068 return Constant::getAllOnesValue(Ty);
10069 }
10070
10071 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10072 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10073 return TTI::TCC_Free;
10074 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10075 InstructionCost GatherCost = 0;
10076 SmallVector<Value *> Gathers(VL);
10077 if (!Root && isSplat(VL)) {
10078 // Found the broadcasting of the single scalar, calculate the cost as
10079 // the broadcast.
10080 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10081 assert(It != VL.end() && "Expected at least one non-undef value.");
10082 // Add broadcast for non-identity shuffle only.
10083 bool NeedShuffle =
10084 count(VL, *It) > 1 &&
10085 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10086 if (!NeedShuffle) {
10087 if (isa<FixedVectorType>(ScalarTy)) {
10088 assert(SLPReVec && "FixedVectorType is not expected.");
10089 return TTI.getShuffleCost(
10090 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10091 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10092 cast<FixedVectorType>(ScalarTy));
10093 }
10094 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10095 CostKind, std::distance(VL.begin(), It),
10096 PoisonValue::get(VecTy), *It);
10097 }
10098
10099 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10100 transform(VL, ShuffleMask.begin(), [](Value *V) {
10101 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10102 });
10103 InstructionCost InsertCost =
10104 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10105 PoisonValue::get(VecTy), *It);
10106 return InsertCost + ::getShuffleCost(TTI,
10107 TTI::SK_Broadcast,
10108 VecTy, ShuffleMask, CostKind,
10109 /*Index=*/0, /*SubTp=*/nullptr,
10110 /*Args=*/*It);
10111 }
10112 return GatherCost +
10113 (all_of(Gathers, IsaPred<UndefValue>)
10115 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10116 ScalarTy));
10117 };
10118
10119 /// Compute the cost of creating a vector containing the extracted values from
10120 /// \p VL.
10121 InstructionCost
10122 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10123 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10124 unsigned NumParts) {
10125 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10126 unsigned NumElts =
10127 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10128 auto *EE = dyn_cast<ExtractElementInst>(V);
10129 if (!EE)
10130 return Sz;
10131 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10132 if (!VecTy)
10133 return Sz;
10134 return std::max(Sz, VecTy->getNumElements());
10135 });
10136 // FIXME: this must be moved to TTI for better estimation.
10137 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10138 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10139 SmallVectorImpl<unsigned> &Indices)
10140 -> std::optional<TTI::ShuffleKind> {
10141 if (NumElts <= EltsPerVector)
10142 return std::nullopt;
10143 int OffsetReg0 =
10144 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10145 [](int S, int I) {
10146 if (I == PoisonMaskElem)
10147 return S;
10148 return std::min(S, I);
10149 }),
10150 EltsPerVector);
10151 int OffsetReg1 = OffsetReg0;
10152 DenseSet<int> RegIndices;
10153 // Check if we are trying to permute the same single/2 input vectors.
10154 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10155 int FirstRegId = -1;
10156 Indices.assign(1, OffsetReg0);
10157 for (auto [Pos, I] : enumerate(Mask)) {
10158 if (I == PoisonMaskElem)
10159 continue;
10160 int Idx = I - OffsetReg0;
10161 int RegId =
10162 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10163 if (FirstRegId < 0)
10164 FirstRegId = RegId;
10165 RegIndices.insert(RegId);
10166 if (RegIndices.size() > 2)
10167 return std::nullopt;
10168 if (RegIndices.size() == 2) {
10169 ShuffleKind = TTI::SK_PermuteTwoSrc;
10170 if (Indices.size() == 1) {
10171 OffsetReg1 = alignDown(
10172 std::accumulate(
10173 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10174 [&](int S, int I) {
10175 if (I == PoisonMaskElem)
10176 return S;
10177 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10178 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10179 if (RegId == FirstRegId)
10180 return S;
10181 return std::min(S, I);
10182 }),
10183 EltsPerVector);
10184 Indices.push_back(OffsetReg1 % NumElts);
10185 }
10186 Idx = I - OffsetReg1;
10187 }
10188 I = (Idx % NumElts) % EltsPerVector +
10189 (RegId == FirstRegId ? 0 : EltsPerVector);
10190 }
10191 return ShuffleKind;
10192 };
10193 InstructionCost Cost = 0;
10194
10195 // Process extracts in blocks of EltsPerVector to check if the source vector
10196 // operand can be re-used directly. If not, add the cost of creating a
10197 // shuffle to extract the values into a vector register.
10198 for (unsigned Part : seq<unsigned>(NumParts)) {
10199 if (!ShuffleKinds[Part])
10200 continue;
10201 ArrayRef<int> MaskSlice = Mask.slice(
10202 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10203 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10204 copy(MaskSlice, SubMask.begin());
10205 SmallVector<unsigned, 2> Indices;
10206 std::optional<TTI::ShuffleKind> RegShuffleKind =
10207 CheckPerRegistersShuffle(SubMask, Indices);
10208 if (!RegShuffleKind) {
10209 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10210 !ShuffleVectorInst::isIdentityMask(
10211 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10212 Cost +=
10213 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10214 getWidenedType(ScalarTy, NumElts), MaskSlice);
10215 continue;
10216 }
10217 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10218 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10219 Cost +=
10220 ::getShuffleCost(TTI, *RegShuffleKind,
10221 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10222 }
10223 const unsigned BaseVF = getFullVectorNumberOfElements(
10224 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10225 for (unsigned Idx : Indices) {
10226 assert((Idx + EltsPerVector) <= BaseVF &&
10227 "SK_ExtractSubvector index out of range");
10228 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10229 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10230 Idx, getWidenedType(ScalarTy, EltsPerVector));
10231 }
10232 // Second attempt to check if just a permute is estimated to be cheaper
10233 // than a subvector extract.
10234 SubMask.assign(NumElts, PoisonMaskElem);
10235 copy(MaskSlice, SubMask.begin());
10236 InstructionCost OriginalCost = ::getShuffleCost(
10237 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10238 if (OriginalCost < Cost)
10239 Cost = OriginalCost;
10240 }
10241 return Cost;
10242 }
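// Illustrative note: with VL.size() == 8 and NumParts == 2 on a target with
// 4-element registers, the mask is processed in two 4-element slices; a slice
// such as <4, 5, 6, 7> stays within a single source register and is costed as
// at most one extract-subvector, while a slice mixing elements of two source
// registers additionally pays for a per-register two-source permute.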
10243 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10244 /// shuffle emission.
10245 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10246 ArrayRef<int> Mask) {
10247 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10248 if (Mask[Idx] != PoisonMaskElem)
10249 CommonMask[Idx] = Idx;
10250 }
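// Worked example (illustration): after a shuffle has been emitted for
// Mask = <2, poison, 0, 1>, the selected elements already sit in their final
// lanes, so CommonMask becomes <0, poison, 2, 3>, i.e. an identity selection
// from the just-emitted shuffle for every non-poison lane.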
10251 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10252 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10253 /// elements.
10254 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10255 ArrayRef<int> Mask, unsigned Part,
10256 unsigned SliceSize) {
10257 if (SameNodesEstimated) {
10258 // Delay the cost estimation if the same nodes are reshuffling.
10259 // If we already requested the cost of reshuffling of E1 and E2 before, no
10260 // need to estimate another cost with the sub-Mask, instead include this
10261 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10262 // estimation.
10263 if ((InVectors.size() == 2 &&
10264 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10265 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10266 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10267 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10268 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10269 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10270 "Expected all poisoned elements.");
10271 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10272 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10273 return;
10274 }
10275 // Found non-matching nodes - need to estimate the cost for the matched
10276 // and transform mask.
10277 Cost += createShuffle(InVectors.front(),
10278 InVectors.size() == 1 ? nullptr : InVectors.back(),
10279 CommonMask);
10280 transformMaskAfterShuffle(CommonMask, CommonMask);
10281 } else if (InVectors.size() == 2) {
10282 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10283 transformMaskAfterShuffle(CommonMask, CommonMask);
10284 }
10285 SameNodesEstimated = false;
10286 if (!E2 && InVectors.size() == 1) {
10287 unsigned VF = E1.getVectorFactor();
10288 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10289 VF = std::max(VF,
10290 cast<FixedVectorType>(V1->getType())->getNumElements());
10291 } else {
10292 const auto *E = cast<const TreeEntry *>(InVectors.front());
10293 VF = std::max(VF, E->getVectorFactor());
10294 }
10295 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10296 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10297 CommonMask[Idx] = Mask[Idx] + VF;
10298 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10299 transformMaskAfterShuffle(CommonMask, CommonMask);
10300 } else {
10301 auto P = InVectors.front();
10302 Cost += createShuffle(&E1, E2, Mask);
10303 unsigned VF = Mask.size();
10304 if (Value *V1 = P.dyn_cast<Value *>()) {
10305 VF = std::max(VF,
10306 getNumElements(V1->getType()));
10307 } else {
10308 const auto *E = cast<const TreeEntry *>(P);
10309 VF = std::max(VF, E->getVectorFactor());
10310 }
10311 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10312 if (Mask[Idx] != PoisonMaskElem)
10313 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10314 Cost += createShuffle(P, InVectors.front(), CommonMask);
10315 transformMaskAfterShuffle(CommonMask, CommonMask);
10316 }
10317 }
10318
10319 class ShuffleCostBuilder {
10320 const TargetTransformInfo &TTI;
10321
10322 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10323 int Index = -1;
10324 return Mask.empty() ||
10325 (VF == Mask.size() &&
10326 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10327 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10328 Index == 0);
10329 }
10330
10331 public:
10332 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10333 ~ShuffleCostBuilder() = default;
10334 InstructionCost createShuffleVector(Value *V1, Value *,
10335 ArrayRef<int> Mask) const {
10336 // Empty mask or identity mask are free.
10337 unsigned VF =
10338 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10339 if (isEmptyOrIdentity(Mask, VF))
10340 return TTI::TCC_Free;
10341 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10342 cast<VectorType>(V1->getType()), Mask);
10343 }
10344 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10345 // Empty mask or identity mask are free.
10346 unsigned VF =
10347 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10348 if (isEmptyOrIdentity(Mask, VF))
10349 return TTI::TCC_Free;
10350 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10351 cast<VectorType>(V1->getType()), Mask);
10352 }
10353 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10354 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10355 return TTI::TCC_Free;
10356 }
10357 void resizeToMatch(Value *&, Value *&) const {}
10358 };
10359
10360 /// Smart shuffle instruction emission, walks through shuffle trees and
10361 /// tries to find the best matching vector for the actual shuffle
10362 /// instruction.
10364 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10366 ArrayRef<int> Mask) {
10367 ShuffleCostBuilder Builder(TTI);
10368 SmallVector<int> CommonMask(Mask);
10369 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10370 unsigned CommonVF = Mask.size();
10371 InstructionCost ExtraCost = 0;
10372 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10373 unsigned VF) -> InstructionCost {
10374 if (E.isGather() && allConstant(E.Scalars))
10375 return TTI::TCC_Free;
10376 Type *EScalarTy = E.Scalars.front()->getType();
10377 bool IsSigned = true;
10378 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10379 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10380 IsSigned = It->second.second;
10381 }
10382 if (EScalarTy != ScalarTy) {
10383 unsigned CastOpcode = Instruction::Trunc;
10384 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10385 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10386 if (DstSz > SrcSz)
10387 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10388 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10389 getWidenedType(EScalarTy, VF),
10390 TTI::CastContextHint::None, CostKind);
10391 }
10392 return TTI::TCC_Free;
10393 };
10394 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10395 if (isa<Constant>(V))
10396 return TTI::TCC_Free;
10397 auto *VecTy = cast<VectorType>(V->getType());
10398 Type *EScalarTy = VecTy->getElementType();
10399 if (EScalarTy != ScalarTy) {
10400 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10401 unsigned CastOpcode = Instruction::Trunc;
10402 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10403 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10404 if (DstSz > SrcSz)
10405 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10406 return TTI.getCastInstrCost(
10407 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10408 VecTy, TTI::CastContextHint::None, CostKind);
10409 }
10410 return TTI::TCC_Free;
10411 };
10412 if (!V1 && !V2 && !P2.isNull()) {
10413 // Shuffle 2 entry nodes.
10414 const TreeEntry *E = cast<const TreeEntry *>(P1);
10415 unsigned VF = E->getVectorFactor();
10416 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10417 CommonVF = std::max(VF, E2->getVectorFactor());
10418 assert(all_of(Mask,
10419 [=](int Idx) {
10420 return Idx < 2 * static_cast<int>(CommonVF);
10421 }) &&
10422 "All elements in mask must be less than 2 * CommonVF.");
10423 if (E->Scalars.size() == E2->Scalars.size()) {
10424 SmallVector<int> EMask = E->getCommonMask();
10425 SmallVector<int> E2Mask = E2->getCommonMask();
10426 if (!EMask.empty() || !E2Mask.empty()) {
10427 for (int &Idx : CommonMask) {
10428 if (Idx == PoisonMaskElem)
10429 continue;
10430 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10431 Idx = EMask[Idx];
10432 else if (Idx >= static_cast<int>(CommonVF))
10433 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10434 E->Scalars.size();
10435 }
10436 }
10437 CommonVF = E->Scalars.size();
10438 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10439 GetNodeMinBWAffectedCost(*E2, CommonVF);
10440 } else {
10441 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10442 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10443 }
10444 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10445 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10446 } else if (!V1 && P2.isNull()) {
10447 // Shuffle single entry node.
10448 const TreeEntry *E = cast<const TreeEntry *>(P1);
10449 unsigned VF = E->getVectorFactor();
10450 CommonVF = VF;
10451 assert(
10452 all_of(Mask,
10453 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10454 "All elements in mask must be less than CommonVF.");
10455 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10456 SmallVector<int> EMask = E->getCommonMask();
10457 assert(!EMask.empty() && "Expected non-empty common mask.");
10458 for (int &Idx : CommonMask) {
10459 if (Idx != PoisonMaskElem)
10460 Idx = EMask[Idx];
10461 }
10462 CommonVF = E->Scalars.size();
10463 } else if (unsigned Factor = E->getInterleaveFactor();
10464 Factor > 0 && E->Scalars.size() != Mask.size() &&
10465 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10466 Factor)) {
10467 // Deinterleaved nodes are free.
10468 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10469 }
10470 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10471 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10472 // Not identity/broadcast? Try to see if the original vector is better.
10473 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10474 CommonVF == CommonMask.size() &&
10475 any_of(enumerate(CommonMask),
10476 [](const auto &&P) {
10477 return P.value() != PoisonMaskElem &&
10478 static_cast<unsigned>(P.value()) != P.index();
10479 }) &&
10480 any_of(CommonMask,
10481 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10482 SmallVector<int> ReorderMask;
10483 inversePermutation(E->ReorderIndices, ReorderMask);
10484 ::addMask(CommonMask, ReorderMask);
10485 }
10486 } else if (V1 && P2.isNull()) {
10487 // Shuffle single vector.
10488 ExtraCost += GetValueMinBWAffectedCost(V1);
10489 CommonVF = getVF(V1);
10490 assert(
10491 all_of(Mask,
10492 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10493 "All elements in mask must be less than CommonVF.");
10494 } else if (V1 && !V2) {
10495 // Shuffle vector and tree node.
10496 unsigned VF = getVF(V1);
10497 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10498 CommonVF = std::max(VF, E2->getVectorFactor());
10499 assert(all_of(Mask,
10500 [=](int Idx) {
10501 return Idx < 2 * static_cast<int>(CommonVF);
10502 }) &&
10503 "All elements in mask must be less than 2 * CommonVF.");
10504 if (E2->Scalars.size() == VF && VF != CommonVF) {
10505 SmallVector<int> E2Mask = E2->getCommonMask();
10506 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10507 for (int &Idx : CommonMask) {
10508 if (Idx == PoisonMaskElem)
10509 continue;
10510 if (Idx >= static_cast<int>(CommonVF))
10511 Idx = E2Mask[Idx - CommonVF] + VF;
10512 }
10513 CommonVF = VF;
10514 }
10515 ExtraCost += GetValueMinBWAffectedCost(V1);
10516 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10517 ExtraCost += GetNodeMinBWAffectedCost(
10518 *E2, std::min(CommonVF, E2->getVectorFactor()));
10519 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10520 } else if (!V1 && V2) {
10521 // Shuffle vector and tree node.
10522 unsigned VF = getVF(V2);
10523 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10524 CommonVF = std::max(VF, E1->getVectorFactor());
10525 assert(all_of(Mask,
10526 [=](int Idx) {
10527 return Idx < 2 * static_cast<int>(CommonVF);
10528 }) &&
10529 "All elements in mask must be less than 2 * CommonVF.");
10530 if (E1->Scalars.size() == VF && VF != CommonVF) {
10531 SmallVector<int> E1Mask = E1->getCommonMask();
10532 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10533 for (int &Idx : CommonMask) {
10534 if (Idx == PoisonMaskElem)
10535 continue;
10536 if (Idx >= static_cast<int>(CommonVF))
10537 Idx = E1Mask[Idx - CommonVF] + VF;
10538 else
10539 Idx = E1Mask[Idx];
10540 }
10541 CommonVF = VF;
10542 }
10543 ExtraCost += GetNodeMinBWAffectedCost(
10544 *E1, std::min(CommonVF, E1->getVectorFactor()));
10545 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10546 ExtraCost += GetValueMinBWAffectedCost(V2);
10547 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10548 } else {
10549 assert(V1 && V2 && "Expected both vectors.");
10550 unsigned VF = getVF(V1);
10551 CommonVF = std::max(VF, getVF(V2));
10552 assert(all_of(Mask,
10553 [=](int Idx) {
10554 return Idx < 2 * static_cast<int>(CommonVF);
10555 }) &&
10556 "All elements in mask must be less than 2 * CommonVF.");
10557 ExtraCost +=
10558 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10559 if (V1->getType() != V2->getType()) {
10560 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10561 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10562 } else {
10563 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10564 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10565 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10566 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10567 }
10568 }
10569 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10570 assert(SLPReVec && "FixedVectorType is not expected.");
10572 CommonMask);
10573 }
10574 InVectors.front() =
10575 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10576 if (InVectors.size() == 2)
10577 InVectors.pop_back();
10578 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10579 V1, V2, CommonMask, Builder);
10580 }
10581
10582public:
10583 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10584 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10585 SmallPtrSetImpl<Value *> &CheckedExtracts)
10586 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10587 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10588 CheckedExtracts(CheckedExtracts) {}
10589 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10590 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10591 unsigned NumParts, bool &UseVecBaseAsInput) {
10592 UseVecBaseAsInput = false;
10593 if (Mask.empty())
10594 return nullptr;
10595 Value *VecBase = nullptr;
10596 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10597 if (!E->ReorderIndices.empty()) {
10598 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10599 E->ReorderIndices.end());
10600 reorderScalars(VL, ReorderMask);
10601 }
10602 // Check if it can be considered reused if the same extractelements were
10603 // vectorized already.
10604 bool PrevNodeFound = any_of(
10605 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10606 [&](const std::unique_ptr<TreeEntry> &TE) {
10607 return ((!TE->isAltShuffle() &&
10608 TE->getOpcode() == Instruction::ExtractElement) ||
10609 TE->isGather()) &&
10610 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10611 return VL.size() > Data.index() &&
10612 (Mask[Data.index()] == PoisonMaskElem ||
10613 isa<UndefValue>(VL[Data.index()]) ||
10614 Data.value() == VL[Data.index()]);
10615 });
10616 });
10617 SmallPtrSet<Value *, 4> UniqueBases;
10618 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10619 for (unsigned Part : seq<unsigned>(NumParts)) {
10620 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10621 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10622 for (auto [I, V] :
10623 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10624 // Ignore non-extractelement scalars.
10625 if (isa<UndefValue>(V) ||
10626 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10627 continue;
10628 // If all users of instruction are going to be vectorized and this
10629 // instruction itself is not going to be vectorized, consider this
10630 // instruction as dead and remove its cost from the final cost of the
10631 // vectorized tree.
10632 // Also, avoid adjusting the cost for extractelements with multiple uses
10633 // in different graph entries.
10634 auto *EE = cast<ExtractElementInst>(V);
10635 VecBase = EE->getVectorOperand();
10636 UniqueBases.insert(VecBase);
10637 const TreeEntry *VE = R.getTreeEntry(V);
10638 if (!CheckedExtracts.insert(V).second ||
10639 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10640 any_of(EE->users(),
10641 [&](User *U) {
10642 return isa<GetElementPtrInst>(U) &&
10643 !R.areAllUsersVectorized(cast<Instruction>(U),
10644 &VectorizedVals);
10645 }) ||
10646 (VE && VE != E))
10647 continue;
10648 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10649 if (!EEIdx)
10650 continue;
10651 unsigned Idx = *EEIdx;
10652 // Take credit for instruction that will become dead.
10653 if (EE->hasOneUse() || !PrevNodeFound) {
10654 Instruction *Ext = EE->user_back();
10655 if (isa<SExtInst, ZExtInst>(Ext) &&
10656 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10657 // Use getExtractWithExtendCost() to calculate the cost of
10658 // extractelement/ext pair.
10659 Cost -=
10660 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10661 EE->getVectorOperandType(), Idx);
10662 // Add back the cost of s|zext which is subtracted separately.
10663 Cost += TTI.getCastInstrCost(
10664 Ext->getOpcode(), Ext->getType(), EE->getType(),
10665 TTI::getCastContextHint(Ext), CostKind, Ext);
10666 continue;
10667 }
10668 }
10669 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10670 CostKind, Idx);
10671 }
10672 }
10673 // Check that gather of extractelements can be represented as just a
10674 // shuffle of a single/two vectors the scalars are extracted from.
10675 // Found the bunch of extractelement instructions that must be gathered
10676 // into a vector and can be represented as a permutation elements in a
10677 // single input vector or of 2 input vectors.
10678 // Done for reused if same extractelements were vectorized already.
10679 if (!PrevNodeFound)
10680 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10681 InVectors.assign(1, E);
10682 CommonMask.assign(Mask.begin(), Mask.end());
10683 transformMaskAfterShuffle(CommonMask, CommonMask);
10684 SameNodesEstimated = false;
10685 if (NumParts != 1 && UniqueBases.size() != 1) {
10686 UseVecBaseAsInput = true;
10687 VecBase =
10688 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10689 }
10690 return VecBase;
10691 }
10692 /// Checks if the specified entry \p E needs to be delayed because of its
10693 /// dependency nodes.
10694 std::optional<InstructionCost>
10695 needToDelay(const TreeEntry *,
10696 ArrayRef<SmallVector<const TreeEntry *>>) const final {
10697 // No need to delay the cost estimation during analysis.
10698 return std::nullopt;
10699 }
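// The add() overloads below accumulate the shuffle inputs (tree entries or
// actual IR values) together with a common mask; the accumulated cost is
// emitted later in finalize().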
10700 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10701 if (&E1 == &E2) {
10702 assert(all_of(Mask,
10703 [&](int Idx) {
10704 return Idx < static_cast<int>(E1.getVectorFactor());
10705 }) &&
10706 "Expected single vector shuffle mask.");
10707 add(E1, Mask);
10708 return;
10709 }
10710 if (InVectors.empty()) {
10711 CommonMask.assign(Mask.begin(), Mask.end());
10712 InVectors.assign({&E1, &E2});
10713 return;
10714 }
10715 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10716 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10717 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10718 if (NumParts == 0 || NumParts >= Mask.size() ||
10719 MaskVecTy->getNumElements() % NumParts != 0 ||
10720 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10721 MaskVecTy->getNumElements() / NumParts))
10722 NumParts = 1;
10723 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10724 const auto *It =
10725 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10726 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10727 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10728 }
10729 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10730 if (InVectors.empty()) {
10731 CommonMask.assign(Mask.begin(), Mask.end());
10732 InVectors.assign(1, &E1);
10733 return;
10734 }
10735 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10736 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10737 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10738 if (NumParts == 0 || NumParts >= Mask.size() ||
10739 MaskVecTy->getNumElements() % NumParts != 0 ||
10740 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10741 MaskVecTy->getNumElements() / NumParts))
10742 NumParts = 1;
10743 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10744 const auto *It =
10745 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10746 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10747 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10748 if (!SameNodesEstimated && InVectors.size() == 1)
10749 InVectors.emplace_back(&E1);
10750 }
10751 /// Adds 2 input vectors and the mask for their shuffling.
10752 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10753 // May come only for shuffling of 2 vectors with extractelements, already
10754 // handled in adjustExtracts.
10755 assert(InVectors.size() == 1 &&
10756 all_of(enumerate(CommonMask),
10757 [&](auto P) {
10758 if (P.value() == PoisonMaskElem)
10759 return Mask[P.index()] == PoisonMaskElem;
10760 auto *EI = cast<ExtractElementInst>(
10761 cast<const TreeEntry *>(InVectors.front())
10762 ->getOrdered(P.index()));
10763 return EI->getVectorOperand() == V1 ||
10764 EI->getVectorOperand() == V2;
10765 }) &&
10766 "Expected extractelement vectors.");
10767 }
10768 /// Adds another one input vector and the mask for the shuffling.
10769 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10770 if (InVectors.empty()) {
10771 assert(CommonMask.empty() && !ForExtracts &&
10772 "Expected empty input mask/vectors.");
10773 CommonMask.assign(Mask.begin(), Mask.end());
10774 InVectors.assign(1, V1);
10775 return;
10776 }
10777 if (ForExtracts) {
10778 // No need to add vectors here, already handled them in adjustExtracts.
10779 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10780 !CommonMask.empty() &&
10781 all_of(enumerate(CommonMask),
10782 [&](auto P) {
10783 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10784 ->getOrdered(P.index());
10785 if (P.value() == PoisonMaskElem)
10786 return P.value() == Mask[P.index()] ||
10787 isa<UndefValue>(Scalar);
10788 if (isa<Constant>(V1))
10789 return true;
10790 auto *EI = cast<ExtractElementInst>(Scalar);
10791 return EI->getVectorOperand() == V1;
10792 }) &&
10793 "Expected only tree entry for extractelement vectors.");
10794 return;
10795 }
10796 assert(!InVectors.empty() && !CommonMask.empty() &&
10797 "Expected only tree entries from extracts/reused buildvectors.");
10798 unsigned VF = getVF(V1);
10799 if (InVectors.size() == 2) {
10800 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10801 transformMaskAfterShuffle(CommonMask, CommonMask);
10802 VF = std::max<unsigned>(VF, CommonMask.size());
10803 } else if (const auto *InTE =
10804 InVectors.front().dyn_cast<const TreeEntry *>()) {
10805 VF = std::max(VF, InTE->getVectorFactor());
10806 } else {
10807 VF = std::max(
10808 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10809 ->getNumElements());
10810 }
10811 InVectors.push_back(V1);
10812 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10813 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10814 CommonMask[Idx] = Mask[Idx] + VF;
10815 }
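// gather() charges the cost of materializing a buildvector for VL and returns
// a constant vector that stands in for the would-be gathered value.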
10816 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10817 Value *Root = nullptr) {
10818 Cost += getBuildVectorCost(VL, Root);
10819 if (!Root) {
10820 // FIXME: Need to find a way to avoid use of getNullValue here.
10821 SmallVector<Constant *> Vals;
10822 unsigned VF = VL.size();
10823 if (MaskVF != 0)
10824 VF = std::min(VF, MaskVF);
10825 for (Value *V : VL.take_front(VF)) {
10826 if (isa<UndefValue>(V)) {
10827 Vals.push_back(cast<Constant>(V));
10828 continue;
10829 }
10830 Vals.push_back(Constant::getNullValue(V->getType()));
10831 }
10832 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10833 assert(SLPReVec && "FixedVectorType is not expected.");
10834 // When REVEC is enabled, we need to expand vector types into scalar
10835 // types.
10836 unsigned VecTyNumElements = VecTy->getNumElements();
10837 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10838 for (auto [I, V] : enumerate(Vals)) {
10839 Type *ScalarTy = V->getType()->getScalarType();
10840 Constant *NewVal;
10841 if (isa<PoisonValue>(V))
10842 NewVal = PoisonValue::get(ScalarTy);
10843 else if (isa<UndefValue>(V))
10844 NewVal = UndefValue::get(ScalarTy);
10845 else
10846 NewVal = Constant::getNullValue(ScalarTy);
10847 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10848 NewVal);
10849 }
10850 Vals.swap(NewVals);
10851 }
10852 return ConstantVector::get(Vals);
10853 }
10854 return ConstantVector::getSplat(
10855 ElementCount::getFixed(
10856 cast<FixedVectorType>(Root->getType())->getNumElements()),
10857 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10858 }
10859 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
10860 /// Finalize emission of the shuffles.
10861 InstructionCost
10862 finalize(ArrayRef<int> ExtMask,
10863 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10864 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10865 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10866 IsFinalized = true;
10867 if (Action) {
10868 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10869 if (InVectors.size() == 2)
10870 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10871 else
10872 Cost += createShuffle(Vec, nullptr, CommonMask);
10873 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10874 if (CommonMask[Idx] != PoisonMaskElem)
10875 CommonMask[Idx] = Idx;
10876 assert(VF > 0 &&
10877 "Expected vector length for the final value before action.");
10878 Value *V = cast<Value *>(Vec);
10879 Action(V, CommonMask);
10880 InVectors.front() = V;
10881 }
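// Account for inserting each vectorized subvector at its offset, including a
// cast when the subtree was computed in a narrower (MinBWs) type, plus any
// extra permutation described by SubVectorsMask.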
10882 if (!SubVectors.empty()) {
10883 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10884 if (InVectors.size() == 2)
10885 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10886 else
10887 Cost += createShuffle(Vec, nullptr, CommonMask);
10888 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10889 if (CommonMask[Idx] != PoisonMaskElem)
10890 CommonMask[Idx] = Idx;
10891 // Add subvectors permutation cost.
10892 if (!SubVectorsMask.empty()) {
10893 assert(SubVectorsMask.size() <= CommonMask.size() &&
10894 "Expected same size of masks for subvectors and common mask.");
10895 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10896 copy(SubVectorsMask, SVMask.begin());
10897 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10898 if (I2 != PoisonMaskElem) {
10899 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10900 I1 = I2 + CommonMask.size();
10901 }
10902 }
10903 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10904 getWidenedType(ScalarTy, CommonMask.size()),
10905 SVMask, CostKind);
10906 }
10907 for (auto [E, Idx] : SubVectors) {
10908 Type *EScalarTy = E->Scalars.front()->getType();
10909 bool IsSigned = true;
10910 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10911 EScalarTy =
10912 IntegerType::get(EScalarTy->getContext(), It->second.first);
10913 IsSigned = It->second.second;
10914 }
10915 if (ScalarTy != EScalarTy) {
10916 unsigned CastOpcode = Instruction::Trunc;
10917 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10918 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10919 if (DstSz > SrcSz)
10920 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10921 Cost += TTI.getCastInstrCost(
10922 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10923 getWidenedType(EScalarTy, E->getVectorFactor()),
10924 TTI::CastContextHint::None, CostKind);
10925 }
10926 Cost += ::getShuffleCost(
10927 TTI, TTI::SK_InsertSubvector,
10928 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10929 getWidenedType(ScalarTy, E->getVectorFactor()));
10930 if (!CommonMask.empty()) {
10931 std::iota(std::next(CommonMask.begin(), Idx),
10932 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10933 Idx);
10934 }
10935 }
10936 }
10937
10938 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
10939 if (CommonMask.empty()) {
10940 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
10941 return Cost;
10942 }
10943 return Cost +
10944 createShuffle(InVectors.front(),
10945 InVectors.size() == 2 ? InVectors.back() : nullptr,
10946 CommonMask);
10947 }
10948
10949 ~ShuffleCostEstimator() {
10950 assert((IsFinalized || CommonMask.empty()) &&
10951 "Shuffle construction must be finalized.");
10952 }
10953};
10954
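// Returns the tree entry that defines operand Idx of E: either the matching
// vectorized operand node or the gather node recorded for this use edge.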
10955const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
10956 unsigned Idx) const {
10957 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
10958 return VE;
10959 const auto *It =
10960 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10961 return TE->isGather() &&
10962 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
10963 return EI.EdgeIdx == Idx && EI.UserTE == E;
10964 }) != TE->UserTreeIndices.end();
10965 });
10966 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
10967 return It->get();
10968}
10969
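// Maps the memory access pattern of a load entry to the cast context hint
// that TTI uses when costing extends/truncates of the loaded values.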
10970TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
10971 if (TE.State == TreeEntry::ScatterVectorize ||
10972 TE.State == TreeEntry::StridedVectorize)
10973 return TTI::CastContextHint::GatherScatter;
10974 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10975 !TE.isAltShuffle()) {
10976 if (TE.ReorderIndices.empty())
10977 return TTI::CastContextHint::Normal;
10978 SmallVector<int> Mask;
10979 inversePermutation(TE.ReorderIndices, Mask);
10980 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
10981 return TTI::CastContextHint::Reversed;
10982 }
10983 return TTI::CastContextHint::None;
10984 }
10985
10986/// Builds the arguments types vector for the given call instruction with the
10987/// given \p ID for the specified vector factor.
10988 static SmallVector<Type *>
10989 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10990 const unsigned VF, unsigned MinBW,
10991 const TargetTransformInfo *TTI) {
10992 SmallVector<Type *> ArgTys;
10993 for (auto [Idx, Arg] : enumerate(CI->args())) {
10994 if (ID != Intrinsic::not_intrinsic) {
10995 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
10996 ArgTys.push_back(Arg->getType());
10997 continue;
10998 }
10999 if (MinBW > 0) {
11000 ArgTys.push_back(
11001 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11002 continue;
11003 }
11004 }
11005 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11006 }
11007 return ArgTys;
11008}
11009
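// Computes the cost of vectorizing tree entry E as the difference between the
// vector cost and the summed scalar costs; a negative result means this node
// is profitable to vectorize. Gather nodes are costed via ShuffleCostEstimator.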
11010 InstructionCost
11011 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11012 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11013 ArrayRef<Value *> VL = E->Scalars;
11014
11015 Type *ScalarTy = getValueType(VL[0]);
11016 if (!isValidElementType(ScalarTy))
11017 return InstructionCost::getInvalid();
11018 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11019
11020 // If we have computed a smaller type for the expression, update VecTy so
11021 // that the costs will be accurate.
11022 auto It = MinBWs.find(E);
11023 Type *OrigScalarTy = ScalarTy;
11024 if (It != MinBWs.end()) {
11025 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11026 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11027 if (VecTy)
11028 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11029 }
11030 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11031 unsigned EntryVF = E->getVectorFactor();
11032 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11033
11034 if (E->isGather()) {
11035 if (allConstant(VL))
11036 return 0;
11037 if (isa<InsertElementInst>(VL[0]))
11038 return InstructionCost::getInvalid();
11039 if (isa<CmpInst>(VL.front()))
11040 ScalarTy = VL.front()->getType();
11041 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11042 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11043 }
11044 InstructionCost CommonCost = 0;
11045 SmallVector<int> Mask;
11046 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11047 !isReverseOrder(E->ReorderIndices))) {
11048 SmallVector<int> NewMask;
11049 if (E->getOpcode() == Instruction::Store) {
11050 // For stores the order is actually a mask.
11051 NewMask.resize(E->ReorderIndices.size());
11052 copy(E->ReorderIndices, NewMask.begin());
11053 } else {
11054 inversePermutation(E->ReorderIndices, NewMask);
11055 }
11056 ::addMask(Mask, NewMask);
11057 }
11058 if (!E->ReuseShuffleIndices.empty())
11059 ::addMask(Mask, E->ReuseShuffleIndices);
11060 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11061 CommonCost =
11062 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11063 assert((E->State == TreeEntry::Vectorize ||
11064 E->State == TreeEntry::ScatterVectorize ||
11065 E->State == TreeEntry::StridedVectorize) &&
11066 "Unhandled state");
11067 assert(E->getOpcode() &&
11068 ((allSameType(VL) && allSameBlock(VL)) ||
11069 (E->getOpcode() == Instruction::GetElementPtr &&
11070 E->getMainOp()->getType()->isPointerTy())) &&
11071 "Invalid VL");
11072 Instruction *VL0 = E->getMainOp();
11073 unsigned ShuffleOrOp =
11074 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11075 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11076 ShuffleOrOp = E->CombinedOp;
11077 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11078 const unsigned Sz = UniqueValues.size();
11079 SmallBitVector UsedScalars(Sz, false);
11080 for (unsigned I = 0; I < Sz; ++I) {
11081 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11082 continue;
11083 UsedScalars.set(I);
11084 }
11085 auto GetCastContextHint = [&](Value *V) {
11086 if (const TreeEntry *OpTE = getTreeEntry(V))
11087 return getCastContextHint(*OpTE);
11088 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11089 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
11090 return TTI::CastContextHint::GatherScatter;
11091 return TTI::CastContextHint::None;
11092 };
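// GetCostDiff sums the per-scalar costs of the entry, takes the vector cost
// from the callback, adds a resize cast when the user's minimized bitwidth
// differs, and returns vector-minus-scalar cost.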
11093 auto GetCostDiff =
11094 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11095 function_ref<InstructionCost(InstructionCost)> VectorCost) {
11096 // Calculate the cost of this instruction.
11097 InstructionCost ScalarCost = 0;
11098 if (isa<CastInst, CallInst>(VL0)) {
11099 // For some of the instructions no need to calculate cost for each
11100 // particular instruction, we can use the cost of the single
11101 // instruction x total number of scalar instructions.
11102 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11103 } else {
11104 for (unsigned I = 0; I < Sz; ++I) {
11105 if (UsedScalars.test(I))
11106 continue;
11107 ScalarCost += ScalarEltCost(I);
11108 }
11109 }
11110
11111 InstructionCost VecCost = VectorCost(CommonCost);
11112 // Check if the current node must be resized, if the parent node is not
11113 // resized.
11114 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11115 E->Idx != 0 &&
11116 (E->getOpcode() != Instruction::Load ||
11117 !E->UserTreeIndices.empty())) {
11118 const EdgeInfo &EI =
11119 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11120 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11121 });
11122 if (EI.UserTE->getOpcode() != Instruction::Select ||
11123 EI.EdgeIdx != 0) {
11124 auto UserBWIt = MinBWs.find(EI.UserTE);
11125 Type *UserScalarTy =
11126 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11127 if (UserBWIt != MinBWs.end())
11128 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11129 UserBWIt->second.first);
11130 if (ScalarTy != UserScalarTy) {
11131 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11132 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11133 unsigned VecOpcode;
11134 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11135 if (BWSz > SrcBWSz)
11136 VecOpcode = Instruction::Trunc;
11137 else
11138 VecOpcode =
11139 It->second.second ? Instruction::SExt : Instruction::ZExt;
11140 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11141 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11142 CostKind);
11143 }
11144 }
11145 }
11146 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11147 ScalarCost, "Calculated costs for Tree"));
11148 return VecCost - ScalarCost;
11149 };
11150 // Calculate cost difference from vectorizing set of GEPs.
11151 // Negative value means vectorizing is profitable.
11152 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11153 assert((E->State == TreeEntry::Vectorize ||
11154 E->State == TreeEntry::StridedVectorize) &&
11155 "Entry state expected to be Vectorize or StridedVectorize here.");
11156 InstructionCost ScalarCost = 0;
11157 InstructionCost VecCost = 0;
11158 std::tie(ScalarCost, VecCost) = getGEPCosts(
11159 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11160 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11161 "Calculated GEPs cost for Tree"));
11162
11163 return VecCost - ScalarCost;
11164 };
11165
11166 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11167 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11168 if (MinMaxID == Intrinsic::not_intrinsic)
11169 return InstructionCost::getInvalid();
11170 Type *CanonicalType = Ty;
11171 if (CanonicalType->isPtrOrPtrVectorTy())
11172 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11173 CanonicalType->getContext(),
11174 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11175
11176 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11177 {CanonicalType, CanonicalType});
11178 InstructionCost IntrinsicCost =
11179 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11180 // If the selects are the only uses of the compares, they will be
11181 // dead and we can adjust the cost by removing their cost.
11182 if (VI && SelectOnly) {
11183 assert((!Ty->isVectorTy() || SLPReVec) &&
11184 "Expected only for scalar type.");
11185 auto *CI = cast<CmpInst>(VI->getOperand(0));
11186 IntrinsicCost -= TTI->getCmpSelInstrCost(
11187 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11188 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11189 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11190 }
11191 return IntrinsicCost;
11192 };
11193 switch (ShuffleOrOp) {
11194 case Instruction::PHI: {
11195 // Count reused scalars.
11196 InstructionCost ScalarCost = 0;
11197 SmallPtrSet<const TreeEntry *, 4> CountedOps;
11198 for (Value *V : UniqueValues) {
11199 auto *PHI = dyn_cast<PHINode>(V);
11200 if (!PHI)
11201 continue;
11202
11203 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11204 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11205 Value *Op = PHI->getIncomingValue(I);
11206 Operands[I] = Op;
11207 }
11208 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11209 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11210 if (!OpTE->ReuseShuffleIndices.empty())
11211 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11212 OpTE->Scalars.size());
11213 }
11214
11215 return CommonCost - ScalarCost;
11216 }
11217 case Instruction::ExtractValue:
11218 case Instruction::ExtractElement: {
11219 auto GetScalarCost = [&](unsigned Idx) {
11220 if (isa<PoisonValue>(UniqueValues[Idx]))
11221 return TTI::TCC_Free;
11222
11223 auto *I = cast<Instruction>(UniqueValues[Idx]);
11224 VectorType *SrcVecTy;
11225 if (ShuffleOrOp == Instruction::ExtractElement) {
11226 auto *EE = cast<ExtractElementInst>(I);
11227 SrcVecTy = EE->getVectorOperandType();
11228 } else {
11229 auto *EV = cast<ExtractValueInst>(I);
11230 Type *AggregateTy = EV->getAggregateOperand()->getType();
11231 unsigned NumElts;
11232 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11233 NumElts = ATy->getNumElements();
11234 else
11235 NumElts = AggregateTy->getStructNumElements();
11236 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11237 }
11238 if (I->hasOneUse()) {
11239 Instruction *Ext = I->user_back();
11240 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11241 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11242 // Use getExtractWithExtendCost() to calculate the cost of
11243 // extractelement/ext pair.
11244 InstructionCost Cost = TTI->getExtractWithExtendCost(
11245 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11246 // Subtract the cost of s|zext which is subtracted separately.
11247 Cost -= TTI->getCastInstrCost(
11248 Ext->getOpcode(), Ext->getType(), I->getType(),
11249 TTI::getCastContextHint(Ext), CostKind, Ext);
11250 return Cost;
11251 }
11252 }
11253 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11254 CostKind, *getExtractIndex(I));
11255 };
11256 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11257 return GetCostDiff(GetScalarCost, GetVectorCost);
11258 }
11259 case Instruction::InsertElement: {
11260 assert(E->ReuseShuffleIndices.empty() &&
11261 "Unique insertelements only are expected.");
11262 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11263 unsigned const NumElts = SrcVecTy->getNumElements();
11264 unsigned const NumScalars = VL.size();
11265
11266 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11267
11268 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11269 unsigned OffsetBeg = *getElementIndex(VL.front());
11270 unsigned OffsetEnd = OffsetBeg;
11271 InsertMask[OffsetBeg] = 0;
11272 for (auto [I, V] : enumerate(VL.drop_front())) {
11273 unsigned Idx = *getElementIndex(V);
11274 if (OffsetBeg > Idx)
11275 OffsetBeg = Idx;
11276 else if (OffsetEnd < Idx)
11277 OffsetEnd = Idx;
11278 InsertMask[Idx] = I + 1;
11279 }
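// The touched lanes [OffsetBeg, OffsetEnd] are covered with a subvector whose
// size is rounded up to whole register parts; the cost below models building
// that subvector and blending it into the original vector.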
11280 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11281 if (NumOfParts > 0 && NumOfParts < NumElts)
11282 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11283 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11284 VecScalarsSz;
11285 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11286 unsigned InsertVecSz = std::min<unsigned>(
11287 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11288 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11289 bool IsWholeSubvector =
11290 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11291 // Check if we can safely insert a subvector. If it is not possible, just
11292 // generate a whole-sized vector and shuffle the source vector and the new
11293 // subvector.
11294 if (OffsetBeg + InsertVecSz > VecSz) {
11295 // Align OffsetBeg to generate correct mask.
11296 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11297 InsertVecSz = VecSz;
11298 }
11299
11300 APInt DemandedElts = APInt::getZero(NumElts);
11301 // TODO: Add support for Instruction::InsertValue.
11302 SmallVector<int> Mask;
11303 if (!E->ReorderIndices.empty()) {
11304 inversePermutation(E->ReorderIndices, Mask);
11305 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11306 } else {
11307 Mask.assign(VecSz, PoisonMaskElem);
11308 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11309 }
11310 bool IsIdentity = true;
11311 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11312 Mask.swap(PrevMask);
11313 for (unsigned I = 0; I < NumScalars; ++I) {
11314 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11315 DemandedElts.setBit(InsertIdx);
11316 IsIdentity &= InsertIdx - OffsetBeg == I;
11317 Mask[InsertIdx - OffsetBeg] = I;
11318 }
11319 assert(Offset < NumElts && "Failed to find vector index offset");
11320
11321 InstructionCost Cost = 0;
11322 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11323 /*Insert*/ true, /*Extract*/ false,
11324 CostKind);
11325
11326 // First cost - resize to actual vector size if not identity shuffle or
11327 // need to shift the vector.
11328 // Do not calculate the cost if the actual size is the register size and
11329 // we can merge this shuffle with the following SK_Select.
11330 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11331 if (!IsIdentity)
11332 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
11333 InsertVecTy, Mask);
11334 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11335 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11336 }));
11337 // Second cost - permutation with subvector, if some elements are from the
11338 // initial vector or inserting a subvector.
11339 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11340 // subvector of ActualVecTy.
11341 SmallBitVector InMask =
11342 isUndefVector(FirstInsert->getOperand(0),
11343 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11344 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11345 if (InsertVecSz != VecSz) {
11346 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11347 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11348 CostKind, OffsetBeg - Offset, InsertVecTy);
11349 } else {
11350 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11351 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11352 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11353 I <= End; ++I)
11354 if (Mask[I] != PoisonMaskElem)
11355 Mask[I] = I + VecSz;
11356 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11357 Mask[I] =
11358 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11359 Cost +=
11360 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11361 }
11362 }
11363 return Cost;
11364 }
11365 case Instruction::ZExt:
11366 case Instruction::SExt:
11367 case Instruction::FPToUI:
11368 case Instruction::FPToSI:
11369 case Instruction::FPExt:
11370 case Instruction::PtrToInt:
11371 case Instruction::IntToPtr:
11372 case Instruction::SIToFP:
11373 case Instruction::UIToFP:
11374 case Instruction::Trunc:
11375 case Instruction::FPTrunc:
11376 case Instruction::BitCast: {
11377 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11378 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11379 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11380 unsigned Opcode = ShuffleOrOp;
11381 unsigned VecOpcode = Opcode;
11382 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11383 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11384 // Check if the values are candidates to demote.
11385 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11386 if (SrcIt != MinBWs.end()) {
11387 SrcBWSz = SrcIt->second.first;
11388 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11389 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11390 SrcVecTy =
11391 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11392 }
11393 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11394 if (BWSz == SrcBWSz) {
11395 VecOpcode = Instruction::BitCast;
11396 } else if (BWSz < SrcBWSz) {
11397 VecOpcode = Instruction::Trunc;
11398 } else if (It != MinBWs.end()) {
11399 assert(BWSz > SrcBWSz && "Invalid cast!");
11400 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11401 } else if (SrcIt != MinBWs.end()) {
11402 assert(BWSz > SrcBWSz && "Invalid cast!");
11403 VecOpcode =
11404 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11405 }
11406 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11407 !SrcIt->second.second) {
11408 VecOpcode = Instruction::UIToFP;
11409 }
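// At this point VecOpcode may differ from the scalar opcode: minimum-bitwidth
// analysis can turn the cast into a truncate, a narrower ext, or a no-op
// bitcast, which is costed (or skipped) below.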
11410 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11411 assert(Idx == 0 && "Expected 0 index only");
11412 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11413 VL0->getOperand(0)->getType(),
11414 TTI::getCastContextHint(VL0), CostKind, VL0);
11415 };
11416 auto GetVectorCost = [=](InstructionCost CommonCost) {
11417 // Do not count cost here if minimum bitwidth is in effect and it is just
11418 // a bitcast (here it is just a noop).
11419 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11420 return CommonCost;
11421 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11422 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11423
11424 bool IsArithmeticExtendedReduction =
11425 E->Idx == 0 && UserIgnoreList &&
11426 all_of(*UserIgnoreList, [](Value *V) {
11427 auto *I = cast<Instruction>(V);
11428 return is_contained({Instruction::Add, Instruction::FAdd,
11429 Instruction::Mul, Instruction::FMul,
11430 Instruction::And, Instruction::Or,
11431 Instruction::Xor},
11432 I->getOpcode());
11433 });
11434 if (IsArithmeticExtendedReduction &&
11435 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11436 return CommonCost;
11437 return CommonCost +
11438 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11439 VecOpcode == Opcode ? VI : nullptr);
11440 };
11441 return GetCostDiff(GetScalarCost, GetVectorCost);
11442 }
11443 case Instruction::FCmp:
11444 case Instruction::ICmp:
11445 case Instruction::Select: {
11446 CmpPredicate VecPred, SwappedVecPred;
11447 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11448 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11449 match(VL0, MatchCmp))
11450 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11451 else
11452 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11453 ? CmpInst::BAD_FCMP_PREDICATE
11454 : CmpInst::BAD_ICMP_PREDICATE;
11455 auto GetScalarCost = [&](unsigned Idx) {
11456 if (isa<PoisonValue>(UniqueValues[Idx]))
11457 return TTI::TCC_Free;
11458
11459 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11460 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11461 ? CmpInst::BAD_FCMP_PREDICATE
11462 : CmpInst::BAD_ICMP_PREDICATE;
11463 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11464 // FIXME: Use CmpPredicate::getMatching here.
11465 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11466 !match(VI, MatchCmp)) ||
11467 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11468 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11469 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11470 ? CmpInst::BAD_FCMP_PREDICATE
11471 : CmpInst::BAD_ICMP_PREDICATE;
11472
11473 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
11474 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11475 CostKind, getOperandInfo(VI->getOperand(0)),
11476 getOperandInfo(VI->getOperand(1)), VI);
11477 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11478 if (IntrinsicCost.isValid())
11479 ScalarCost = IntrinsicCost;
11480
11481 return ScalarCost;
11482 };
11483 auto GetVectorCost = [&](InstructionCost CommonCost) {
11484 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11485
11486 InstructionCost VecCost =
11487 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11488 CostKind, getOperandInfo(E->getOperand(0)),
11489 getOperandInfo(E->getOperand(1)), VL0);
11490 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11491 auto *CondType =
11492 getWidenedType(SI->getCondition()->getType(), VL.size());
11493 unsigned CondNumElements = CondType->getNumElements();
11494 unsigned VecTyNumElements = getNumElements(VecTy);
11495 assert(VecTyNumElements >= CondNumElements &&
11496 VecTyNumElements % CondNumElements == 0 &&
11497 "Cannot vectorize Instruction::Select");
11498 if (CondNumElements != VecTyNumElements) {
11499 // When the return type is i1 but the source is fixed vector type, we
11500 // need to duplicate the condition value.
11501 VecCost += ::getShuffleCost(
11502 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11503 createReplicatedMask(VecTyNumElements / CondNumElements,
11504 CondNumElements));
11505 }
11506 }
11507 return VecCost + CommonCost;
11508 };
11509 return GetCostDiff(GetScalarCost, GetVectorCost);
11510 }
11511 case TreeEntry::MinMax: {
11512 auto GetScalarCost = [&](unsigned Idx) {
11513 return GetMinMaxCost(OrigScalarTy);
11514 };
11515 auto GetVectorCost = [&](InstructionCost CommonCost) {
11516 InstructionCost VecCost = GetMinMaxCost(VecTy);
11517 return VecCost + CommonCost;
11518 };
11519 return GetCostDiff(GetScalarCost, GetVectorCost);
11520 }
11521 case Instruction::FNeg:
11522 case Instruction::Add:
11523 case Instruction::FAdd:
11524 case Instruction::Sub:
11525 case Instruction::FSub:
11526 case Instruction::Mul:
11527 case Instruction::FMul:
11528 case Instruction::UDiv:
11529 case Instruction::SDiv:
11530 case Instruction::FDiv:
11531 case Instruction::URem:
11532 case Instruction::SRem:
11533 case Instruction::FRem:
11534 case Instruction::Shl:
11535 case Instruction::LShr:
11536 case Instruction::AShr:
11537 case Instruction::And:
11538 case Instruction::Or:
11539 case Instruction::Xor: {
11540 auto GetScalarCost = [&](unsigned Idx) {
11541 if (isa<PoisonValue>(UniqueValues[Idx]))
11542 return TTI::TCC_Free;
11543
11544 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11545 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11546 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11547 TTI::OperandValueInfo Op2Info =
11548 TTI::getOperandInfo(VI->getOperand(OpIdx));
11549 SmallVector<const Value *> Operands(VI->operand_values());
11550 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11551 Op1Info, Op2Info, Operands, VI);
11552 };
11553 auto GetVectorCost = [=](InstructionCost CommonCost) {
11554 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11555 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11556 ArrayRef<Value *> Ops = E->getOperand(I);
11557 if (all_of(Ops, [&](Value *Op) {
11558 auto *CI = dyn_cast<ConstantInt>(Op);
11559 return CI && CI->getValue().countr_one() >= It->second.first;
11560 }))
11561 return CommonCost;
11562 }
11563 }
11564 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11565 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11566 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11567 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11568 Op2Info, {}, nullptr, TLI) +
11569 CommonCost;
11570 };
11571 return GetCostDiff(GetScalarCost, GetVectorCost);
11572 }
11573 case Instruction::GetElementPtr: {
11574 return CommonCost + GetGEPCostDiff(VL, VL0);
11575 }
11576 case Instruction::Load: {
11577 auto GetScalarCost = [&](unsigned Idx) {
11578 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11579 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11580 VI->getAlign(), VI->getPointerAddressSpace(),
11581 CostKind, TTI::OperandValueInfo(), VI);
11582 };
11583 auto *LI0 = cast<LoadInst>(VL0);
11584 auto GetVectorCost = [&](InstructionCost CommonCost) {
11585 InstructionCost VecLdCost;
11586 switch (E->State) {
11587 case TreeEntry::Vectorize:
11588 if (unsigned Factor = E->getInterleaveFactor()) {
11589 VecLdCost = TTI->getInterleavedMemoryOpCost(
11590 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11591 LI0->getPointerAddressSpace(), CostKind);
11592
11593 } else {
11594 VecLdCost = TTI->getMemoryOpCost(
11595 Instruction::Load, VecTy, LI0->getAlign(),
11596 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11597 }
11598 break;
11599 case TreeEntry::StridedVectorize: {
11600 Align CommonAlignment =
11601 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11602 VecLdCost = TTI->getStridedMemoryOpCost(
11603 Instruction::Load, VecTy, LI0->getPointerOperand(),
11604 /*VariableMask=*/false, CommonAlignment, CostKind);
11605 break;
11606 }
11607 case TreeEntry::ScatterVectorize: {
11608 Align CommonAlignment =
11609 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11610 VecLdCost = TTI->getGatherScatterOpCost(
11611 Instruction::Load, VecTy, LI0->getPointerOperand(),
11612 /*VariableMask=*/false, CommonAlignment, CostKind);
11613 break;
11614 }
11615 case TreeEntry::CombinedVectorize:
11616 case TreeEntry::NeedToGather:
11617 llvm_unreachable("Unexpected vectorization state.");
11618 }
11619 return VecLdCost + CommonCost;
11620 };
11621
11622 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11623 // If this node generates masked gather load then it is not a terminal node.
11624 // Hence address operand cost is estimated separately.
11625 if (E->State == TreeEntry::ScatterVectorize)
11626 return Cost;
11627
11628 // Estimate cost of GEPs since this tree node is a terminator.
11629 SmallVector<Value *> PointerOps(VL.size());
11630 for (auto [I, V] : enumerate(VL))
11631 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11632 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11633 }
11634 case Instruction::Store: {
11635 bool IsReorder = !E->ReorderIndices.empty();
11636 auto GetScalarCost = [=](unsigned Idx) {
11637 auto *VI = cast<StoreInst>(VL[Idx]);
11638 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11639 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11640 VI->getAlign(), VI->getPointerAddressSpace(),
11641 CostKind, OpInfo, VI);
11642 };
11643 auto *BaseSI =
11644 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11645 auto GetVectorCost = [=](InstructionCost CommonCost) {
11646 // We know that we can merge the stores. Calculate the cost.
11647 InstructionCost VecStCost;
11648 if (E->State == TreeEntry::StridedVectorize) {
11649 Align CommonAlignment =
11650 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11651 VecStCost = TTI->getStridedMemoryOpCost(
11652 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11653 /*VariableMask=*/false, CommonAlignment, CostKind);
11654 } else {
11655 assert(E->State == TreeEntry::Vectorize &&
11656 "Expected either strided or consecutive stores.");
11657 if (unsigned Factor = E->getInterleaveFactor()) {
11658 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11659 "No reused shuffles expected");
11660 CommonCost = 0;
11661 VecStCost = TTI->getInterleavedMemoryOpCost(
11662 Instruction::Store, VecTy, Factor, std::nullopt,
11663 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11664 } else {
11665 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11666 VecStCost = TTI->getMemoryOpCost(
11667 Instruction::Store, VecTy, BaseSI->getAlign(),
11668 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11669 }
11670 }
11671 return VecStCost + CommonCost;
11672 };
11673 SmallVector<Value *> PointerOps(VL.size());
11674 for (auto [I, V] : enumerate(VL)) {
11675 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11676 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11677 }
11678
11679 return GetCostDiff(GetScalarCost, GetVectorCost) +
11680 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11681 }
11682 case Instruction::Call: {
11683 auto GetScalarCost = [&](unsigned Idx) {
11684 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11685 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11686 if (ID != Intrinsic::not_intrinsic) {
11687 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11688 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11689 }
11690 return TTI->getCallInstrCost(CI->getCalledFunction(),
11691 CI->getFunctionType()->getReturnType(),
11692 CI->getFunctionType()->params(), CostKind);
11693 };
11694 auto GetVectorCost = [=](InstructionCost CommonCost) {
11695 auto *CI = cast<CallInst>(VL0);
11696 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11697 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
11698 CI, ID, VecTy->getNumElements(),
11699 It != MinBWs.end() ? It->second.first : 0, TTI);
11700 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11701 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11702 };
11703 return GetCostDiff(GetScalarCost, GetVectorCost);
11704 }
11705 case Instruction::ShuffleVector: {
11706 if (!SLPReVec || E->isAltShuffle())
11707 assert(E->isAltShuffle() &&
11708 ((Instruction::isBinaryOp(E->getOpcode()) &&
11709 Instruction::isBinaryOp(E->getAltOpcode())) ||
11710 (Instruction::isCast(E->getOpcode()) &&
11711 Instruction::isCast(E->getAltOpcode())) ||
11712 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11713 "Invalid Shuffle Vector Operand");
11714 // Try to find the previous shuffle node with the same operands and same
11715 // main/alternate ops.
11716 auto TryFindNodeWithEqualOperands = [=]() {
11717 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11718 if (TE.get() == E)
11719 break;
11720 if (TE->isAltShuffle() &&
11721 ((TE->getOpcode() == E->getOpcode() &&
11722 TE->getAltOpcode() == E->getAltOpcode()) ||
11723 (TE->getOpcode() == E->getAltOpcode() &&
11724 TE->getAltOpcode() == E->getOpcode())) &&
11725 TE->hasEqualOperands(*E))
11726 return true;
11727 }
11728 return false;
11729 };
11730 auto GetScalarCost = [&](unsigned Idx) {
11731 if (isa<PoisonValue>(UniqueValues[Idx]))
11732 return TTI::TCC_Free;
11733
11734 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11735 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11736 (void)E;
11737 return TTI->getInstructionCost(VI, CostKind);
11738 };
11739 // Need to clear CommonCost since the final shuffle cost is included into
11740 // vector cost.
11741 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11742 // VecCost is equal to sum of the cost of creating 2 vectors
11743 // and the cost of creating shuffle.
11744 InstructionCost VecCost = 0;
11745 if (TryFindNodeWithEqualOperands()) {
11746 LLVM_DEBUG({
11747 dbgs() << "SLP: diamond match for alternate node found.\n";
11748 E->dump();
11749 });
11750 // No need to add new vector costs here since we're going to reuse
11751 // same main/alternate vector ops, just do different shuffling.
11752 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11753 VecCost =
11754 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11755 VecCost +=
11756 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11757 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11758 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11759 VecCost = TTIRef.getCmpSelInstrCost(
11760 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11761 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11762 VL0);
11763 VecCost += TTIRef.getCmpSelInstrCost(
11764 E->getOpcode(), VecTy, MaskTy,
11765 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11766 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11767 E->getAltOp());
11768 } else {
11769 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11770 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11771 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11772 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11773 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11774 unsigned SrcBWSz =
11775 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11776 if (SrcIt != MinBWs.end()) {
11777 SrcBWSz = SrcIt->second.first;
11778 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11779 SrcTy = getWidenedType(SrcSclTy, VL.size());
11780 }
11781 if (BWSz <= SrcBWSz) {
11782 if (BWSz < SrcBWSz)
11783 VecCost =
11784 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11785 TTI::CastContextHint::None, CostKind);
11786 LLVM_DEBUG({
11787 dbgs()
11788 << "SLP: alternate extension, which should be truncated.\n";
11789 E->dump();
11790 });
11791 return VecCost;
11792 }
11793 }
11794 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11795 TTI::CastContextHint::None, CostKind);
11796 VecCost +=
11797 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11798 TTI::CastContextHint::None, CostKind);
11799 }
11800 SmallVector<int> Mask;
11801 E->buildAltOpShuffleMask(
11802 [&](Instruction *I) {
11803 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11804 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11805 *TLI);
11806 },
11807 Mask);
11808 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
11809 FinalVecTy, Mask, CostKind);
11810 // Patterns like [fadd,fsub] can be combined into a single instruction
11811 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11812 // need to take into account their order when looking for the most used
11813 // order.
11814 unsigned Opcode0 = E->getOpcode();
11815 unsigned Opcode1 = E->getAltOpcode();
11816 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11817 // If this pattern is supported by the target then we consider the
11818 // order.
11819 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11820 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11821 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11822 return AltVecCost < VecCost ? AltVecCost : VecCost;
11823 }
11824 // TODO: Check the reverse order too.
11825 return VecCost;
11826 };
11827 if (SLPReVec && !E->isAltShuffle())
11828 return GetCostDiff(
11829 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11830 // If a group uses mask in order, the shufflevector can be
11831 // eliminated by instcombine. Then the cost is 0.
11832 assert(isa<ShuffleVectorInst>(VL.front()) &&
11833 "Not supported shufflevector usage.");
11834 auto *SV = cast<ShuffleVectorInst>(VL.front());
11835 unsigned SVNumElements =
11836 cast<FixedVectorType>(SV->getOperand(0)->getType())
11837 ->getNumElements();
11838 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11839 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11840 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11841 int NextIndex = 0;
11842 if (!all_of(Group, [&](Value *V) {
11843 assert(isa<ShuffleVectorInst>(V) &&
11844 "Not supported shufflevector usage.");
11845 auto *SV = cast<ShuffleVectorInst>(V);
11846 int Index;
11847 [[maybe_unused]] bool IsExtractSubvectorMask =
11848 SV->isExtractSubvectorMask(Index);
11849 assert(IsExtractSubvectorMask &&
11850 "Not supported shufflevector usage.");
11851 if (NextIndex != Index)
11852 return false;
11853 NextIndex += SV->getShuffleMask().size();
11854 return true;
11855 }))
11856 return ::getShuffleCost(
11857 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
11858 calculateShufflevectorMask(E->Scalars));
11859 }
11860 return TTI::TCC_Free;
11861 });
11862 return GetCostDiff(GetScalarCost, GetVectorCost);
11863 }
11864 case Instruction::Freeze:
11865 return CommonCost;
11866 default:
11867 llvm_unreachable("Unknown instruction");
11868 }
11869}
11870
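// Trees of height 1 or 2 are treated as fully vectorizable only when their
// nodes are cheap: a vectorizable root plus gathers that are constant, splat,
// small, loads, or expressible as a shuffle of already existing vectors.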
11871bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11872 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11873 << VectorizableTree.size() << " is fully vectorizable .\n");
11874
11875 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11876 SmallVector<int> Mask;
11877 return TE->isGather() &&
11878 !any_of(TE->Scalars,
11879 [this](Value *V) { return EphValues.contains(V); }) &&
11880 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11881 TE->Scalars.size() < Limit ||
11882 ((TE->getOpcode() == Instruction::ExtractElement ||
11883 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11884 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11885 (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
11886 any_of(TE->Scalars, IsaPred<LoadInst>));
11887 };
11888
11889 // We only handle trees of heights 1 and 2.
11890 if (VectorizableTree.size() == 1 &&
11891 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11892 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11893 (ForReduction &&
11894 AreVectorizableGathers(VectorizableTree[0].get(),
11895 VectorizableTree[0]->Scalars.size()) &&
11896 VectorizableTree[0]->getVectorFactor() > 2)))
11897 return true;
11898
11899 if (VectorizableTree.size() != 2)
11900 return false;
11901
11902 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11903 // with the second gather nodes if they have less scalar operands rather than
11904 // the initial tree element (may be profitable to shuffle the second gather)
11905 // or they are extractelements, which form shuffle.
11907 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11908 AreVectorizableGathers(VectorizableTree[1].get(),
11909 VectorizableTree[0]->Scalars.size()))
11910 return true;
11911
11912 // Gathering cost would be too much for tiny trees.
11913 if (VectorizableTree[0]->isGather() ||
11914 (VectorizableTree[1]->isGather() &&
11915 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11916 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11917 return false;
11918
11919 return true;
11920}
11921
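// Detects or/shl-of-zext(load) chains that the backend is expected to fold
// into a single wide load (load combining), so the vectorizer can back off
// and leave such trees scalar.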
11922static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11923 TargetTransformInfo *TTI,
11924 bool MustMatchOrInst) {
11925 // Look past the root to find a source value. Arbitrarily follow the
11926 // path through operand 0 of any 'or'. Also, peek through optional
11927 // shift-left-by-multiple-of-8-bits.
11928 Value *ZextLoad = Root;
11929 const APInt *ShAmtC;
11930 bool FoundOr = false;
11931 while (!isa<ConstantExpr>(ZextLoad) &&
11932 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11933 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
11934 ShAmtC->urem(8) == 0))) {
11935 auto *BinOp = cast<BinaryOperator>(ZextLoad);
11936 ZextLoad = BinOp->getOperand(0);
11937 if (BinOp->getOpcode() == Instruction::Or)
11938 FoundOr = true;
11939 }
11940 // Check if the input is an extended load of the required or/shift expression.
11941 Value *Load;
11942 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11943 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
11944 return false;
11945
11946 // Require that the total load bit width is a legal integer type.
11947 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
11948 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
11949 Type *SrcTy = Load->getType();
11950 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
11951 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
11952 return false;
11953
11954 // Everything matched - assume that we can fold the whole sequence using
11955 // load combining.
11956 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
11957 << *(cast<Instruction>(Root)) << "\n");
11958
11959 return true;
11960}
11961
11962 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
11963 if (RdxKind != RecurKind::Or)
11964 return false;
11965
11966 unsigned NumElts = VectorizableTree[0]->Scalars.size();
11967 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11968 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
11969 /* MatchOr */ false);
11970}
11971
11972 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
11973 // Peek through a final sequence of stores and check if all operations are
11974 // likely to be load-combined.
11975 unsigned NumElts = Stores.size();
11976 for (Value *Scalar : Stores) {
11977 Value *X;
11978 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
11979 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
11980 return false;
11981 }
11982 return true;
11983}
11984
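// Decides whether the graph is too small or too gather-heavy to be worth
// vectorizing; returning true makes the caller give up on this tree.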
11985bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
11986 if (!DebugCounter::shouldExecute(VectorizedGraphs))
11987 return true;
11988
11989 // Graph is empty - do nothing.
11990 if (VectorizableTree.empty()) {
11991 assert(ExternalUses.empty() && "We shouldn't have any external users");
11992
11993 return true;
11994 }
11995
11996 // No need to vectorize inserts of gathered values.
11997 if (VectorizableTree.size() == 2 &&
11998 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
11999 VectorizableTree[1]->isGather() &&
12000 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12001 !(isSplat(VectorizableTree[1]->Scalars) ||
12002 allConstant(VectorizableTree[1]->Scalars))))
12003 return true;
12004
12005 // If the graph includes only PHI nodes and gathers, it is definitely not
12006 // profitable for the vectorization, we can skip it, if the cost threshold is
12007 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
12008 // gathers/buildvectors.
12009 constexpr int Limit = 4;
12010 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12011 !VectorizableTree.empty() &&
12012 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12013 return (TE->isGather() &&
12014 TE->getOpcode() != Instruction::ExtractElement &&
12015 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12016 TE->getOpcode() == Instruction::PHI;
12017 }))
12018 return true;
12019
12020 // We can vectorize the tree if its size is greater than or equal to the
12021 // minimum size specified by the MinTreeSize command line option.
12022 if (VectorizableTree.size() >= MinTreeSize)
12023 return false;
12024
12025 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12026 // can vectorize it if we can prove it fully vectorizable.
12027 if (isFullyVectorizableTinyTree(ForReduction))
12028 return false;
12029
12030 // Check if any of the gather node forms an insertelement buildvector
12031 // somewhere.
12032 bool IsAllowedSingleBVNode =
12033 VectorizableTree.size() > 1 ||
12034 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
12035 !VectorizableTree.front()->isAltShuffle() &&
12036 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12037 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12038 allSameBlock(VectorizableTree.front()->Scalars));
12039 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12040 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12041 return isa<ExtractElementInst, UndefValue>(V) ||
12042 (IsAllowedSingleBVNode &&
12043 !V->hasNUsesOrMore(UsesLimit) &&
12044 any_of(V->users(), IsaPred<InsertElementInst>));
12045 });
12046 }))
12047 return false;
12048
12049 if (VectorizableTree.back()->isGather() &&
12050 VectorizableTree.back()->isAltShuffle() &&
12051 VectorizableTree.back()->getVectorFactor() > 2 &&
12052 allSameBlock(VectorizableTree.back()->Scalars) &&
12053 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12055 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12056 VectorizableTree.back()->getVectorFactor()),
12057 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12058 /*Insert=*/true, /*Extract=*/false,
12059 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
12060 return false;
12061
12062 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12063 // vectorizable.
12064 return true;
12065}
12066
12067bool BoUpSLP::isTreeNotExtendable() const {
12068 if (getCanonicalGraphSize() != getTreeSize()) {
12069 constexpr unsigned SmallTree = 3;
12070 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12071 getCanonicalGraphSize() <= SmallTree &&
12072 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12073 [](const std::unique_ptr<TreeEntry> &TE) {
12074 return TE->isGather() &&
12075 TE->getOpcode() == Instruction::Load &&
12076 !allSameBlock(TE->Scalars);
12077 }) == 1)
12078 return true;
12079 return false;
12080 }
12081 bool Res = false;
12082 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12083 TreeEntry &E = *VectorizableTree[Idx];
12084 if (!E.isGather())
12085 continue;
12086 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12087 return false;
12088 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12089 continue;
12090 Res = true;
12091 }
12092 return Res;
12093}
12094
12095InstructionCost BoUpSLP::getSpillCost() const {
12096 // Walk from the bottom of the tree to the top, tracking which values are
12097 // live. When we see a call instruction that is not part of our tree,
12098 // query TTI to see if there is a cost to keeping values live over it
12099 // (for example, if spills and fills are required).
12100 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12101 InstructionCost Cost = 0;
12102
12103 SmallPtrSet<Instruction *, 4> LiveValues;
12104 Instruction *PrevInst = nullptr;
12105
12106 // The entries in VectorizableTree are not necessarily ordered by their
12107 // position in basic blocks. Collect them and order them by dominance so later
12108 // instructions are guaranteed to be visited first. For instructions in
12109 // different basic blocks, we only scan to the beginning of the block, so
12110 // their order does not matter, as long as all instructions in a basic block
12111 // are grouped together. Using dominance ensures a deterministic order.
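  // (Illustrative note, not from the original source: a dominating block has a
  // smaller DFS-in number than the blocks it dominates, so sorting by
  // descending getDFSNumIn() visits dominated blocks first and the walk below
  // effectively proceeds bottom-up.)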
12112 SmallVector<Instruction *, 16> OrderedScalars;
12113 for (const auto &TEPtr : VectorizableTree) {
12114 if (TEPtr->State != TreeEntry::Vectorize)
12115 continue;
12116 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12117 if (!Inst)
12118 continue;
12119 OrderedScalars.push_back(Inst);
12120 }
12121 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12122 auto *NodeA = DT->getNode(A->getParent());
12123 auto *NodeB = DT->getNode(B->getParent());
12124 assert(NodeA && "Should only process reachable instructions");
12125 assert(NodeB && "Should only process reachable instructions");
12126 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12127 "Different nodes should have different DFS numbers");
12128 if (NodeA != NodeB)
12129 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12130 return B->comesBefore(A);
12131 });
12132
12133 for (Instruction *Inst : OrderedScalars) {
12134 if (!PrevInst) {
12135 PrevInst = Inst;
12136 continue;
12137 }
12138
12139 // Update LiveValues.
12140 LiveValues.erase(PrevInst);
12141 for (auto &J : PrevInst->operands()) {
12142 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12143 LiveValues.insert(cast<Instruction>(&*J));
12144 }
12145
12146 LLVM_DEBUG({
12147 dbgs() << "SLP: #LV: " << LiveValues.size();
12148 for (auto *X : LiveValues)
12149 dbgs() << " " << X->getName();
12150 dbgs() << ", Looking at ";
12151 Inst->dump();
12152 });
12153
12154 // Now find the sequence of instructions between PrevInst and Inst.
12155 unsigned NumCalls = 0;
12156 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12157 PrevInstIt =
12158 PrevInst->getIterator().getReverse();
12159 while (InstIt != PrevInstIt) {
12160 if (PrevInstIt == PrevInst->getParent()->rend()) {
12161 PrevInstIt = Inst->getParent()->rbegin();
12162 continue;
12163 }
12164
12165 auto NoCallIntrinsic = [this](Instruction *I) {
12166 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12167 if (II->isAssumeLikeIntrinsic())
12168 return true;
12169 FastMathFlags FMF;
12170 SmallVector<Type *, 4> Tys;
12171 for (auto &ArgOp : II->args())
12172 Tys.push_back(ArgOp->getType());
12173 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12174 FMF = FPMO->getFastMathFlags();
12175 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12176 FMF);
12177 InstructionCost IntrCost =
12178 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12179 InstructionCost CallCost = TTI->getCallInstrCost(
12180 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12181 if (IntrCost < CallCost)
12182 return true;
12183 }
12184 return false;
12185 };
12186
12187 // Debug information does not impact spill cost.
12188 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12189 &*PrevInstIt != PrevInst)
12190 NumCalls++;
12191
12192 ++PrevInstIt;
12193 }
12194
12195 if (NumCalls) {
12196 SmallVector<Type *, 4> V;
12197 for (auto *II : LiveValues) {
12198 auto *ScalarTy = II->getType();
12199 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12200 ScalarTy = VectorTy->getElementType();
12201 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12202 }
12203 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12204 }
12205
12206 PrevInst = Inst;
12207 }
12208
12209 return Cost;
12210}
12211
12212/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12213/// the buildvector sequence.
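/// Illustrative example (not from the original source): for the chain
///   %i0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <2 x i32> %i0, i32 %b, i32 1
/// isFirstInsertElement(%i0, %i1) returns true, because walking up the
/// operand-0 chain from %i1 reaches %i0.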
12214static bool isFirstInsertElement(const InsertElementInst *IE1,
12215 const InsertElementInst *IE2) {
12216 if (IE1 == IE2)
12217 return false;
12218 const auto *I1 = IE1;
12219 const auto *I2 = IE2;
12220 const InsertElementInst *PrevI1;
12221 const InsertElementInst *PrevI2;
12222 unsigned Idx1 = *getElementIndex(IE1);
12223 unsigned Idx2 = *getElementIndex(IE2);
12224 do {
12225 if (I2 == IE1)
12226 return true;
12227 if (I1 == IE2)
12228 return false;
12229 PrevI1 = I1;
12230 PrevI2 = I2;
12231 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12232 getElementIndex(I1).value_or(Idx2) != Idx2)
12233 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12234 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12235 getElementIndex(I2).value_or(Idx1) != Idx1)
12236 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12237 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12238 llvm_unreachable("Two different buildvectors not expected.");
12239}
12240
12241namespace {
12242/// Returns the incoming Value * if the requested type is Value * too, or a
12243/// default-constructed value otherwise.
12244struct ValueSelect {
12245 template <typename U>
12246 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12247 return V;
12248 }
12249 template <typename U>
12250 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12251 return U();
12252 }
12253};
12254} // namespace
12255
12256/// Does the analysis of the provided shuffle masks and performs the requested
12257/// actions on the vectors with the given shuffle masks. It tries to do it in
12258/// several steps.
12259/// 1. If the Base vector is not an undef vector, resize the very first mask to
12260/// have a common VF and perform the action for 2 input vectors (including the
12261/// non-undef Base). Other shuffle masks are combined with the result of the
12262/// first stage and processed as a shuffle of 2 elements.
12263/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12264/// the action only for 1 vector with the given mask, if it is not the identity
12265/// mask.
12266/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12267/// vectors, combining the masks properly between the steps.
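/// Illustrative example (not from the original source): with VF == 4, an
/// accumulated mask {0, poison, 2, poison} and the next mask
/// {poison, 1, poison, 3} are combined into {0, 1 + VF, 2, 3 + VF}, i.e. a
/// two-source shuffle of the previous result and the next input vector.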
12268template <typename T>
12269static T *performExtractsShuffleAction(
12270 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12271 function_ref<unsigned(T *)> GetVF,
12272 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12273 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
12274 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12275 SmallVector<int> Mask(ShuffleMask.begin()->second);
12276 auto VMIt = std::next(ShuffleMask.begin());
12277 T *Prev = nullptr;
12278 SmallBitVector UseMask =
12279 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12280 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12281 if (!IsBaseUndef.all()) {
12282 // Base is not undef, need to combine it with the next subvectors.
12283 std::pair<T *, bool> Res =
12284 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12285 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12286 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12287 if (Mask[Idx] == PoisonMaskElem)
12288 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12289 else
12290 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12291 }
12292 auto *V = ValueSelect::get<T *>(Base);
12293 (void)V;
12294 assert((!V || GetVF(V) == Mask.size()) &&
12295 "Expected base vector of VF number of elements.");
12296 Prev = Action(Mask, {nullptr, Res.first});
12297 } else if (ShuffleMask.size() == 1) {
12298 // Base is undef and only 1 vector is shuffled - perform the action only for
12299 // a single vector, if the mask is not the identity mask.
12300 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12301 /*ForSingleMask=*/true);
12302 if (Res.second)
12303 // Identity mask is found.
12304 Prev = Res.first;
12305 else
12306 Prev = Action(Mask, {ShuffleMask.begin()->first});
12307 } else {
12308 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
12309 // shuffles step by step, combining shuffle between the steps.
12310 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12311 unsigned Vec2VF = GetVF(VMIt->first);
12312 if (Vec1VF == Vec2VF) {
12313 // No need to resize the input vectors since they are of the same size, we
12314 // can shuffle them directly.
12315 ArrayRef<int> SecMask = VMIt->second;
12316 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12317 if (SecMask[I] != PoisonMaskElem) {
12318 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12319 Mask[I] = SecMask[I] + Vec1VF;
12320 }
12321 }
12322 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12323 } else {
12324 // Vectors of different sizes - resize and reshuffle.
12325 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12326 /*ForSingleMask=*/false);
12327 std::pair<T *, bool> Res2 =
12328 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12329 ArrayRef<int> SecMask = VMIt->second;
12330 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12331 if (Mask[I] != PoisonMaskElem) {
12332 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12333 if (Res1.second)
12334 Mask[I] = I;
12335 } else if (SecMask[I] != PoisonMaskElem) {
12336 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12337 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12338 }
12339 }
12340 Prev = Action(Mask, {Res1.first, Res2.first});
12341 }
12342 VMIt = std::next(VMIt);
12343 }
12344 bool IsBaseNotUndef = !IsBaseUndef.all();
12345 (void)IsBaseNotUndef;
12346 // Perform requested actions for the remaining masks/vectors.
12347 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12348 // Shuffle other input vectors, if any.
12349 std::pair<T *, bool> Res =
12350 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12351 ArrayRef<int> SecMask = VMIt->second;
12352 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12353 if (SecMask[I] != PoisonMaskElem) {
12354 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12355 "Multiple uses of scalars.");
12356 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12357 } else if (Mask[I] != PoisonMaskElem) {
12358 Mask[I] = I;
12359 }
12360 }
12361 Prev = Action(Mask, {Prev, Res.first});
12362 }
12363 return Prev;
12364}
12365
12366namespace {
12367/// Data type for handling buildvector sequences with the reused scalars from
12368/// other tree entries.
12369template <typename T> struct ShuffledInsertData {
12370 /// List of insertelements to be replaced by shuffles.
12371 SmallVector<InsertElementInst *> InsertElements;
12372 /// The parent vectors and shuffle mask for the given list of inserts.
12373 MapVector<T, SmallVector<int>> ValueMasks;
12374};
12375} // namespace
12376
12377InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12378 InstructionCost Cost = 0;
12379 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12380 << VectorizableTree.size() << ".\n");
12381
12382 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12383
12384 SmallPtrSet<Value *, 4> CheckedExtracts;
12385 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12386 TreeEntry &TE = *VectorizableTree[I];
12387 // No need to count the cost for combined entries, they are combined and
12388 // just skip their cost.
12389 if (TE.State == TreeEntry::CombinedVectorize) {
12390 LLVM_DEBUG(
12391 dbgs() << "SLP: Skipping cost for combined node that starts with "
12392 << *TE.Scalars[0] << ".\n";
12393 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12394 continue;
12395 }
12396 if (TE.isGather()) {
12397 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12398 E && E->getVectorFactor() == TE.getVectorFactor() &&
12399 E->isSame(TE.Scalars)) {
12400 // Some gather nodes might be absolutely the same as some vectorizable
12401 // nodes after reordering, need to handle it.
12402 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12403 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12404 << "SLP: Current total cost = " << Cost << "\n");
12405 continue;
12406 }
12407 }
12408
12409 // Exclude the cost of gather load nodes which are not used. These nodes were
12410 // built as part of the final attempt to vectorize gathered loads.
12411 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12412 "Expected gather nodes with users only.");
12413
12414 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12415 Cost += C;
12416 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12417 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12418 << "SLP: Current total cost = " << Cost << "\n");
12419 }
12420
12421 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12422 InstructionCost ExtractCost = 0;
12423 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
12424 SmallVector<APInt> DemandedElts;
12425 SmallDenseSet<Value *, 4> UsedInserts;
12426 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
12427 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12428 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
12429 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12430 // Keep track of {Scalar, Index, User} tuples.
12431 // On AArch64, this helps fuse the mov instruction associated with an
12432 // extractelement with an fmul in the backend, so that the extractelement is free.
12433 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
12434 for (ExternalUser &EU : ExternalUses) {
12435 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12436 }
12437 for (ExternalUser &EU : ExternalUses) {
12438 // Uses by ephemeral values are free (because the ephemeral value will be
12439 // removed prior to code generation, and so the extraction will be
12440 // removed as well).
12441 if (EphValues.count(EU.User))
12442 continue;
12443
12444 // Skip uses in unreachable blocks, in EH pads (rarely executed), or in
12445 // blocks terminated by an unreachable instruction.
12446 if (BasicBlock *UserParent =
12447 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12448 UserParent &&
12449 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12450 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12451 continue;
12452
12453 // We only add extract cost once for the same scalar.
12454 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12455 !ExtractCostCalculated.insert(EU.Scalar).second)
12456 continue;
12457
12458 // No extract cost for vector "scalar"
12459 if (isa<FixedVectorType>(EU.Scalar->getType()))
12460 continue;
12461
12462 // If found user is an insertelement, do not calculate extract cost but try
12463 // to detect it as a final shuffled/identity match.
12464 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12465 VU && VU->getOperand(1) == EU.Scalar) {
12466 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12467 if (!UsedInserts.insert(VU).second)
12468 continue;
12469 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12470 if (InsertIdx) {
12471 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12472 auto *It = find_if(
12473 ShuffledInserts,
12474 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12475 // Checks if 2 insertelements are from the same buildvector.
12476 InsertElementInst *VecInsert = Data.InsertElements.front();
12477 return areTwoInsertFromSameBuildVector(
12478 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12479 Value *Op0 = II->getOperand(0);
12480 if (getTreeEntry(II) && !getTreeEntry(Op0))
12481 return nullptr;
12482 return Op0;
12483 });
12484 });
12485 int VecId = -1;
12486 if (It == ShuffledInserts.end()) {
12487 auto &Data = ShuffledInserts.emplace_back();
12488 Data.InsertElements.emplace_back(VU);
12489 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12490 VecId = ShuffledInserts.size() - 1;
12491 auto It = MinBWs.find(ScalarTE);
12492 if (It != MinBWs.end() &&
12493 VectorCasts
12494 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12495 .second) {
12496 unsigned BWSz = It->second.first;
12497 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12498 unsigned VecOpcode;
12499 if (DstBWSz < BWSz)
12500 VecOpcode = Instruction::Trunc;
12501 else
12502 VecOpcode =
12503 It->second.second ? Instruction::SExt : Instruction::ZExt;
12504 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12505 InstructionCost C = TTI->getCastInstrCost(
12506 VecOpcode, FTy,
12507 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12508 FTy->getNumElements()),
12509 TTI::CastContextHint::None, CostKind);
12510 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12511 << " for extending externally used vector with "
12512 "non-equal minimum bitwidth.\n");
12513 Cost += C;
12514 }
12515 } else {
12516 if (isFirstInsertElement(VU, It->InsertElements.front()))
12517 It->InsertElements.front() = VU;
12518 VecId = std::distance(ShuffledInserts.begin(), It);
12519 }
12520 int InIdx = *InsertIdx;
12521 SmallVectorImpl<int> &Mask =
12522 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12523 if (Mask.empty())
12524 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12525 Mask[InIdx] = EU.Lane;
12526 DemandedElts[VecId].setBit(InIdx);
12527 continue;
12528 }
12529 }
12530 }
12531
12532 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12533 // If we plan to rewrite the tree in a smaller type, we will need to sign
12534 // extend the extracted value back to the original type. Here, we account
12535 // for the extract and the added cost of the sign extend if needed.
12536 InstructionCost ExtraCost = TTI::TCC_Free;
12537 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12538 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12539 auto It = MinBWs.find(Entry);
12540 if (It != MinBWs.end()) {
12541 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12542 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12543 ? Instruction::ZExt
12544 : Instruction::SExt;
12545 VecTy = getWidenedType(MinTy, BundleWidth);
12546 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12547 VecTy, EU.Lane);
12548 } else {
12549 ExtraCost =
12550 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12551 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12552 }
12553 // Leave the scalar instructions as is if they are cheaper than extracts.
12554 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12555 Entry->getOpcode() == Instruction::Load) {
12556 // Checks if the user of the external scalar is a phi in the loop body.
12557 auto IsPhiInLoop = [&](const ExternalUser &U) {
12558 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12559 auto *I = cast<Instruction>(U.Scalar);
12560 const Loop *L = LI->getLoopFor(Phi->getParent());
12561 return L && (Phi->getParent() == I->getParent() ||
12562 L == LI->getLoopFor(I->getParent()));
12563 }
12564 return false;
12565 };
12566 if (!ValueToExtUses) {
12567 ValueToExtUses.emplace();
12568 for_each(enumerate(ExternalUses), [&](const auto &P) {
12569 // Ignore phis in loops.
12570 if (IsPhiInLoop(P.value()))
12571 return;
12572
12573 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12574 });
12575 }
12576 // We can use the original instruction if none of its operands are vectorized
12577 // or they are already marked as externally used.
12578 auto *Inst = cast<Instruction>(EU.Scalar);
12579 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12580 auto OperandIsScalar = [&](Value *V) {
12581 if (!getTreeEntry(V)) {
12582 // Some extractelements might be not vectorized, but
12583 // transformed into shuffle and removed from the function,
12584 // consider it here.
12585 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12586 return !EE->hasOneUse() || !MustGather.contains(EE);
12587 return true;
12588 }
12589 return ValueToExtUses->contains(V);
12590 };
12591 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12592 bool CanBeUsedAsScalarCast = false;
12593 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12594 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12595 Op && all_of(Op->operands(), OperandIsScalar)) {
12596 InstructionCost OpCost =
12597 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12598 ? TTI->getInstructionCost(Op, CostKind)
12599 : 0;
12600 if (ScalarCost + OpCost <= ExtraCost) {
12601 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12602 ScalarCost += OpCost;
12603 }
12604 }
12605 }
12606 if (CanBeUsedAsScalar) {
12607 bool KeepScalar = ScalarCost <= ExtraCost;
12608 // Try to keep the original scalar if the user is a phi node from the same
12609 // block as the root phis that are currently being vectorized. This preserves
12610 // better ordering info for the PHIs being vectorized.
12611 bool IsProfitablePHIUser =
12612 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12613 VectorizableTree.front()->Scalars.size() > 2)) &&
12614 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12615 !Inst->hasNUsesOrMore(UsesLimit) &&
12616 none_of(Inst->users(),
12617 [&](User *U) {
12618 auto *PHIUser = dyn_cast<PHINode>(U);
12619 return (!PHIUser ||
12620 PHIUser->getParent() !=
12621 cast<Instruction>(
12622 VectorizableTree.front()->getMainOp())
12623 ->getParent()) &&
12624 !getTreeEntry(U);
12625 }) &&
12626 count_if(Entry->Scalars, [&](Value *V) {
12627 return ValueToExtUses->contains(V);
12628 }) <= 2;
12629 if (IsProfitablePHIUser) {
12630 KeepScalar = true;
12631 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12632 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12633 (!GatheredLoadsEntriesFirst.has_value() ||
12634 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12635 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12636 return ValueToExtUses->contains(V);
12637 });
12638 auto It = ExtractsCount.find(Entry);
12639 if (It != ExtractsCount.end()) {
12640 assert(ScalarUsesCount >= It->getSecond().size() &&
12641 "Expected total number of external uses not less than "
12642 "number of scalar uses.");
12643 ScalarUsesCount -= It->getSecond().size();
12644 }
12645 // Keep the original scalar if the number of externally used instructions in
12646 // the same entry is not a power of 2. It may help to do some extra
12647 // vectorization for now.
12648 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12649 }
12650 if (KeepScalar) {
12651 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12652 for_each(Inst->operands(), [&](Value *V) {
12653 auto It = ValueToExtUses->find(V);
12654 if (It != ValueToExtUses->end()) {
12655 // Replace all uses to avoid compiler crash.
12656 ExternalUses[It->second].User = nullptr;
12657 }
12658 });
12659 ExtraCost = ScalarCost;
12660 if (!IsPhiInLoop(EU))
12661 ExtractsCount[Entry].insert(Inst);
12662 if (CanBeUsedAsScalarCast) {
12663 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12664 // Update the users of the operands of the cast operand to avoid
12665 // compiler crash.
12666 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12667 for_each(IOp->operands(), [&](Value *V) {
12668 auto It = ValueToExtUses->find(V);
12669 if (It != ValueToExtUses->end()) {
12670 // Replace all uses to avoid compiler crash.
12671 ExternalUses[It->second].User = nullptr;
12672 }
12673 });
12674 }
12675 }
12676 }
12677 }
12678 }
12679
12680 ExtractCost += ExtraCost;
12681 }
12682 // Insert external uses for the operands of casts that will be emitted as
12683 // scalars instead of extractelements.
12684 for (Value *V : ScalarOpsFromCasts) {
12685 ExternalUsesAsOriginalScalar.insert(V);
12686 if (const TreeEntry *E = getTreeEntry(V)) {
12687 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12688 }
12689 }
12690 // Add reduced value cost, if resized.
12691 if (!VectorizedVals.empty()) {
12692 const TreeEntry &Root = *VectorizableTree.front();
12693 auto BWIt = MinBWs.find(&Root);
12694 if (BWIt != MinBWs.end()) {
12695 Type *DstTy = Root.Scalars.front()->getType();
12696 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12697 unsigned SrcSz =
12698 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12699 if (OriginalSz != SrcSz) {
12700 unsigned Opcode = Instruction::Trunc;
12701 if (OriginalSz > SrcSz)
12702 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12703 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12704 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12705 assert(SLPReVec && "Only supported by REVEC.");
12706 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12707 }
12708 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12709 TTI::CastContextHint::None,
12710 TTI::TCK_RecipThroughput);
12711 }
12712 }
12713 }
12714
12715 InstructionCost SpillCost = getSpillCost();
12716 Cost += SpillCost + ExtractCost;
12717 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12718 bool) {
12719 InstructionCost C = 0;
12720 unsigned VF = Mask.size();
12721 unsigned VecVF = TE->getVectorFactor();
12722 if (VF != VecVF &&
12723 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12724 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12725 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12726 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12727 OrigMask.begin());
12728 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12729 getWidenedType(TE->getMainOp()->getType(), VecVF),
12730 OrigMask);
12731 LLVM_DEBUG(
12732 dbgs() << "SLP: Adding cost " << C
12733 << " for final shuffle of insertelement external users.\n";
12734 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12735 Cost += C;
12736 return std::make_pair(TE, true);
12737 }
12738 return std::make_pair(TE, false);
12739 };
12740 // Calculate the cost of the reshuffled vectors, if any.
12741 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12742 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12743 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12744 unsigned VF = 0;
12745 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12746 ArrayRef<const TreeEntry *> TEs) {
12747 assert((TEs.size() == 1 || TEs.size() == 2) &&
12748 "Expected exactly 1 or 2 tree entries.");
12749 if (TEs.size() == 1) {
12750 if (VF == 0)
12751 VF = TEs.front()->getVectorFactor();
12752 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12753 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12754 !all_of(enumerate(Mask), [=](const auto &Data) {
12755 return Data.value() == PoisonMaskElem ||
12756 (Data.index() < VF &&
12757 static_cast<int>(Data.index()) == Data.value());
12758 })) {
12759 InstructionCost C =
12760 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
12761 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12762 << " for final shuffle of insertelement "
12763 "external users.\n";
12764 TEs.front()->dump();
12765 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12766 Cost += C;
12767 }
12768 } else {
12769 if (VF == 0) {
12770 if (TEs.front() &&
12771 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12772 VF = TEs.front()->getVectorFactor();
12773 else
12774 VF = Mask.size();
12775 }
12776 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12777 InstructionCost C =
12778 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
12779 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12780 << " for final shuffle of vector node and external "
12781 "insertelement users.\n";
12782 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12783 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12784 Cost += C;
12785 }
12786 VF = Mask.size();
12787 return TEs.back();
12788 };
12789 (void)performExtractsShuffleAction<const TreeEntry>(
12790 MutableArrayRef(Vector.data(), Vector.size()), Base,
12791 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12792 EstimateShufflesCost);
12793 InstructionCost InsertCost = TTI->getScalarizationOverhead(
12794 cast<FixedVectorType>(
12795 ShuffledInserts[I].InsertElements.front()->getType()),
12796 DemandedElts[I],
12797 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12798 Cost -= InsertCost;
12799 }
12800
12801 // Add the cost for reduced value resize (if required).
12802 if (ReductionBitWidth != 0) {
12803 assert(UserIgnoreList && "Expected reduction tree.");
12804 const TreeEntry &E = *VectorizableTree.front();
12805 auto It = MinBWs.find(&E);
12806 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12807 unsigned SrcSize = It->second.first;
12808 unsigned DstSize = ReductionBitWidth;
12809 unsigned Opcode = Instruction::Trunc;
12810 if (SrcSize < DstSize) {
12811 bool IsArithmeticExtendedReduction =
12812 all_of(*UserIgnoreList, [](Value *V) {
12813 auto *I = cast<Instruction>(V);
12814 return is_contained({Instruction::Add, Instruction::FAdd,
12815 Instruction::Mul, Instruction::FMul,
12816 Instruction::And, Instruction::Or,
12817 Instruction::Xor},
12818 I->getOpcode());
12819 });
12820 if (IsArithmeticExtendedReduction)
12821 Opcode =
12822 Instruction::BitCast; // Handle it by getExtendedReductionCost
12823 else
12824 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12825 }
12826 if (Opcode != Instruction::BitCast) {
12827 auto *SrcVecTy =
12828 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12829 auto *DstVecTy =
12830 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12831 TTI::CastContextHint CCH = getCastContextHint(E);
12832 InstructionCost CastCost;
12833 switch (E.getOpcode()) {
12834 case Instruction::SExt:
12835 case Instruction::ZExt:
12836 case Instruction::Trunc: {
12837 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12838 CCH = getCastContextHint(*OpTE);
12839 break;
12840 }
12841 default:
12842 break;
12843 }
12844 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12845 TTI::TCK_RecipThroughput);
12846 Cost += CastCost;
12847 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12848 << " for final resize for reduction from " << SrcVecTy
12849 << " to " << DstVecTy << "\n";
12850 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12851 }
12852 }
12853 }
12854
12855#ifndef NDEBUG
12856 SmallString<256> Str;
12857 {
12858 raw_svector_ostream OS(Str);
12859 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12860 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12861 << "SLP: Total Cost = " << Cost << ".\n";
12862 }
12863 LLVM_DEBUG(dbgs() << Str);
12864 if (ViewSLPTree)
12865 ViewGraph(this, "SLP" + F->getName(), false, Str);
12866#endif
12867
12868 return Cost;
12869}
12870
12871/// Tries to find extractelement instructions with constant indices from a fixed
12872/// vector type and gathers such instructions into a group, which can most
12873/// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12874/// successful, the matched scalars are replaced by poison values in \p VL for
12875/// future analysis.
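/// Illustrative example (not from the original source): for
/// VL = { extractelement %v, 1, extractelement %v, 3, %a, %b } with a
/// <4 x i32> %v, the two extracts can be covered by a single-source shuffle of
/// %v, so they are replaced by poison in \p VL and the mask records lanes 1
/// and 3.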
12876std::optional<TTI::ShuffleKind>
12877BoUpSLP::tryToGatherSingleRegisterExtractElements(
12878 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
12879 // Scan list of gathered scalars for extractelements that can be represented
12880 // as shuffles.
12881 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
12882 SmallVector<int> UndefVectorExtracts;
12883 for (int I = 0, E = VL.size(); I < E; ++I) {
12884 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12885 if (!EI) {
12886 if (isa<UndefValue>(VL[I]))
12887 UndefVectorExtracts.push_back(I);
12888 continue;
12889 }
12890 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12891 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12892 continue;
12893 std::optional<unsigned> Idx = getExtractIndex(EI);
12894 // Undefined index.
12895 if (!Idx) {
12896 UndefVectorExtracts.push_back(I);
12897 continue;
12898 }
12899 if (Idx >= VecTy->getNumElements()) {
12900 UndefVectorExtracts.push_back(I);
12901 continue;
12902 }
12903 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12904 ExtractMask.reset(*Idx);
12905 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12906 UndefVectorExtracts.push_back(I);
12907 continue;
12908 }
12909 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12910 }
12911 // Sort the vector operands by the maximum number of uses in extractelements.
12912 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
12913 VectorOpToIdx.takeVector();
12914 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12915 return P1.second.size() > P2.second.size();
12916 });
12917 // Find the best pair of the vectors or a single vector.
12918 const int UndefSz = UndefVectorExtracts.size();
12919 unsigned SingleMax = 0;
12920 unsigned PairMax = 0;
12921 if (!Vectors.empty()) {
12922 SingleMax = Vectors.front().second.size() + UndefSz;
12923 if (Vectors.size() > 1) {
12924 auto *ItNext = std::next(Vectors.begin());
12925 PairMax = SingleMax + ItNext->second.size();
12926 }
12927 }
12928 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12929 return std::nullopt;
12930 // Check whether it is better to perform a shuffle of 2 vectors or just of a
12931 // single vector.
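  // (Illustrative note, not from the original source: if 2 of 4 scalars extract
  // from %v0, 1 from %v1 and 1 is undef, then SingleMax = 3 and PairMax = 4, so
  // the two-vector variant is preferred below.)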
12932 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
12933 SmallVector<Value *> GatheredExtracts(
12934 VL.size(), PoisonValue::get(VL.front()->getType()));
12935 if (SingleMax >= PairMax && SingleMax) {
12936 for (int Idx : Vectors.front().second)
12937 std::swap(GatheredExtracts[Idx], VL[Idx]);
12938 } else if (!Vectors.empty()) {
12939 for (unsigned Idx : {0, 1})
12940 for (int Idx : Vectors[Idx].second)
12941 std::swap(GatheredExtracts[Idx], VL[Idx]);
12942 }
12943 // Add extracts from undefs too.
12944 for (int Idx : UndefVectorExtracts)
12945 std::swap(GatheredExtracts[Idx], VL[Idx]);
12946 // Check that gather of extractelements can be represented as just a
12947 // shuffle of a single/two vectors the scalars are extracted from.
12948 std::optional<TTI::ShuffleKind> Res =
12949 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
12950 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
12951 // TODO: try to check other subsets if possible.
12952 // Restore the original VL if attempt was not successful.
12953 copy(SavedVL, VL.begin());
12954 return std::nullopt;
12955 }
12956 // Restore unused scalars from mask, if some of the extractelements were not
12957 // selected for shuffle.
12958 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
12959 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
12960 isa<UndefValue>(GatheredExtracts[I])) {
12961 std::swap(VL[I], GatheredExtracts[I]);
12962 continue;
12963 }
12964 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12965 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
12966 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
12967 is_contained(UndefVectorExtracts, I))
12968 continue;
12969 }
12970 return Res;
12971}
12972
12973/// Tries to find extractelement instructions with constant indices from a fixed
12974/// vector type and gathers such instructions into a group, which can most
12975/// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12976/// successful, the matched scalars are replaced by poison values in \p VL for
12977/// future analysis.
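/// Illustrative note (not from the original source): the scan is done per
/// register-sized slice, e.g. with VL.size() == 8 and NumParts == 2 each
/// 4-element slice gets its own shuffle-kind entry in the returned vector.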
12978SmallVector<std::optional<TTI::ShuffleKind>>
12979BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
12980 SmallVectorImpl<int> &Mask,
12981 unsigned NumParts) const {
12982 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
12983 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
12984 Mask.assign(VL.size(), PoisonMaskElem);
12985 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
12986 for (unsigned Part : seq<unsigned>(NumParts)) {
12987 // Scan list of gathered scalars for extractelements that can be represented
12988 // as shuffles.
12989 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
12990 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
12991 SmallVector<int> SubMask;
12992 std::optional<TTI::ShuffleKind> Res =
12993 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
12994 ShufflesRes[Part] = Res;
12995 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
12996 }
12997 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
12998 return Res.has_value();
12999 }))
13000 ShufflesRes.clear();
13001 return ShufflesRes;
13002}
13003
13004std::optional<TargetTransformInfo::ShuffleKind>
13005BoUpSLP::isGatherShuffledSingleRegisterEntry(
13006 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13007 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13008 Entries.clear();
13009 // TODO: currently checking only for Scalars in the tree entry, need to count
13010 // reused elements too for better cost estimation.
13011 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13012 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13013 : TE->UserTreeIndices.front();
13014 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13015 const BasicBlock *TEInsertBlock = nullptr;
13016 // Main node of PHI entries keeps the correct order of operands/incoming
13017 // blocks.
13018 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13019 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13020 TEInsertPt = TEInsertBlock->getTerminator();
13021 } else {
13022 TEInsertBlock = TEInsertPt->getParent();
13023 }
13024 if (!DT->isReachableFromEntry(TEInsertBlock))
13025 return std::nullopt;
13026 auto *NodeUI = DT->getNode(TEInsertBlock);
13027 assert(NodeUI && "Should only process reachable instructions");
13028 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13029 auto CheckOrdering = [&](const Instruction *InsertPt) {
13030 // Argument InsertPt is an instruction where vector code for some other
13031 // tree entry (one that shares one or more scalars with TE) is going to be
13032 // generated. This lambda returns true if insertion point of vector code
13033 // for the TE dominates that point (otherwise dependency is the other way
13034 // around). The other node is not limited to be of a gather kind. Gather
13035 // nodes are not scheduled and their vector code is inserted before their
13036 // first user. If user is PHI, that is supposed to be at the end of a
13037 // predecessor block. Otherwise it is the last instruction among scalars of
13038 // the user node. So, instead of checking dependency between instructions
13039 // themselves, we check dependency between their insertion points for vector
13040 // code (since each scalar instruction ends up as a lane of a vector
13041 // instruction).
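    // (Illustrative note, not from the original source: e.g. if the other entry
    // feeds a PHI, its insertion point is the terminator of the corresponding
    // incoming block, and that is what gets compared against this entry's
    // insertion point below.)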
13042 const BasicBlock *InsertBlock = InsertPt->getParent();
13043 auto *NodeEUI = DT->getNode(InsertBlock);
13044 if (!NodeEUI)
13045 return false;
13046 assert((NodeUI == NodeEUI) ==
13047 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13048 "Different nodes should have different DFS numbers");
13049 // Check the order of the gather nodes users.
13050 if (TEInsertPt->getParent() != InsertBlock &&
13051 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13052 return false;
13053 if (TEInsertPt->getParent() == InsertBlock &&
13054 TEInsertPt->comesBefore(InsertPt))
13055 return false;
13056 return true;
13057 };
13058 // Find all tree entries used by the gathered values. If no common entries
13059 // found - not a shuffle.
13060 // Here we build a set of tree nodes for each gathered value and try to
13061 // find the intersection between these sets. If we have at least one common
13062 // tree node for each gathered value - we have just a permutation of a
13063 // single vector. If we have 2 different sets, we're in a situation where we
13064 // have a permutation of 2 input vectors.
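  // (Illustrative note, not from the original source: if every scalar of VL is
  // also found in vectorized entry E0, the gather becomes a single-source
  // shuffle of E0's vector; if the scalars split between E0 and E1, it becomes
  // a two-source shuffle of those two vectors.)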
13065 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
13066 DenseMap<Value *, int> UsedValuesEntry;
13067 for (Value *V : VL) {
13068 if (isConstant(V))
13069 continue;
13070 // Build a list of tree entries where V is used.
13071 SmallPtrSet<const TreeEntry *, 4> VToTEs;
13072 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13073 if (TEPtr == TE || TEPtr->Idx == 0)
13074 continue;
13075 assert(any_of(TEPtr->Scalars,
13076 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13077 "Must contain at least single gathered value.");
13078 assert(TEPtr->UserTreeIndices.size() == 1 &&
13079 "Expected only single user of a gather node.");
13080 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13081
13082 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13083 const Instruction *InsertPt =
13084 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13085 : &getLastInstructionInBundle(UseEI.UserTE);
13086 if (TEInsertPt == InsertPt) {
13087 // If 2 gathers are operands of the same entry (regardless of whether
13088 // the user is a PHI or not), compare operand indices and use the earlier
13089 // one as the base.
13090 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13091 continue;
13092 // If the user instruction is used for some reason in different
13093 // vectorized nodes - make it depend on index.
13094 if (TEUseEI.UserTE != UseEI.UserTE &&
13095 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13096 continue;
13097 }
13098
13099 // Check if the user node of the TE comes after user node of TEPtr,
13100 // otherwise TEPtr depends on TE.
13101 if ((TEInsertBlock != InsertPt->getParent() ||
13102 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13103 !CheckOrdering(InsertPt))
13104 continue;
13105 VToTEs.insert(TEPtr);
13106 }
13107 if (const TreeEntry *VTE = getTreeEntry(V)) {
13108 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13109 if (VTE->State != TreeEntry::Vectorize) {
13110 auto It = MultiNodeScalars.find(V);
13111 if (It == MultiNodeScalars.end())
13112 continue;
13113 VTE = *It->getSecond().begin();
13114 // Iterate through all vectorized nodes.
13115 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13116 return MTE->State == TreeEntry::Vectorize;
13117 });
13118 if (MIt == It->getSecond().end())
13119 continue;
13120 VTE = *MIt;
13121 }
13122 }
13123 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13124 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13125 continue;
13126 VToTEs.insert(VTE);
13127 }
13128 if (VToTEs.empty())
13129 continue;
13130 if (UsedTEs.empty()) {
13131 // The first iteration, just insert the list of nodes to vector.
13132 UsedTEs.push_back(VToTEs);
13133 UsedValuesEntry.try_emplace(V, 0);
13134 } else {
13135 // Need to check if there are any previously used tree nodes which use V.
13136 // If there are no such nodes, consider that we have another input
13137 // vector.
13138 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13139 unsigned Idx = 0;
13140 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13141 // Do we have a non-empty intersection of previously listed tree entries
13142 // and tree entries using current V?
13143 set_intersect(VToTEs, Set);
13144 if (!VToTEs.empty()) {
13145 // Yes, write the new subset and continue analysis for the next
13146 // scalar.
13147 Set.swap(VToTEs);
13148 break;
13149 }
13150 VToTEs = SavedVToTEs;
13151 ++Idx;
13152 }
13153 // No non-empty intersection found - need to add a second set of possible
13154 // source vectors.
13155 if (Idx == UsedTEs.size()) {
13156 // If the number of input vectors is greater than 2 - not a permutation,
13157 // fallback to the regular gather.
13158 // TODO: support multiple reshuffled nodes.
13159 if (UsedTEs.size() == 2)
13160 continue;
13161 UsedTEs.push_back(SavedVToTEs);
13162 Idx = UsedTEs.size() - 1;
13163 }
13164 UsedValuesEntry.try_emplace(V, Idx);
13165 }
13166 }
13167
13168 if (UsedTEs.empty()) {
13169 Entries.clear();
13170 return std::nullopt;
13171 }
13172
13173 unsigned VF = 0;
13174 if (UsedTEs.size() == 1) {
13175 // Keep the order to avoid non-determinism.
13176 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13177 UsedTEs.front().end());
13178 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13179 return TE1->Idx < TE2->Idx;
13180 });
13181 // Try to find the perfect match in another gather node at first.
13182 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13183 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13184 });
13185 if (It != FirstEntries.end() &&
13186 ((*It)->getVectorFactor() == VL.size() ||
13187 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13188 TE->ReuseShuffleIndices.size() == VL.size() &&
13189 (*It)->isSame(TE->Scalars)))) {
13190 Entries.push_back(*It);
13191 if ((*It)->getVectorFactor() == VL.size()) {
13192 std::iota(std::next(Mask.begin(), Part * VL.size()),
13193 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13194 } else {
13195 SmallVector<int> CommonMask = TE->getCommonMask();
13196 copy(CommonMask, Mask.begin());
13197 }
13198 // Clear undef scalars.
13199 for (unsigned I : seq<unsigned>(VL.size()))
13200 if (isa<PoisonValue>(VL[I]))
13201 Mask[Part * VL.size() + I] = PoisonMaskElem;
13202 return TargetTransformInfo::SK_PermuteSingleSrc;
13203 }
13204 // No perfect match, just shuffle, so choose the first tree node from the
13205 // tree.
13206 Entries.push_back(FirstEntries.front());
13207 VF = FirstEntries.front()->getVectorFactor();
13208 } else {
13209 // Try to find nodes with the same vector factor.
13210 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13211 // Keep the order of tree nodes to avoid non-determinism.
13212 DenseMap<unsigned, const TreeEntry *> VFToTE;
13213 for (const TreeEntry *TE : UsedTEs.front()) {
13214 unsigned VF = TE->getVectorFactor();
13215 auto It = VFToTE.find(VF);
13216 if (It != VFToTE.end()) {
13217 if (It->second->Idx > TE->Idx)
13218 It->getSecond() = TE;
13219 continue;
13220 }
13221 VFToTE.try_emplace(VF, TE);
13222 }
13223 // Same, keep the order to avoid non-determinism.
13224 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13225 UsedTEs.back().end());
13226 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13227 return TE1->Idx < TE2->Idx;
13228 });
13229 for (const TreeEntry *TE : SecondEntries) {
13230 auto It = VFToTE.find(TE->getVectorFactor());
13231 if (It != VFToTE.end()) {
13232 VF = It->first;
13233 Entries.push_back(It->second);
13234 Entries.push_back(TE);
13235 break;
13236 }
13237 }
13238 // No 2 source vectors with the same vector factor - just choose 2 with max
13239 // index.
13240 if (Entries.empty()) {
13241 Entries.push_back(*llvm::max_element(
13242 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13243 return TE1->Idx < TE2->Idx;
13244 }));
13245 Entries.push_back(SecondEntries.front());
13246 VF = std::max(Entries.front()->getVectorFactor(),
13247 Entries.back()->getVectorFactor());
13248 } else {
13249 VF = Entries.front()->getVectorFactor();
13250 }
13251 }
13252
13253 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13254 // Checks if the 2 PHIs are compatible in terms of having a high chance of
13255 // being vectorized together.
13256 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13257 auto *PHI = cast<PHINode>(V);
13258 auto *PHI1 = cast<PHINode>(V1);
13259 // Check that all incoming values are compatible/from the same parent (if
13260 // they are instructions).
13261 // The incoming values are compatible if they are all constants, or
13262 // instructions with the same/alternate opcodes from the same basic block.
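    // (Illustrative note, not from the original source: e.g. two PHIs whose
    // incoming values for every predecessor are either constants or add
    // instructions defined in that same predecessor are considered compatible.)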
13263 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13264 Value *In = PHI->getIncomingValue(I);
13265 Value *In1 = PHI1->getIncomingValue(I);
13266 if (isConstant(In) && isConstant(In1))
13267 continue;
13268 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
13269 return false;
13270 if (cast<Instruction>(In)->getParent() !=
13271 cast<Instruction>(In1)->getParent())
13272 return false;
13273 }
13274 return true;
13275 };
13276 // Check if the value can be ignored during analysis for shuffled gathers.
13277 // We suppose it is better to ignore instructions which do not form splats,
13278 // are not vectorized and are not extractelements (those will be handled by
13279 // the extractelement processing), or which may form a vector node in future.
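  // (Illustrative note, not from the original source: e.g. a scalar add in VL
  // that is not vectorized yet but has a neighbouring add from the same block
  // is skipped here, so the pair can still form its own vector node later.)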
13280 auto MightBeIgnored = [=](Value *V) {
13281 auto *I = dyn_cast<Instruction>(V);
13282 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13283 !isVectorLikeInstWithConstOps(I) &&
13284 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13285 };
13286 // Check that the neighbor instruction may form a full vector node with the
13287 // current instruction V. It is possible if they have the same/alternate
13288 // opcode and the same parent basic block.
13289 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13290 Value *V1 = VL[Idx];
13291 bool UsedInSameVTE = false;
13292 auto It = UsedValuesEntry.find(V1);
13293 if (It != UsedValuesEntry.end())
13294 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13295 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13296 getSameOpcode({V, V1}, *TLI).getOpcode() &&
13297 cast<Instruction>(V)->getParent() ==
13298 cast<Instruction>(V1)->getParent() &&
13299 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13300 };
13301 // Build a shuffle mask for better cost estimation and vector emission.
13302 SmallBitVector UsedIdxs(Entries.size());
13303 SmallVector<std::pair<unsigned, int>> EntryLanes;
13304 for (int I = 0, E = VL.size(); I < E; ++I) {
13305 Value *V = VL[I];
13306 auto It = UsedValuesEntry.find(V);
13307 if (It == UsedValuesEntry.end())
13308 continue;
13309 // Do not try to shuffle scalars if they are constants, or instructions
13310 // that can be vectorized as a result of subsequent buildvector
13311 // vectorization.
13312 if (isConstant(V) || (MightBeIgnored(V) &&
13313 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13314 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13315 continue;
13316 unsigned Idx = It->second;
13317 EntryLanes.emplace_back(Idx, I);
13318 UsedIdxs.set(Idx);
13319 }
13320 // Iterate through all shuffled scalars and select entries, which can be used
13321 // for final shuffle.
13322 SmallVector<const TreeEntry *> TempEntries;
13323 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13324 if (!UsedIdxs.test(I))
13325 continue;
13326 // Fix the entry number for the given scalar. If it is the first entry, set
13327 // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
13328 // These indices are used when calculating the final shuffle mask as the
13329 // vector offset.
13330 for (std::pair<unsigned, int> &Pair : EntryLanes)
13331 if (Pair.first == I)
13332 Pair.first = TempEntries.size();
13333 TempEntries.push_back(Entries[I]);
13334 }
13335 Entries.swap(TempEntries);
13336 if (EntryLanes.size() == Entries.size() &&
13337 !VL.equals(ArrayRef(TE->Scalars)
13338 .slice(Part * VL.size(),
13339 std::min<int>(VL.size(), TE->Scalars.size())))) {
13340 // We may have only 1 or 2 entries here. If the number of scalars is equal
13341 // to the number of entries, there is no need to do the analysis, it is not
13342 // very profitable. Since VL is not the same as TE->Scalars, it means we
13343 // already have some shuffles before. Cut off the non-profitable case.
13344 Entries.clear();
13345 return std::nullopt;
13346 }
13347 // Build the final mask, check for the identity shuffle, if possible.
13348 bool IsIdentity = Entries.size() == 1;
13349 // Pair.first is the offset to the vector, while Pair.second is the index of
13350 // scalar in the list.
13351 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13352 unsigned Idx = Part * VL.size() + Pair.second;
13353 Mask[Idx] =
13354 Pair.first * VF +
13355 (ForOrder ? std::distance(
13356 Entries[Pair.first]->Scalars.begin(),
13357 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13358 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13359 IsIdentity &= Mask[Idx] == Pair.second;
13360 }
13361 if (ForOrder || IsIdentity || Entries.empty()) {
13362 switch (Entries.size()) {
13363 case 1:
13364 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13365 return TargetTransformInfo::SK_PermuteSingleSrc;
13366 break;
13367 case 2:
13368 if (EntryLanes.size() > 2 || VL.size() <= 2)
13369 return TargetTransformInfo::SK_PermuteTwoSrc;
13370 break;
13371 default:
13372 break;
13373 }
13374 } else if (!isa<VectorType>(VL.front()->getType()) &&
13375 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13376 // Do the cost estimation if the shuffle is more beneficial than a buildvector.
13377 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13378 std::next(Mask.begin(), (Part + 1) * VL.size()));
13379 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13380 for (int Idx : SubMask) {
13381 if (Idx == PoisonMaskElem)
13382 continue;
13383 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13384 MinElement = Idx;
13385 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13386 MaxElement = Idx;
13387 }
13388 assert(MaxElement >= 0 && MinElement >= 0 &&
13389 MaxElement % VF >= MinElement % VF &&
13390 "Expected at least single element.");
13391 unsigned NewVF = std::max<unsigned>(
13392 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13393 (MaxElement % VF) -
13394 (MinElement % VF) + 1));
13395 if (NewVF < VF) {
13396 for_each(SubMask, [&](int &Idx) {
13397 if (Idx == PoisonMaskElem)
13398 return;
13399 Idx = (Idx % VF) - (MinElement % VF) +
13400 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13401 });
13402 VF = NewVF;
13403 }
13404
13405 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13406 auto *VecTy = getWidenedType(VL.front()->getType(), VF);
13407 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13408 auto GetShuffleCost = [&,
13409 &TTI = *TTI](ArrayRef<int> Mask,
13410 ArrayRef<const TreeEntry *> Entries,
13411 VectorType *VecTy) -> InstructionCost {
13412 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13413 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13414 Mask, Entries.front()->getInterleaveFactor()))
13415 return TTI::TCC_Free;
13416 return ::getShuffleCost(TTI,
13417 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13418 : TTI::SK_PermuteSingleSrc,
13419 VecTy, Mask, CostKind);
13420 };
13421 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13422 InstructionCost FirstShuffleCost = 0;
13423 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13424 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13425 FirstShuffleCost = ShuffleCost;
13426 } else {
13427 // Transform the mask to include only the first entry.
13428 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13429 bool IsIdentity = true;
13430 for (auto [I, Idx] : enumerate(FirstMask)) {
13431 if (Idx >= static_cast<int>(VF)) {
13432 Idx = PoisonMaskElem;
13433 } else {
13434 DemandedElts.clearBit(I);
13435 if (Idx != PoisonMaskElem)
13436 IsIdentity &= static_cast<int>(I) == Idx;
13437 }
13438 }
13439 if (!IsIdentity)
13440 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13441 FirstShuffleCost += TTI->getScalarizationOverhead(
13442 MaskVecTy, DemandedElts, /*Insert=*/true,
13443 /*Extract=*/false, CostKind);
13444 }
13445 InstructionCost SecondShuffleCost = 0;
13446 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13447 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13448 SecondShuffleCost = ShuffleCost;
13449 } else {
13450 // Transform the mask to include only the second entry.
13451 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13452 bool IsIdentity = true;
13453 for (auto [I, Idx] : enumerate(SecondMask)) {
13454 if (Idx < static_cast<int>(VF) && Idx >= 0) {
13455 Idx = PoisonMaskElem;
13456 } else {
13457 DemandedElts.clearBit(I);
13458 if (Idx != PoisonMaskElem) {
13459 Idx -= VF;
13460 IsIdentity &= static_cast<int>(I) == Idx;
13461 }
13462 }
13463 }
13464 if (!IsIdentity)
13465 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13466 SecondShuffleCost += TTI->getScalarizationOverhead(
13467 MaskVecTy, DemandedElts, /*Insert=*/true,
13468 /*Extract=*/false, CostKind);
13469 }
13470 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13471 for (auto [I, Idx] : enumerate(SubMask))
13472 if (Idx == PoisonMaskElem)
13473 DemandedElts.clearBit(I);
13474 InstructionCost BuildVectorCost =
13475 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13476 /*Extract=*/false, CostKind);
13477 const TreeEntry *BestEntry = nullptr;
13478 if (FirstShuffleCost < ShuffleCost) {
13479 copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
13480 BestEntry = Entries.front();
13481 ShuffleCost = FirstShuffleCost;
13482 }
13483 if (SecondShuffleCost < ShuffleCost) {
13484 copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
13485 BestEntry = Entries[1];
13486 ShuffleCost = SecondShuffleCost;
13487 }
13488 if (BuildVectorCost >= ShuffleCost) {
13489 if (BestEntry) {
13490 Entries.clear();
13491 Entries.push_back(BestEntry);
13492 }
13493 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13494 : TargetTransformInfo::SK_PermuteSingleSrc;
13495 }
13496 }
13497 Entries.clear();
13498 // Clear the corresponding mask elements.
13499 std::fill(std::next(Mask.begin(), Part * VL.size()),
13500 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13501 return std::nullopt;
13502}
13503
13504SmallVector<std::optional<TTI::ShuffleKind>>
13505BoUpSLP::isGatherShuffledEntry(
13506 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13507 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13508 bool ForOrder) {
13509 assert(NumParts > 0 && NumParts < VL.size() &&
13510 "Expected positive number of registers.");
13511 Entries.clear();
13512 // No need to check for the topmost gather node.
13513 if (TE == VectorizableTree.front().get() &&
13514 (!GatheredLoadsEntriesFirst.has_value() ||
13515 none_of(ArrayRef(VectorizableTree).drop_front(),
13516 [](const std::unique_ptr<TreeEntry> &TE) {
13517 return !TE->isGather();
13518 })))
13519 return {};
13520 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13521 if (TE->isNonPowOf2Vec())
13522 return {};
13523 Mask.assign(VL.size(), PoisonMaskElem);
13524 assert((TE->UserTreeIndices.size() == 1 ||
13525 TE == VectorizableTree.front().get()) &&
13526 "Expected only single user of the gather node.");
13527 assert(VL.size() % NumParts == 0 &&
13528 "Number of scalars must be divisible by NumParts.");
13529 if (!TE->UserTreeIndices.empty() &&
13530 TE->UserTreeIndices.front().UserTE->isGather() &&
13531 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13532 assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
13533 isSplat(TE->Scalars)) &&
13534 "Expected splat or extractelements only node.");
13535 return {};
13536 }
13537 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
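// Each register-wide part of VL is matched independently: part Part fills
// Mask[Part * SliceSize ..) and records its candidate tree entries in
// Entries[Part]. E.g. with VL.size() == 8 and NumParts == 2, part 0 covers
// scalars 0..3 and part 1 covers scalars 4..7.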
13538 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13539 for (unsigned Part : seq<unsigned>(NumParts)) {
13540 ArrayRef<Value *> SubVL =
13541 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13542 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13543 std::optional<TTI::ShuffleKind> SubRes =
13544 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13545 ForOrder);
13546 if (!SubRes)
13547 SubEntries.clear();
13548 Res.push_back(SubRes);
13549 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13550 SubEntries.front()->getVectorFactor() == VL.size() &&
13551 (SubEntries.front()->isSame(TE->Scalars) ||
13552 SubEntries.front()->isSame(VL))) {
13553 SmallVector<const TreeEntry *> LocalSubEntries;
13554 LocalSubEntries.swap(SubEntries);
13555 Entries.clear();
13556 Res.clear();
13557 std::iota(Mask.begin(), Mask.end(), 0);
13558 // Clear undef scalars.
13559 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13560 if (isa<PoisonValue>(VL[I]))
13561 Mask[I] = PoisonMaskElem;
13562 Entries.emplace_back(1, LocalSubEntries.front());
13563 Res.push_back(TTI::SK_PermuteSingleSrc);
13564 return Res;
13565 }
13566 }
13567 if (all_of(Res,
13568 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13569 Entries.clear();
13570 return {};
13571 }
13572 return Res;
13573}
13574
13575InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13576 Type *ScalarTy) const {
13577 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13578 bool DuplicateNonConst = false;
13579 // Find the cost of inserting/extracting values from the vector.
13580 // Check if the same elements are inserted several times and count them as
13581 // shuffle candidates.
13582 APInt ShuffledElements = APInt::getZero(VL.size());
13583 DenseMap<Value *, unsigned> UniqueElements;
13584 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13585 InstructionCost Cost;
13586 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13587 if (V->getType() != ScalarTy) {
13588 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13589 TTI::CastContextHint::None, CostKind);
13590 V = nullptr;
13591 }
13592 if (!ForPoisonSrc)
13593 Cost +=
13594 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13595 I, Constant::getNullValue(VecTy), V);
13596 };
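// E.g. (illustrative values) for VL = {%a, %b, %a, poison} only %a and %b are
// treated as unique inserts; the repeated %a and the poison lane are marked in
// ShuffledElements, ShuffleMask becomes {0, 1, 0, -1}, and a single final
// permute covers the duplicates.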
13597 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13598 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13599 Value *V = VL[I];
13600 // No need to shuffle duplicates for constants.
13601 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13602 ShuffledElements.setBit(I);
13603 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13604 continue;
13605 }
13606
13607 auto Res = UniqueElements.try_emplace(V, I);
13608 if (Res.second) {
13609 EstimateInsertCost(I, V);
13610 ShuffleMask[I] = I;
13611 continue;
13612 }
13613
13614 DuplicateNonConst = true;
13615 ShuffledElements.setBit(I);
13616 ShuffleMask[I] = Res.first->second;
13617 }
13618 if (ForPoisonSrc) {
13619 if (isa<FixedVectorType>(ScalarTy)) {
13620 assert(SLPReVec && "Only supported by REVEC.");
13621 // We don't need to insert elements one by one. Instead, we can insert the
13622 // entire vector into the destination.
13623 Cost = 0;
13624 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13625 for (unsigned I : seq<unsigned>(VL.size()))
13626 if (!ShuffledElements[I])
13627 Cost += TTI->getShuffleCost(
13628 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13629 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13630 } else {
13631 Cost = TTI->getScalarizationOverhead(VecTy,
13632 /*DemandedElts*/ ~ShuffledElements,
13633 /*Insert*/ true,
13634 /*Extract*/ false, CostKind, VL);
13635 }
13636 }
13637 if (DuplicateNonConst)
13638 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13639 VecTy, ShuffleMask);
13640 return Cost;
13641}
13642
13643Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13644 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13645 if (Res)
13646 return *Res;
13647 // Get the basic block this bundle is in. All instructions in the bundle
13648 // should be in this block (except for extractelement-like instructions with
13649 // constant indices or gathered loads).
13650 auto *Front = E->getMainOp();
13651 auto *BB = Front->getParent();
13652 assert(((GatheredLoadsEntriesFirst.has_value() &&
13653 E->getOpcode() == Instruction::Load && E->isGather() &&
13654 E->Idx < *GatheredLoadsEntriesFirst) ||
13655 all_of(E->Scalars,
13656 [=](Value *V) -> bool {
13657 if (E->getOpcode() == Instruction::GetElementPtr &&
13658 !isa<GetElementPtrInst>(V))
13659 return true;
13660 auto *I = dyn_cast<Instruction>(V);
13661 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13662 isVectorLikeInstWithConstOps(I);
13663 })) &&
13664 "Expected gathered loads or GEPs or instructions from same basic "
13665 "block.");
13666
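// The helpers below pick the bundle boundaries when the scalars span several
// basic blocks: dominator-tree DFS-in numbers give a cheap total order, where
// a larger DFS-in number means the block is visited later in the DT preorder
// and is therefore treated as "coming later" in the function.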
13667 auto FindLastInst = [&]() {
13668 Instruction *LastInst = Front;
13669 for (Value *V : E->Scalars) {
13670 auto *I = dyn_cast<Instruction>(V);
13671 if (!I)
13672 continue;
13673 if (LastInst->getParent() == I->getParent()) {
13674 if (LastInst->comesBefore(I))
13675 LastInst = I;
13676 continue;
13677 }
13678 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13679 !isa<GetElementPtrInst>(I)) ||
13680 (isVectorLikeInstWithConstOps(LastInst) &&
13681 isVectorLikeInstWithConstOps(I)) ||
13682 (GatheredLoadsEntriesFirst.has_value() &&
13683 E->getOpcode() == Instruction::Load && E->isGather() &&
13684 E->Idx < *GatheredLoadsEntriesFirst)) &&
13685 "Expected vector-like or non-GEP in GEP node insts only.");
13686 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13687 LastInst = I;
13688 continue;
13689 }
13690 if (!DT->isReachableFromEntry(I->getParent()))
13691 continue;
13692 auto *NodeA = DT->getNode(LastInst->getParent());
13693 auto *NodeB = DT->getNode(I->getParent());
13694 assert(NodeA && "Should only process reachable instructions");
13695 assert(NodeB && "Should only process reachable instructions");
13696 assert((NodeA == NodeB) ==
13697 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13698 "Different nodes should have different DFS numbers");
13699 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13700 LastInst = I;
13701 }
13702 BB = LastInst->getParent();
13703 return LastInst;
13704 };
13705
13706 auto FindFirstInst = [&]() {
13707 Instruction *FirstInst = Front;
13708 for (Value *V : E->Scalars) {
13709 auto *I = dyn_cast<Instruction>(V);
13710 if (!I)
13711 continue;
13712 if (FirstInst->getParent() == I->getParent()) {
13713 if (I->comesBefore(FirstInst))
13714 FirstInst = I;
13715 continue;
13716 }
13717 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13718 !isa<GetElementPtrInst>(I)) ||
13719 (isVectorLikeInstWithConstOps(FirstInst) &&
13721 "Expected vector-like or non-GEP in GEP node insts only.");
13722 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13723 FirstInst = I;
13724 continue;
13725 }
13726 if (!DT->isReachableFromEntry(I->getParent()))
13727 continue;
13728 auto *NodeA = DT->getNode(FirstInst->getParent());
13729 auto *NodeB = DT->getNode(I->getParent());
13730 assert(NodeA && "Should only process reachable instructions");
13731 assert(NodeB && "Should only process reachable instructions");
13732 assert((NodeA == NodeB) ==
13733 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13734 "Different nodes should have different DFS numbers");
13735 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13736 FirstInst = I;
13737 }
13738 return FirstInst;
13739 };
13740
13741 // Set insertpoint for gathered loads to the very first load.
13742 if (GatheredLoadsEntriesFirst.has_value() &&
13743 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13744 E->getOpcode() == Instruction::Load) {
13745 Res = FindFirstInst();
13746 return *Res;
13747 }
13748
13749 // Set the insert point to the beginning of the basic block if the entry
13750 // should not be scheduled.
13751 if (doesNotNeedToSchedule(E->Scalars) ||
13752 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13753 if ((E->getOpcode() == Instruction::GetElementPtr &&
13754 any_of(E->Scalars,
13755 [](Value *V) {
13756 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13757 })) ||
13758 all_of(E->Scalars,
13759 [](Value *V) {
13760 return isa<PoisonValue>(V) ||
13761 (!isVectorLikeInstWithConstOps(V) &&
13762 isUsedOutsideBlock(V));
13763 }) ||
13764 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13765 return isa<ExtractElementInst, UndefValue>(V) ||
13766 areAllOperandsNonInsts(V);
13767 })))
13768 Res = FindLastInst();
13769 else
13770 Res = FindFirstInst();
13771 return *Res;
13772 }
13773
13774 // Find the last instruction. The common case should be that BB has been
13775 // scheduled, and the last instruction is VL.back(). So we start with
13776 // VL.back() and iterate over schedule data until we reach the end of the
13777 // bundle. The end of the bundle is marked by null ScheduleData.
13778 if (BlocksSchedules.count(BB) && !E->isGather()) {
13779 Value *V = E->isOneOf(E->Scalars.back());
13780 if (doesNotNeedToBeScheduled(V))
13781 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13782 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13783 if (Bundle && Bundle->isPartOfBundle())
13784 for (; Bundle; Bundle = Bundle->NextInBundle)
13785 Res = Bundle->Inst;
13786 }
13787
13788 // LastInst can still be null at this point if there's either not an entry
13789 // for BB in BlocksSchedules or there's no ScheduleData available for
13790 // VL.back(). This can be the case if buildTree_rec aborts for various
13791 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13792 // size is reached, etc.). ScheduleData is initialized in the scheduling
13793 // "dry-run".
13794 //
13795 // If this happens, we can still find the last instruction by brute force. We
13796 // iterate forwards from Front (inclusive) until we either see all
13797 // instructions in the bundle or reach the end of the block. If Front is the
13798 // last instruction in program order, LastInst will be set to Front, and we
13799 // will visit all the remaining instructions in the block.
13800 //
13801 // One of the reasons we exit early from buildTree_rec is to place an upper
13802 // bound on compile-time. Thus, taking an additional compile-time hit here is
13803 // not ideal. However, this should be exceedingly rare since it requires that
13804 // we both exit early from buildTree_rec and that the bundle be out-of-order
13805 // (causing us to iterate all the way to the end of the block).
13806 if (!Res)
13807 Res = FindLastInst();
13808 assert(Res && "Failed to find last instruction in bundle");
13809 return *Res;
13810}
13811
13812void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13813 auto *Front = E->getMainOp();
13814 Instruction *LastInst = &getLastInstructionInBundle(E);
13815 assert(LastInst && "Failed to find last instruction in bundle");
13816 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13817 // If the instruction is PHI, set the insert point after all the PHIs.
13818 bool IsPHI = isa<PHINode>(LastInst);
13819 if (IsPHI)
13820 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13821 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13822 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13823 } else {
13824 // Set the insertion point after the last instruction in the bundle. Set the
13825 // debug location to Front.
13826 Builder.SetInsertPoint(
13827 LastInst->getParent(),
13828 LastInst->getNextNonDebugInstruction()->getIterator());
13829 }
13830 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13831}
13832
13833Value *BoUpSLP::gather(
13834 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13835 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13836 // List of instructions/lanes from current block and/or the blocks which are
13837 // part of the current loop. These instructions will be inserted at the end to
13838 // make it possible to optimize loops and hoist invariant instructions out of
13839 // the loop's body with better chances for success.
13840 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
13841 SmallSet<int, 4> PostponedIndices;
13842 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13843 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13844 SmallPtrSet<BasicBlock *, 4> Visited;
13845 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13846 InsertBB = InsertBB->getSinglePredecessor();
13847 return InsertBB && InsertBB == InstBB;
13848 };
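// Collect lanes whose values are defined on the straight-line predecessor
// path into the insertion block, are themselves part of the vectorizable
// tree, or live in the loop containing the insertion point; their inserts are
// emitted last so the loop-invariant part of the buildvector can still be
// hoisted.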
13849 for (int I = 0, E = VL.size(); I < E; ++I) {
13850 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13851 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13852 getTreeEntry(Inst) ||
13853 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13854 PostponedIndices.insert(I).second)
13855 PostponedInsts.emplace_back(Inst, I);
13856 }
13857
13858 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13859 Type *Ty) {
13860 Value *Scalar = V;
13861 if (Scalar->getType() != Ty) {
13862 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13863 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13864 Value *V = Scalar;
13865 if (auto *CI = dyn_cast<CastInst>(Scalar);
13866 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13867 Value *Op = CI->getOperand(0);
13868 if (auto *IOp = dyn_cast<Instruction>(Op);
13869 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13870 V = Op;
13871 }
13872 Scalar = Builder.CreateIntCast(
13873 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13874 }
13875
13876 Instruction *InsElt;
13877 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13878 assert(SLPReVec && "FixedVectorType is not expected.");
13879 Vec = InsElt = Builder.CreateInsertVector(
13880 Vec->getType(), Vec, Scalar,
13881 Builder.getInt64(Pos * VecTy->getNumElements()));
13882 auto *II = dyn_cast<IntrinsicInst>(InsElt);
13883 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13884 return Vec;
13885 } else {
13886 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13887 InsElt = dyn_cast<InsertElementInst>(Vec);
13888 if (!InsElt)
13889 return Vec;
13890 }
13891 GatherShuffleExtractSeq.insert(InsElt);
13892 CSEBlocks.insert(InsElt->getParent());
13893 // Add to our 'need-to-extract' list.
13894 if (isa<Instruction>(V)) {
13895 if (TreeEntry *Entry = getTreeEntry(V)) {
13896 // Find which lane we need to extract.
13897 User *UserOp = nullptr;
13898 if (Scalar != V) {
13899 if (auto *SI = dyn_cast<Instruction>(Scalar))
13900 UserOp = SI;
13901 } else {
13902 UserOp = InsElt;
13903 }
13904 if (UserOp) {
13905 unsigned FoundLane = Entry->findLaneForValue(V);
13906 ExternalUses.emplace_back(V, UserOp, FoundLane);
13907 }
13908 }
13909 }
13910 return Vec;
13911 };
13912 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13913 Value *Vec = PoisonValue::get(VecTy);
13914 SmallVector<int> NonConsts;
13915 SmallVector<int> Mask(VL.size());
13916 std::iota(Mask.begin(), Mask.end(), 0);
13917 Value *OriginalRoot = Root;
13918 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13919 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13920 SV->getOperand(0)->getType() == VecTy) {
13921 Root = SV->getOperand(0);
13922 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
13923 }
13924 // Insert constant values at first.
13925 for (int I = 0, E = VL.size(); I < E; ++I) {
13926 if (PostponedIndices.contains(I))
13927 continue;
13928 if (!isConstant(VL[I])) {
13929 NonConsts.push_back(I);
13930 continue;
13931 }
13932 if (isa<PoisonValue>(VL[I]))
13933 continue;
13934 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13935 Mask[I] = I + E;
13936 }
13937 if (Root) {
13938 if (isa<PoisonValue>(Vec)) {
13939 Vec = OriginalRoot;
13940 } else {
13941 Vec = CreateShuffle(Root, Vec, Mask);
13942 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
13943 OI && OI->hasNUses(0) &&
13944 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13945 return TE->VectorizedValue == OI;
13946 }))
13947 eraseInstruction(OI);
13948 }
13949 }
13950 // Insert non-constant values.
13951 for (int I : NonConsts)
13952 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13953 // Append instructions, which are/may be part of the loop, at the end to make
13954 // it possible to hoist non-loop-based instructions.
13955 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
13956 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
13957
13958 return Vec;
13959}
13960
13961 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13962 /// It supports shuffling of 2 input vectors. Shuffle emission is lazy: the
13963 /// actual shuffle instruction is generated only if it is really required;
13964 /// otherwise its emission is delayed till the end of the process, to reduce
13965 /// the number of emitted instructions and to simplify further
13966 /// analysis/transformations.
13967 /// The class will also look through the previously emitted shuffle
13968 /// instructions and properly mark indices in the mask as undef.
13969/// For example, given the code
13970/// \code
13971/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
13972/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
13973/// \endcode
13974 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
13975/// look through %s1 and %s2 and emit
13976/// \code
13977/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
13978/// \endcode
13979/// instead.
13980/// If 2 operands are of different size, the smallest one will be resized and
13981/// the mask recalculated properly.
13982/// For example, given the code
13983/// \code
13984/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
13985/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
13986/// \endcode
13987 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
13988/// look through %s1 and %s2 and emit
13989/// \code
13990/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
13991/// \endcode
13992/// instead.
13993class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
13994 bool IsFinalized = false;
13995 /// Combined mask for all applied operands and masks. It is built during
13996 /// analysis and actual emission of shuffle vector instructions.
13997 SmallVector<int> CommonMask;
13998 /// List of operands for the shuffle vector instruction. It holds at most 2
13999 /// operands; if a 3rd one is going to be added, the first 2 are combined into
14000 /// a shuffle with the \p CommonMask mask, the first operand is set to the
14001 /// resulting shuffle and the second operand is set to the newly added
14002 /// operand. The \p CommonMask is transformed accordingly after that.
14003 SmallVector<Value *, 2> InVectors;
14004 IRBuilderBase &Builder;
14005 BoUpSLP &R;
14006
14007 class ShuffleIRBuilder {
14008 IRBuilderBase &Builder;
14009 /// Holds all of the instructions that we gathered.
14010 SetVector<Instruction *> &GatherShuffleExtractSeq;
14011 /// A list of blocks that we are going to CSE.
14012 DenseSet<BasicBlock *> &CSEBlocks;
14013 /// Data layout.
14014 const DataLayout &DL;
14015
14016 public:
14017 ShuffleIRBuilder(IRBuilderBase &Builder,
14018 SetVector<Instruction *> &GatherShuffleExtractSeq,
14019 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14020 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14021 CSEBlocks(CSEBlocks), DL(DL) {}
14022 ~ShuffleIRBuilder() = default;
14023 /// Creates shufflevector for the 2 operands with the given mask.
14024 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14025 if (V1->getType() != V2->getType()) {
14026 assert(V1->getType()->isIntOrIntVectorTy() &&
14027 V2->getType()->isIntOrIntVectorTy() &&
14028 "Expected integer vector types only.");
14029 if (V1->getType() != V2->getType()) {
14030 if (cast<VectorType>(V2->getType())
14031 ->getElementType()
14032 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14033 ->getElementType()
14034 ->getIntegerBitWidth())
14035 V2 = Builder.CreateIntCast(
14036 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14037 else
14038 V1 = Builder.CreateIntCast(
14039 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14040 }
14041 }
14042 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14043 if (auto *I = dyn_cast<Instruction>(Vec)) {
14044 GatherShuffleExtractSeq.insert(I);
14045 CSEBlocks.insert(I->getParent());
14046 }
14047 return Vec;
14048 }
14049 /// Creates permutation of the single vector operand with the given mask, if
14050 /// it is not identity mask.
14051 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14052 if (Mask.empty())
14053 return V1;
14054 unsigned VF = Mask.size();
14055 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14056 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14057 return V1;
14058 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14059 if (auto *I = dyn_cast<Instruction>(Vec)) {
14060 GatherShuffleExtractSeq.insert(I);
14061 CSEBlocks.insert(I->getParent());
14062 }
14063 return Vec;
14064 }
14065 Value *createIdentity(Value *V) { return V; }
14066 Value *createPoison(Type *Ty, unsigned VF) {
14067 return PoisonValue::get(getWidenedType(Ty, VF));
14068 }
14069 /// Resizes the 2 input vectors to match their sizes, if they are not equal
14070 /// yet. The smaller vector is resized to the size of the larger vector.
14071 void resizeToMatch(Value *&V1, Value *&V2) {
14072 if (V1->getType() == V2->getType())
14073 return;
14074 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14075 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14076 int VF = std::max(V1VF, V2VF);
14077 int MinVF = std::min(V1VF, V2VF);
14078 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14079 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14080 0);
14081 Value *&Op = MinVF == V1VF ? V1 : V2;
14082 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14083 if (auto *I = dyn_cast<Instruction>(Op)) {
14084 GatherShuffleExtractSeq.insert(I);
14085 CSEBlocks.insert(I->getParent());
14086 }
14087 if (MinVF == V1VF)
14088 V1 = Op;
14089 else
14090 V2 = Op;
14091 }
14092 };
14093
14094 /// Smart shuffle instruction emission, walks through shuffle trees and
14095 /// tries to find the best matching vector for the actual shuffle
14096 /// instruction.
14097 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14098 assert(V1 && "Expected at least one vector value.");
14099 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14100 R.CSEBlocks, *R.DL);
14101 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
14102 ShuffleBuilder);
14103 }
14104
14105 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
14106 /// shuffle emission.
14107 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
14108 ArrayRef<int> Mask) {
14109 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14110 if (Mask[Idx] != PoisonMaskElem)
14111 CommonMask[Idx] = Idx;
14112 }
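// E.g. after a shuffle has been emitted for CommonMask = <2, -1, 0>, the same
// lanes are now produced by that shuffle itself, so the mask becomes
// <0, -1, 2> (identity on the defined lanes).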
14113
14114 /// Cast value \p V to the vector type with the same number of elements, but
14115 /// the base type \p ScalarTy.
14116 Value *castToScalarTyElem(Value *V,
14117 std::optional<bool> IsSigned = std::nullopt) {
14118 auto *VecTy = cast<VectorType>(V->getType());
14119 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14120 if (VecTy->getElementType() == ScalarTy->getScalarType())
14121 return V;
14122 return Builder.CreateIntCast(
14123 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14124 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14125 }
14126
14127public:
14128 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14129 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14130
14131 /// Adjusts extractelements after reusing them.
14132 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14133 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14134 unsigned NumParts, bool &UseVecBaseAsInput) {
14135 UseVecBaseAsInput = false;
14136 SmallPtrSet<Value *, 4> UniqueBases;
14137 Value *VecBase = nullptr;
14138 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14139 if (!E->ReorderIndices.empty()) {
14140 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14141 E->ReorderIndices.end());
14142 reorderScalars(VL, ReorderMask);
14143 }
14144 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14145 int Idx = Mask[I];
14146 if (Idx == PoisonMaskElem)
14147 continue;
14148 auto *EI = cast<ExtractElementInst>(VL[I]);
14149 VecBase = EI->getVectorOperand();
14150 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14151 VecBase = TE->VectorizedValue;
14152 assert(VecBase && "Expected vectorized value.");
14153 UniqueBases.insert(VecBase);
14154 // If the only use is vectorized - we can delete the extractelement
14155 // itself.
14156 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14157 (NumParts != 1 && count(VL, EI) > 1) ||
14158 any_of(EI->users(), [&](User *U) {
14159 const TreeEntry *UTE = R.getTreeEntry(U);
14160 return !UTE || R.MultiNodeScalars.contains(U) ||
14161 (isa<GetElementPtrInst>(U) &&
14162 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14163 count_if(R.VectorizableTree,
14164 [&](const std::unique_ptr<TreeEntry> &TE) {
14165 return any_of(TE->UserTreeIndices,
14166 [&](const EdgeInfo &Edge) {
14167 return Edge.UserTE == UTE;
14168 }) &&
14169 is_contained(VL, EI);
14170 }) != 1;
14171 }))
14172 continue;
14173 R.eraseInstruction(EI);
14174 }
14175 if (NumParts == 1 || UniqueBases.size() == 1) {
14176 assert(VecBase && "Expected vectorized value.");
14177 return castToScalarTyElem(VecBase);
14178 }
14179 UseVecBaseAsInput = true;
14180 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14181 for (auto [I, Idx] : enumerate(Mask))
14182 if (Idx != PoisonMaskElem)
14183 Idx = I;
14184 };
14185 // Perform multi-register vector shuffle, joining them into a single virtual
14186 // long vector.
14187 // Need to shuffle each part independently and then insert all these parts
14188 // into a long virtual vector register, forming the original vector.
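// E.g. with NumParts == 2, each part is first shuffled from its own (at most
// two) extract bases, and the per-part results are then chained together,
// offsetting the second operand's indices by the width of the vector built so
// far.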
14189 Value *Vec = nullptr;
14190 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14191 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14192 for (unsigned Part : seq<unsigned>(NumParts)) {
14193 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14194 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14195 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14196 constexpr int MaxBases = 2;
14197 SmallVector<Value *, MaxBases> Bases(MaxBases);
14198 auto VLMask = zip(SubVL, SubMask);
14199 const unsigned VF = std::accumulate(
14200 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14201 if (std::get<1>(D) == PoisonMaskElem)
14202 return S;
14203 Value *VecOp =
14204 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14205 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14206 VecOp = TE->VectorizedValue;
14207 assert(VecOp && "Expected vectorized value.");
14208 const unsigned Size =
14209 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14210 return std::max(S, Size);
14211 });
14212 for (const auto [V, I] : VLMask) {
14213 if (I == PoisonMaskElem)
14214 continue;
14215 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14216 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14217 VecOp = TE->VectorizedValue;
14218 assert(VecOp && "Expected vectorized value.");
14219 VecOp = castToScalarTyElem(VecOp);
14220 Bases[I / VF] = VecOp;
14221 }
14222 if (!Bases.front())
14223 continue;
14224 Value *SubVec;
14225 if (Bases.back()) {
14226 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14227 TransformToIdentity(SubMask);
14228 } else {
14229 SubVec = Bases.front();
14230 }
14231 if (!Vec) {
14232 Vec = SubVec;
14233 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14234 [&](unsigned P) {
14235 ArrayRef<int> SubMask =
14236 Mask.slice(P * SliceSize,
14237 getNumElems(Mask.size(),
14238 SliceSize, P));
14239 return all_of(SubMask, [](int Idx) {
14240 return Idx == PoisonMaskElem;
14241 });
14242 })) &&
14243 "Expected first part or all previous parts masked.");
14244 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14245 } else {
14246 unsigned NewVF =
14247 cast<FixedVectorType>(Vec->getType())->getNumElements();
14248 if (Vec->getType() != SubVec->getType()) {
14249 unsigned SubVecVF =
14250 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14251 NewVF = std::max(NewVF, SubVecVF);
14252 }
14253 // Adjust SubMask.
14254 for (int &Idx : SubMask)
14255 if (Idx != PoisonMaskElem)
14256 Idx += NewVF;
14257 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14258 Vec = createShuffle(Vec, SubVec, VecMask);
14259 TransformToIdentity(VecMask);
14260 }
14261 }
14262 copy(VecMask, Mask.begin());
14263 return Vec;
14264 }
14265 /// Checks if the specified entry \p E needs to be delayed because of its
14266 /// dependency nodes.
14267 std::optional<Value *>
14268 needToDelay(const TreeEntry *E,
14269 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14270 // No need to delay emission if all deps are ready.
14271 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14272 return all_of(
14273 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14274 }))
14275 return std::nullopt;
14276 // Postpone gather emission, will be emitted after the end of the
14277 // process to keep correct order.
14278 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14279 return Builder.CreateAlignedLoad(
14280 ResVecTy,
14281 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14282 MaybeAlign());
14283 }
14284 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14285 /// shuffling.
14286 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14287 Value *V1 = E1.VectorizedValue;
14288 if (V1->getType()->isIntOrIntVectorTy())
14289 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14290 if (isa<PoisonValue>(V))
14291 return false;
14292 return !isKnownNonNegative(
14293 V, SimplifyQuery(*R.DL));
14294 }));
14295 Value *V2 = E2.VectorizedValue;
14296 if (V2->getType()->isIntOrIntVectorTy())
14297 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14298 if (isa<PoisonValue>(V))
14299 return false;
14300 return !isKnownNonNegative(
14301 V, SimplifyQuery(*R.DL));
14302 }));
14303 add(V1, V2, Mask);
14304 }
14305 /// Adds single input vector (in form of tree entry) and the mask for its
14306 /// shuffling.
14307 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14308 Value *V1 = E1.VectorizedValue;
14309 if (V1->getType()->isIntOrIntVectorTy())
14310 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14311 if (isa<PoisonValue>(V))
14312 return false;
14313 return !isKnownNonNegative(
14314 V, SimplifyQuery(*R.DL));
14315 }));
14316 add(V1, Mask);
14317 }
14318 /// Adds 2 input vectors and the mask for their shuffling.
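/// E.g. the first call add(VecA, VecB, <0, 5, 2, 7>) just records both
/// vectors and sets CommonMask = <0, 5, 2, 7>; later calls fold the already
/// accumulated vectors into a single shuffle first and reference the new pair
/// through second-source indices (lane index + VF).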
14319 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14320 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14321 assert(isa<FixedVectorType>(V1->getType()) &&
14322 isa<FixedVectorType>(V2->getType()) &&
14323 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14324 V1 = castToScalarTyElem(V1);
14325 V2 = castToScalarTyElem(V2);
14326 if (InVectors.empty()) {
14327 InVectors.push_back(V1);
14328 InVectors.push_back(V2);
14329 CommonMask.assign(Mask.begin(), Mask.end());
14330 return;
14331 }
14332 Value *Vec = InVectors.front();
14333 if (InVectors.size() == 2) {
14334 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14335 transformMaskAfterShuffle(CommonMask, CommonMask);
14336 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14337 Mask.size()) {
14338 Vec = createShuffle(Vec, nullptr, CommonMask);
14339 transformMaskAfterShuffle(CommonMask, CommonMask);
14340 }
14341 V1 = createShuffle(V1, V2, Mask);
14342 unsigned VF = std::max(getVF(V1), getVF(Vec));
14343 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14344 if (Mask[Idx] != PoisonMaskElem)
14345 CommonMask[Idx] = Idx + VF;
14346 InVectors.front() = Vec;
14347 if (InVectors.size() == 2)
14348 InVectors.back() = V1;
14349 else
14350 InVectors.push_back(V1);
14351 }
14352 /// Adds another input vector and the mask for its shuffling.
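/// E.g. with InVectors = {VecA, VecB} and CommonMask = <0, 5, -1, -1>, a call
/// add(VecC, <-1, -1, 1, 3>) first folds VecA/VecB into one shuffle (the mask
/// becomes <0, 1, -1, -1>) and then records VecC's lanes with a second-source
/// offset, giving CommonMask = <0, 1, 5, 7> for same-typed 4-wide vectors.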
14353 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14354 assert(isa<FixedVectorType>(V1->getType()) &&
14355 "castToScalarTyElem expects V1 to be FixedVectorType");
14356 V1 = castToScalarTyElem(V1);
14357 if (InVectors.empty()) {
14358 InVectors.push_back(V1);
14359 CommonMask.assign(Mask.begin(), Mask.end());
14360 return;
14361 }
14362 const auto *It = find(InVectors, V1);
14363 if (It == InVectors.end()) {
14364 if (InVectors.size() == 2 ||
14365 InVectors.front()->getType() != V1->getType()) {
14366 Value *V = InVectors.front();
14367 if (InVectors.size() == 2) {
14368 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14369 transformMaskAfterShuffle(CommonMask, CommonMask);
14370 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14371 CommonMask.size()) {
14372 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14373 transformMaskAfterShuffle(CommonMask, CommonMask);
14374 }
14375 unsigned VF = std::max(CommonMask.size(), Mask.size());
14376 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14377 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14378 CommonMask[Idx] =
14379 V->getType() != V1->getType()
14380 ? Idx + VF
14381 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14382 ->getNumElements();
14383 if (V->getType() != V1->getType())
14384 V1 = createShuffle(V1, nullptr, Mask);
14385 InVectors.front() = V;
14386 if (InVectors.size() == 2)
14387 InVectors.back() = V1;
14388 else
14389 InVectors.push_back(V1);
14390 return;
14391 }
14392 // Check if second vector is required if the used elements are already
14393 // used from the first one.
14394 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14395 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14396 InVectors.push_back(V1);
14397 break;
14398 }
14399 }
14400 int VF = getVF(V1);
14401 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14402 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14403 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14404 }
14405 /// Adds another input vector and the mask for its shuffling.
14406 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14407 SmallVector<int> NewMask;
14408 inversePermutation(Order, NewMask);
14409 add(V1, NewMask);
14410 }
14411 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14412 Value *Root = nullptr) {
14413 return R.gather(VL, Root, ScalarTy,
14414 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14415 return createShuffle(V1, V2, Mask);
14416 });
14417 }
14418 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14419 /// Finalize emission of the shuffles.
14420 /// \param Action the action (if any) to be performed before final applying of
14421 /// the \p ExtMask mask.
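/// \p SubVectors (tree entries with their insertion indices) are inserted
/// into the result, using insertvector when the index is aligned to the
/// subvector length and an equivalent shuffle otherwise; \p ExtMask, if not
/// empty, is applied on top of the accumulated \p CommonMask at the very end.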
14422 Value *
14423 finalize(ArrayRef<int> ExtMask,
14424 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14425 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14426 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14427 IsFinalized = true;
14428 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14429 SmallVector<int> NewExtMask(ExtMask);
14430 if (ScalarTyNumElements != 1) {
14431 assert(SLPReVec && "FixedVectorType is not expected.");
14432 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14433 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14434 ExtMask = NewExtMask;
14435 }
14436 if (Action) {
14437 Value *Vec = InVectors.front();
14438 if (InVectors.size() == 2) {
14439 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14440 InVectors.pop_back();
14441 } else {
14442 Vec = createShuffle(Vec, nullptr, CommonMask);
14443 }
14444 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14445 if (CommonMask[Idx] != PoisonMaskElem)
14446 CommonMask[Idx] = Idx;
14447 assert(VF > 0 &&
14448 "Expected vector length for the final value before action.");
14449 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14450 if (VecVF < VF) {
14451 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14452 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14453 Vec = createShuffle(Vec, nullptr, ResizeMask);
14454 }
14455 Action(Vec, CommonMask);
14456 InVectors.front() = Vec;
14457 }
14458 if (!SubVectors.empty()) {
14459 Value *Vec = InVectors.front();
14460 if (InVectors.size() == 2) {
14461 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14462 InVectors.pop_back();
14463 } else {
14464 Vec = createShuffle(Vec, nullptr, CommonMask);
14465 }
14466 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14467 if (CommonMask[Idx] != PoisonMaskElem)
14468 CommonMask[Idx] = Idx;
14469 auto CreateSubVectors = [&](Value *Vec,
14470 SmallVectorImpl<int> &CommonMask) {
14471 for (auto [E, Idx] : SubVectors) {
14472 Value *V = E->VectorizedValue;
14473 if (V->getType()->isIntOrIntVectorTy())
14474 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14475 if (isa<PoisonValue>(V))
14476 return false;
14477 return !isKnownNonNegative(
14478 V, SimplifyQuery(*R.DL));
14479 }));
14480 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14481 const unsigned SubVecVF =
14482 cast<FixedVectorType>(V->getType())->getNumElements();
14483 if (InsertionIndex % SubVecVF == 0) {
14484 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
14485 Builder.getInt64(InsertionIndex));
14486 } else {
14487 // Create a shuffle; insertvector requires that the index is a multiple of
14488 // the subvector's length.
14489 const unsigned VecVF =
14490 cast<FixedVectorType>(Vec->getType())->getNumElements();
14491 SmallVector<int> Mask(VecVF, PoisonMaskElem);
14492 std::iota(Mask.begin(), Mask.end(), 0);
14493 for (unsigned I : seq<unsigned>(
14494 InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
14495 Mask[I] = I - Idx + VecVF;
14496 Vec = createShuffle(Vec, V, Mask);
14497 }
14498 if (!CommonMask.empty()) {
14499 std::iota(
14500 std::next(CommonMask.begin(), InsertionIndex),
14501 std::next(CommonMask.begin(),
14502 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14503 InsertionIndex);
14504 }
14505 }
14506 return Vec;
14507 };
14508 if (SubVectorsMask.empty()) {
14509 Vec = CreateSubVectors(Vec, CommonMask);
14510 } else {
14511 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14512 copy(SubVectorsMask, SVMask.begin());
14513 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14514 if (I2 != PoisonMaskElem) {
14515 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14516 I1 = I2 + CommonMask.size();
14517 }
14518 }
14519 Value *InsertVec =
14520 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14521 Vec = createShuffle(InsertVec, Vec, SVMask);
14522 for (unsigned I : seq<unsigned>(CommonMask.size())) {
14523 if (SVMask[I] != PoisonMaskElem)
14524 CommonMask[I] = I;
14525 }
14526 }
14527 InVectors.front() = Vec;
14528 }
14529
14530 if (!ExtMask.empty()) {
14531 if (CommonMask.empty()) {
14532 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14533 } else {
14534 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14535 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14536 if (ExtMask[I] == PoisonMaskElem)
14537 continue;
14538 NewMask[I] = CommonMask[ExtMask[I]];
14539 }
14540 CommonMask.swap(NewMask);
14541 }
14542 }
14543 if (CommonMask.empty()) {
14544 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14545 return InVectors.front();
14546 }
14547 if (InVectors.size() == 2)
14548 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14549 return createShuffle(InVectors.front(), nullptr, CommonMask);
14550 }
14551
14552 ~ShuffleInstructionBuilder() {
14553 assert((IsFinalized || CommonMask.empty()) &&
14554 "Shuffle construction must be finalized.");
14555 }
14556};
14557
14558BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14559 unsigned NodeIdx) {
14560 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14561 InstructionsState S = getSameOpcode(VL, *TLI);
14562 // Special processing for GEPs bundle, which may include non-gep values.
14563 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
14564 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14565 if (It != VL.end())
14566 S = getSameOpcode(*It, *TLI);
14567 }
14568 if (!S.getOpcode())
14569 return nullptr;
14570 auto CheckSameVE = [&](const TreeEntry *VE) {
14571 return VE->isSame(VL) &&
14572 (any_of(VE->UserTreeIndices,
14573 [E, NodeIdx](const EdgeInfo &EI) {
14574 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14575 }) ||
14576 any_of(VectorizableTree,
14577 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14578 return TE->isOperandGatherNode(
14579 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14580 VE->isSame(TE->Scalars);
14581 }));
14582 };
14583 TreeEntry *VE = getTreeEntry(S.getMainOp());
14584 if (VE && CheckSameVE(VE))
14585 return VE;
14586 auto It = MultiNodeScalars.find(S.getMainOp());
14587 if (It != MultiNodeScalars.end()) {
14588 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14589 return TE != VE && CheckSameVE(TE);
14590 });
14591 if (I != It->getSecond().end())
14592 return *I;
14593 }
14594 return nullptr;
14595}
14596
14597Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14598 bool PostponedPHIs) {
14599 ValueList &VL = E->getOperand(NodeIdx);
14600 const unsigned VF = VL.size();
14601 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14602 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14603 // V may be affected by MinBWs.
14604 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14605 // factor is the number of elements, not their type.
14606 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14607 unsigned NumElements = getNumElements(VL.front()->getType());
14608 ShuffleInstructionBuilder ShuffleBuilder(
14609 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14610 : ScalarTy,
14611 Builder, *this);
14612 ShuffleBuilder.add(V, Mask);
14613 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14614 E->CombinedEntriesWithIndices.size());
14615 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14616 [&](const auto &P) {
14617 return std::make_pair(VectorizableTree[P.first].get(),
14618 P.second);
14619 });
14620 assert((E->CombinedEntriesWithIndices.empty() ||
14621 E->ReorderIndices.empty()) &&
14622 "Expected either combined subnodes or reordering");
14623 return ShuffleBuilder.finalize({}, SubVectors, {});
14624 };
14625 Value *V = vectorizeTree(VE, PostponedPHIs);
14626 if (VF * getNumElements(VL[0]->getType()) !=
14627 cast<FixedVectorType>(V->getType())->getNumElements()) {
14628 if (!VE->ReuseShuffleIndices.empty()) {
14629 // Reshuffle to get only unique values.
14630 // If some of the scalars are duplicated in the vectorization
14631 // tree entry, we do not vectorize them but instead generate a
14632 // mask for the reuses. But if there are several users of the
14633 // same entry, they may have different vectorization factors.
14634 // This is especially important for PHI nodes. In this case, we
14635 // need to adapt the resulting instruction for the user
14636 // vectorization factor and have to reshuffle it again to take
14637 // only unique elements of the vector. Without this code the
14638 // function would incorrectly return a reduced vector instruction with
14639 // the same elements, not with the unique ones.
14640
14641 // block:
14642 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14643 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14644 // ... (use %2)
14645 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14646 // br %block
14647 SmallVector<int> Mask(VF, PoisonMaskElem);
14648 for (auto [I, V] : enumerate(VL)) {
14649 if (isa<PoisonValue>(V))
14650 continue;
14651 Mask[I] = VE->findLaneForValue(V);
14652 }
14653 V = FinalShuffle(V, Mask);
14654 } else {
14655 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14656 "Expected vectorization factor less "
14657 "than original vector size.");
14658 SmallVector<int> UniformMask(VF, 0);
14659 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14660 V = FinalShuffle(V, UniformMask);
14661 }
14662 }
14663 // Need to update the operand gather node, if the operand is actually not a
14664 // vectorized node but a buildvector/gather node, which matches one of
14665 // the vectorized nodes.
14666 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14667 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14668 }) == VE->UserTreeIndices.end()) {
14669 auto *It =
14670 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14671 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14672 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14673 });
14674 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14675 (*It)->VectorizedValue = V;
14676 }
14677 return V;
14678 }
14679
14680 // Find the corresponding gather entry and vectorize it.
14681 // Allows to be more accurate with tree/graph transformations, checks for the
14682 // correctness of the transformations in many cases.
14683 auto *I = find_if(VectorizableTree,
14684 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14685 return TE->isOperandGatherNode({E, NodeIdx});
14686 });
14687 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14688 assert(I->get()->UserTreeIndices.size() == 1 &&
14689 "Expected only single user for the gather node.");
14690 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14691 return vectorizeTree(I->get(), PostponedPHIs);
14692}
14693
14694template <typename BVTy, typename ResTy, typename... Args>
14695ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14696 Args &...Params) {
14697 assert(E->isGather() && "Expected gather node.");
14698 unsigned VF = E->getVectorFactor();
14699
14700 bool NeedFreeze = false;
14701 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14702 E->ReuseShuffleIndices.end());
14703 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14704 // Clear values, to be replaced by insertvector instructions.
14705 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14706 for_each(MutableArrayRef(GatheredScalars)
14707 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14708 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14709 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14710 E->CombinedEntriesWithIndices.size());
14711 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14712 [&](const auto &P) {
14713 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14714 });
14715 // Build a mask out of the reorder indices and reorder scalars per this
14716 // mask.
14717 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14718 E->ReorderIndices.end());
14719 if (!ReorderMask.empty())
14720 reorderScalars(GatheredScalars, ReorderMask);
14721 SmallVector<int> SubVectorsMask;
14722 inversePermutation(E->ReorderIndices, SubVectorsMask);
14723 // Transform non-clustered elements in the mask to poison (-1).
14724 // "Clustered" operations will be reordered using this mask later.
14725 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14726 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14727 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14728 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14729 } else {
14730 SubVectorsMask.clear();
14731 }
14732 SmallVector<Value *> StoredGS(GatheredScalars);
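// FindReusedSplat: for a splat entry that still contains non-poison undefs,
// check whether the already vectorized input can be reused directly for this
// slice; if so, rewrite the corresponding slice of Mask either to an identity
// sequence or to a broadcast of the single defined lane.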
14733 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14734 unsigned I, unsigned SliceSize,
14735 bool IsNotPoisonous) {
14736 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14737 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14738 }))
14739 return false;
14740 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14741 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14742 if (UserTE->getNumOperands() != 2)
14743 return false;
14744 if (!IsNotPoisonous) {
14745 auto *It =
14746 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14747 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14748 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14749 }) != TE->UserTreeIndices.end();
14750 });
14751 if (It == VectorizableTree.end())
14752 return false;
14753 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14754 if (!(*It)->ReorderIndices.empty()) {
14755 inversePermutation((*It)->ReorderIndices, ReorderMask);
14756 reorderScalars(GS, ReorderMask);
14757 }
14758 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14759 Value *V0 = std::get<0>(P);
14760 Value *V1 = std::get<1>(P);
14761 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14762 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14763 is_contained(E->Scalars, V1));
14764 }))
14765 return false;
14766 }
14767 int Idx;
14768 if ((Mask.size() < InputVF &&
14769 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14770 Idx == 0) ||
14771 (Mask.size() == InputVF &&
14772 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14773 std::iota(
14774 std::next(Mask.begin(), I * SliceSize),
14775 std::next(Mask.begin(),
14776 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14777 0);
14778 } else {
14779 unsigned IVal =
14780 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14781 std::fill(
14782 std::next(Mask.begin(), I * SliceSize),
14783 std::next(Mask.begin(),
14784 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14785 IVal);
14786 }
14787 return true;
14788 };
14789 BVTy ShuffleBuilder(ScalarTy, Params...);
14790 ResTy Res = ResTy();
14791 SmallVector<int> Mask;
14792 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14793 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14794 Value *ExtractVecBase = nullptr;
14795 bool UseVecBaseAsInput = false;
14796 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14797 SmallVector<SmallVector<const TreeEntry *>> Entries;
14798 Type *OrigScalarTy = GatheredScalars.front()->getType();
14799 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14800 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14801 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14802 VecTy->getNumElements() % NumParts != 0 ||
14803 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14804 VecTy->getNumElements() / NumParts))
14805 NumParts = 1;
14806 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14807 // Check for gathered extracts.
14808 bool Resized = false;
14809 ExtractShuffles =
14810 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14811 if (!ExtractShuffles.empty()) {
14812 SmallVector<const TreeEntry *> ExtractEntries;
14813 for (auto [Idx, I] : enumerate(ExtractMask)) {
14814 if (I == PoisonMaskElem)
14815 continue;
14816 if (const auto *TE = getTreeEntry(
14817 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14818 ExtractEntries.push_back(TE);
14819 }
14820 if (std::optional<ResTy> Delayed =
14821 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14822 // Delay emission of gathers which are not ready yet.
14823 PostponedGathers.insert(E);
14824 // Postpone gather emission, will be emitted after the end of the
14825 // process to keep correct order.
14826 return *Delayed;
14827 }
14828 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14829 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14830 ExtractVecBase = VecBase;
14831 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14832 if (VF == VecBaseTy->getNumElements() &&
14833 GatheredScalars.size() != VF) {
14834 Resized = true;
14835 GatheredScalars.append(VF - GatheredScalars.size(),
14836 PoisonValue::get(OrigScalarTy));
14837 }
14838 }
14839 }
14840 // Gather extracts after we check for full matched gathers only.
14841 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
14842 ((E->getOpcode() == Instruction::Load ||
14843 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14844 any_of(E->Scalars,
14845 [this](Value *V) {
14846 return isa<LoadInst>(V) && getTreeEntry(V);
14847 })) ||
14848 E->isAltShuffle() ||
14849 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14850 isSplat(E->Scalars) ||
14851 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14852 GatherShuffles =
14853 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14854 }
14855 if (!GatherShuffles.empty()) {
14856 if (std::optional<ResTy> Delayed =
14857 ShuffleBuilder.needToDelay(E, Entries)) {
14858 // Delay emission of gathers which are not ready yet.
14859 PostponedGathers.insert(E);
14860 // Postpone gather emission, will be emitted after the end of the
14861 // process to keep correct order.
14862 return *Delayed;
14863 }
14864 if (GatherShuffles.size() == 1 &&
14865 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14866 Entries.front().front()->isSame(E->Scalars)) {
14867 // Perfect match in the graph, will reuse the previously vectorized
14868 // node. Cost is 0.
14869 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14870 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14871 // Restore the mask for previous partially matched values.
14872 Mask.resize(E->Scalars.size());
14873 const TreeEntry *FrontTE = Entries.front().front();
14874 if (FrontTE->ReorderIndices.empty() &&
14875 ((FrontTE->ReuseShuffleIndices.empty() &&
14876 E->Scalars.size() == FrontTE->Scalars.size()) ||
14877 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14878 std::iota(Mask.begin(), Mask.end(), 0);
14879 } else {
14880 for (auto [I, V] : enumerate(E->Scalars)) {
14881 if (isa<PoisonValue>(V)) {
14882 Mask[I] = PoisonMaskElem;
14883 continue;
14884 }
14885 Mask[I] = FrontTE->findLaneForValue(V);
14886 }
14887 }
14888 ShuffleBuilder.add(*FrontTE, Mask);
14889 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
14890 SubVectorsMask);
14891 return Res;
14892 }
14893 if (!Resized) {
14894 if (GatheredScalars.size() != VF &&
14895 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14896 return any_of(TEs, [&](const TreeEntry *TE) {
14897 return TE->getVectorFactor() == VF;
14898 });
14899 }))
14900 GatheredScalars.append(VF - GatheredScalars.size(),
14901 PoisonValue::get(OrigScalarTy));
14902 }
14903 // Remove shuffled elements from list of gathers.
14904 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14905 if (Mask[I] != PoisonMaskElem)
14906 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14907 }
14908 }
14909 }
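// TryPackScalars compacts the gathered scalars before building the vector:
// duplicates are collapsed onto a single lane via ReuseMask, and for splats
// the value is moved to lane 0 so a broadcast can be emitted. E.g.
// (illustrative values) Scalars = {%v, undef, %v, %v} becomes
// {%v, poison, poison, poison} with ReuseMask = {0, 0, 0, 0} when %v is known
// not to be poison; otherwise the undef lane stays poison and the final
// vector is frozen.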
14910 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14911 SmallVectorImpl<int> &ReuseMask,
14912 bool IsRootPoison) {
14913 // For splats we can emit broadcasts instead of gathers, so try to find
14914 // such sequences.
14915 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14916 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14917 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14918 SmallVector<int> UndefPos;
14919 DenseMap<Value *, unsigned> UniquePositions;
14920 // Gather unique non-const values and all constant values.
14921 // For repeated values, just shuffle them.
14922 int NumNonConsts = 0;
14923 int SinglePos = 0;
14924 for (auto [I, V] : enumerate(Scalars)) {
14925 if (isa<UndefValue>(V)) {
14926 if (!isa<PoisonValue>(V)) {
14927 ReuseMask[I] = I;
14928 UndefPos.push_back(I);
14929 }
14930 continue;
14931 }
14932 if (isConstant(V)) {
14933 ReuseMask[I] = I;
14934 continue;
14935 }
14936 ++NumNonConsts;
14937 SinglePos = I;
14938 Value *OrigV = V;
14939 Scalars[I] = PoisonValue::get(OrigScalarTy);
14940 if (IsSplat) {
14941 Scalars.front() = OrigV;
14942 ReuseMask[I] = 0;
14943 } else {
14944 const auto Res = UniquePositions.try_emplace(OrigV, I);
14945 Scalars[Res.first->second] = OrigV;
14946 ReuseMask[I] = Res.first->second;
14947 }
14948 }
14949 if (NumNonConsts == 1) {
14950 // Restore single insert element.
14951 if (IsSplat) {
14952 ReuseMask.assign(VF, PoisonMaskElem);
14953 std::swap(Scalars.front(), Scalars[SinglePos]);
14954 if (!UndefPos.empty() && UndefPos.front() == 0)
14955 Scalars.front() = UndefValue::get(OrigScalarTy);
14956 }
14957 ReuseMask[SinglePos] = SinglePos;
14958 } else if (!UndefPos.empty() && IsSplat) {
14959 // For undef values, try to replace them with a simple broadcast.
14960 // We can do it if the broadcasted value is guaranteed to be
14961 // non-poisonous, or by freezing the incoming scalar value first.
14962 auto *It = find_if(Scalars, [this, E](Value *V) {
14963 return !isa<UndefValue>(V) &&
14964 (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
14965 (E->UserTreeIndices.size() == 1 &&
14966 any_of(V->uses(), [E](const Use &U) {
14967 // Check if the value is already used in the same operation in
14968 // one of the nodes.
14969 return E->UserTreeIndices.front().EdgeIdx !=
14970 U.getOperandNo() &&
14971 is_contained(
14972 E->UserTreeIndices.front().UserTE->Scalars,
14973 U.getUser());
14974 })));
14975 });
14976 if (It != Scalars.end()) {
14977 // Replace undefs by the non-poisoned scalars and emit broadcast.
14978 int Pos = std::distance(Scalars.begin(), It);
14979 for (int I : UndefPos) {
14980 // Set the undef position to the non-poisoned scalar.
14981 ReuseMask[I] = Pos;
14982 // Replace the undef with poison; in the mask it is already replaced
14983 // by the non-poisoned scalar.
14984 if (I != Pos)
14985 Scalars[I] = PoisonValue::get(OrigScalarTy);
14986 }
14987 } else {
14988 // Replace undefs by the poisons, emit broadcast and then emit
14989 // freeze.
14990 for (int I : UndefPos) {
14991 ReuseMask[I] = PoisonMaskElem;
14992 if (isa<UndefValue>(Scalars[I]))
14993 Scalars[I] = PoisonValue::get(OrigScalarTy);
14994 }
14995 NeedFreeze = true;
14996 }
14997 }
14998 };
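// Combine the pieces collected above: shuffles of the extractelement source
// vectors, shuffles of already-vectorized tree entries, and a buildvector of
// whatever scalars remain, all funneled through ShuffleBuilder and finalized
// with the node's reuse mask.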
14999 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15000 bool IsNonPoisoned = true;
15001 bool IsUsedInExpr = true;
15002 Value *Vec1 = nullptr;
15003 if (!ExtractShuffles.empty()) {
15004 // Gather of extractelements can be represented as just a shuffle of
15005 // a single/two vectors the scalars are extracted from.
15006 // Find input vectors.
15007 Value *Vec2 = nullptr;
15008 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15009 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15010 ExtractMask[I] = PoisonMaskElem;
15011 }
15012 if (UseVecBaseAsInput) {
15013 Vec1 = ExtractVecBase;
15014 } else {
15015 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15016 if (ExtractMask[I] == PoisonMaskElem)
15017 continue;
15018 if (isa<UndefValue>(E->Scalars[I]))
15019 continue;
15020 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15021 Value *VecOp = EI->getVectorOperand();
15022 if (const auto *TE = getTreeEntry(VecOp))
15023 if (TE->VectorizedValue)
15024 VecOp = TE->VectorizedValue;
15025 if (!Vec1) {
15026 Vec1 = VecOp;
15027 } else if (Vec1 != VecOp) {
15028 assert((!Vec2 || Vec2 == VecOp) &&
15029 "Expected only 1 or 2 vectors shuffle.");
15030 Vec2 = VecOp;
15031 }
15032 }
15033 }
15034 if (Vec2) {
15035 IsUsedInExpr = false;
15036 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15037 isGuaranteedNotToBePoison(Vec2, AC);
15038 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15039 } else if (Vec1) {
15040 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15041 IsUsedInExpr &= FindReusedSplat(
15042 ExtractMask,
15043 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15044 ExtractMask.size(), IsNotPoisonedVec);
15045 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15046 IsNonPoisoned &= IsNotPoisonedVec;
15047 } else {
15048 IsUsedInExpr = false;
15049 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15050 /*ForExtracts=*/true);
15051 }
15052 }
15053 if (!GatherShuffles.empty()) {
15054 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15055 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15056 for (const auto [I, TEs] : enumerate(Entries)) {
15057 if (TEs.empty()) {
15058 assert(!GatherShuffles[I] &&
15059 "No shuffles with empty entries list expected.");
15060 continue;
15061 }
15062 assert((TEs.size() == 1 || TEs.size() == 2) &&
15063 "Expected shuffle of 1 or 2 entries.");
15064 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15065 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15066 VecMask.assign(VecMask.size(), PoisonMaskElem);
15067 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15068 if (TEs.size() == 1) {
15069 bool IsNotPoisonedVec =
15070 TEs.front()->VectorizedValue
15071 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15072 : true;
15073 IsUsedInExpr &=
15074 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15075 SliceSize, IsNotPoisonedVec);
15076 ShuffleBuilder.add(*TEs.front(), VecMask);
15077 IsNonPoisoned &= IsNotPoisonedVec;
15078 } else {
15079 IsUsedInExpr = false;
15080 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15081 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15082 IsNonPoisoned &=
15083 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15084 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15085 }
15086 }
15087 }
15088 // Try to figure out best way to combine values: build a shuffle and insert
15089 // elements or just build several shuffles.
15090 // Insert non-constant scalars.
15091 SmallVector<Value *> NonConstants(GatheredScalars);
15092 int EMSz = ExtractMask.size();
15093 int MSz = Mask.size();
15094 // Try to build a constant vector and shuffle with it only if we currently
15095 // have a single permutation and more than one scalar constant.
15096 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15097 bool IsIdentityShuffle =
15098 ((UseVecBaseAsInput ||
15099 all_of(ExtractShuffles,
15100 [](const std::optional<TTI::ShuffleKind> &SK) {
15101 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15102 TTI::SK_PermuteSingleSrc;
15103 })) &&
15104 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15105 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15106 (!GatherShuffles.empty() &&
15107 all_of(GatherShuffles,
15108 [](const std::optional<TTI::ShuffleKind> &SK) {
15109 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15110 TTI::SK_PermuteSingleSrc;
15111 }) &&
15112 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15113 ShuffleVectorInst::isIdentityMask(Mask, MSz));
15114 bool EnoughConstsForShuffle =
15115 IsSingleShuffle &&
15116 (none_of(GatheredScalars,
15117 [](Value *V) {
15118 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15119 }) ||
15120 any_of(GatheredScalars,
15121 [](Value *V) {
15122 return isa<Constant>(V) && !isa<UndefValue>(V);
15123 })) &&
15124 (!IsIdentityShuffle ||
15125 (GatheredScalars.size() == 2 &&
15126 any_of(GatheredScalars,
15127 [](Value *V) { return !isa<UndefValue>(V); })) ||
15128 count_if(GatheredScalars, [](Value *V) {
15129 return isa<Constant>(V) && !isa<PoisonValue>(V);
15130 }) > 1);
15131 // The NonConstants array contains just the non-constant values, while
15132 // GatheredScalars contains only the constants to build the final vector and then shuffle.
15133 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15134 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15135 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15136 else
15137 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15138 }
15139 // Generate constants for final shuffle and build a mask for them.
15140 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15141 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15142 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15143 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15144 ShuffleBuilder.add(BV, BVMask);
15145 }
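// If every remaining non-constant lane is effectively poison/undef, the
// constant buildvector alone is enough; otherwise finalize with a callback
// that gathers the non-constant scalars into the partially built vector.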
15146 if (all_of(NonConstants, [=](Value *V) {
15147 return isa<PoisonValue>(V) ||
15148 (IsSingleShuffle && ((IsIdentityShuffle &&
15149 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15150 }))
15151 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15152 SubVectorsMask);
15153 else
15154 Res = ShuffleBuilder.finalize(
15155 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15156 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15157 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15158 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15159 });
15160 } else if (!allConstant(GatheredScalars)) {
15161 // Gather unique scalars and all constants.
15162 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15163 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15164 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15165 ShuffleBuilder.add(BV, ReuseMask);
15166 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15167 SubVectorsMask);
15168 } else {
15169 // Gather all constants.
15170 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15171 for (auto [I, V] : enumerate(GatheredScalars)) {
15172 if (!isa<PoisonValue>(V))
15173 Mask[I] = I;
15174 }
15175 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15176 ShuffleBuilder.add(BV, Mask);
15177 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15178 SubVectorsMask);
15179 }
15180
15181 if (NeedFreeze)
15182 Res = ShuffleBuilder.createFreeze(Res);
15183 return Res;
15184}
15185
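// Emit the vector value for a gather/buildvector node. Combined sub-entries
// are vectorized first so that processBuildVector can insert them as
// subvectors.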
15186Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15187 bool PostponedPHIs) {
15188 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15189 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15190 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15191 Builder, *this);
15192}
15193
15194/// \returns \p I after propagating metadata from \p VL only for instructions in
15195/// \p VL.
15196static Value *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15197 SmallVector<Value *> Insts;
15198 for (Value *V : VL)
15199 if (isa<Instruction>(V))
15200 Insts.push_back(V);
15201 return llvm::propagateMetadata(Inst, Insts);
15202}
15203
15204Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15205 IRBuilderBase::InsertPointGuard Guard(Builder);
15206
15207 if (E->VectorizedValue &&
15208 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15209 E->isAltShuffle())) {
15210 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15211 return E->VectorizedValue;
15212 }
15213
15214 Value *V = E->Scalars.front();
15215 Type *ScalarTy = V->getType();
15216 if (!isa<CmpInst>(V))
15217 ScalarTy = getValueType(V);
15218 auto It = MinBWs.find(E);
15219 if (It != MinBWs.end()) {
15220 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15221 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15222 if (VecTy)
15223 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15224 }
15225 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15226 if (E->isGather()) {
15227 // Set insert point for non-reduction initial nodes.
15228 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
15229 setInsertPointAfterBundle(E);
15230 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15231 E->VectorizedValue = Vec;
15232 return Vec;
15233 }
15234
15235 bool IsReverseOrder =
15236 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
15237 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15238 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15239 if (E->getOpcode() == Instruction::Store &&
15240 E->State == TreeEntry::Vectorize) {
15241 ArrayRef<int> Mask =
15242 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15243 E->ReorderIndices.size());
15244 ShuffleBuilder.add(V, Mask);
15245 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15246 ShuffleBuilder.addOrdered(V, {});
15247 } else {
15248 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15249 }
15250 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15251 E->CombinedEntriesWithIndices.size());
15252 transform(
15253 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15254 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15255 });
15256 assert(
15257 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15258 "Expected either combined subnodes or reordering");
15259 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15260 };
15261
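// Each opcode handled below generally vectorizes the node's operands, applies
// any integer casts implied by MinBWs, and runs FinalShuffle so that
// reordering and reuse masks are honored in the emitted vector.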
15262 assert(!E->isGather() && "Unhandled state");
15263 unsigned ShuffleOrOp =
15264 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15265 Instruction *VL0 = E->getMainOp();
15266 auto GetOperandSignedness = [&](unsigned Idx) {
15267 const TreeEntry *OpE = getOperandEntry(E, Idx);
15268 bool IsSigned = false;
15269 auto It = MinBWs.find(OpE);
15270 if (It != MinBWs.end())
15271 IsSigned = It->second.second;
15272 else
15273 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15274 if (isa<PoisonValue>(V))
15275 return false;
15276 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15277 });
15278 return IsSigned;
15279 };
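// MinBWs maps a tree entry to the bit width it was demoted to (first) and
// whether sign extension is required when widening back (second);
// GetOperandSignedness falls back to value analysis when an operand has no
// entry.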
15280 switch (ShuffleOrOp) {
15281 case Instruction::PHI: {
15282 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15283 E != VectorizableTree.front().get() ||
15284 !E->UserTreeIndices.empty()) &&
15285 "PHI reordering is free.");
15286 if (PostponedPHIs && E->VectorizedValue)
15287 return E->VectorizedValue;
15288 auto *PH = cast<PHINode>(VL0);
15289 Builder.SetInsertPoint(PH->getParent(),
15290 PH->getParent()->getFirstNonPHIIt());
15291 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15292 if (PostponedPHIs || !E->VectorizedValue) {
15293 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15294 E->PHI = NewPhi;
15295 Value *V = NewPhi;
15296
15297 // Adjust insertion point once all PHI's have been generated.
15298 Builder.SetInsertPoint(PH->getParent(),
15299 PH->getParent()->getFirstInsertionPt());
15300 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15301
15302 V = FinalShuffle(V, E);
15303
15304 E->VectorizedValue = V;
15305 if (PostponedPHIs)
15306 return V;
15307 }
15308 PHINode *NewPhi = cast<PHINode>(E->PHI);
15309 // If phi node is fully emitted - exit.
15310 if (NewPhi->getNumIncomingValues() != 0)
15311 return NewPhi;
15312
15313 // PHINodes may have multiple entries from the same block. We want to
15314 // visit every block once.
15315 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15316
15317 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15319 BasicBlock *IBB = PH->getIncomingBlock(I);
15320
15321 // Stop emission if all incoming values are generated.
15322 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15323 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15324 return NewPhi;
15325 }
15326
15327 if (!VisitedBBs.insert(IBB).second) {
15328 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15329 continue;
15330 }
15331
15332 Builder.SetInsertPoint(IBB->getTerminator());
15333 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15334 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15335 if (VecTy != Vec->getType()) {
15336 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15337 MinBWs.contains(getOperandEntry(E, I))) &&
15338 "Expected item in MinBWs.");
15339 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15340 }
15341 NewPhi->addIncoming(Vec, IBB);
15342 }
15343
15344 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15345 "Invalid number of incoming values");
15346 assert(E->VectorizedValue && "Expected vectorized value.");
15347 return E->VectorizedValue;
15348 }
15349
15350 case Instruction::ExtractElement: {
15351 Value *V = E->getSingleOperand(0);
15352 if (const TreeEntry *TE = getTreeEntry(V))
15353 V = TE->VectorizedValue;
15354 setInsertPointAfterBundle(E);
15355 V = FinalShuffle(V, E);
15356 E->VectorizedValue = V;
15357 return V;
15358 }
15359 case Instruction::ExtractValue: {
15360 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15361 Builder.SetInsertPoint(LI);
15362 Value *Ptr = LI->getPointerOperand();
15363 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15364 Value *NewV = ::propagateMetadata(V, E->Scalars);
15365 NewV = FinalShuffle(NewV, E);
15366 E->VectorizedValue = NewV;
15367 return NewV;
15368 }
15369 case Instruction::InsertElement: {
15370 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15371 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15372 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15373 ArrayRef<Value *> Op = E->getOperand(1);
15374 Type *ScalarTy = Op.front()->getType();
15375 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15376 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15377 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15378 assert(Res.first > 0 && "Expected item in MinBWs.");
15379 V = Builder.CreateIntCast(
15380 V,
15381 getWidenedType(
15382 ScalarTy,
15383 cast<FixedVectorType>(V->getType())->getNumElements()),
15384 Res.second);
15385 }
15386
15387 // Create InsertVector shuffle if necessary
15388 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15389 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15390 }));
15391 const unsigned NumElts =
15392 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15393 const unsigned NumScalars = E->Scalars.size();
15394
15395 unsigned Offset = *getElementIndex(VL0);
15396 assert(Offset < NumElts && "Failed to find vector index offset");
15397
15398 // Create shuffle to resize vector
15399 SmallVector<int> Mask;
15400 if (!E->ReorderIndices.empty()) {
15401 inversePermutation(E->ReorderIndices, Mask);
15402 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15403 } else {
15404 Mask.assign(NumElts, PoisonMaskElem);
15405 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15406 }
15407 // Create InsertVector shuffle if necessary
15408 bool IsIdentity = true;
15409 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15410 Mask.swap(PrevMask);
15411 for (unsigned I = 0; I < NumScalars; ++I) {
15412 Value *Scalar = E->Scalars[PrevMask[I]];
15413 unsigned InsertIdx = *getElementIndex(Scalar);
15414 IsIdentity &= InsertIdx - Offset == I;
15415 Mask[InsertIdx - Offset] = I;
15416 }
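// A resizing/permuting shuffle is only needed when the inserted scalars do
// not already form an identity mapping onto lanes [Offset, Offset + NumScalars)
// of the NumElts-wide destination vector.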
15417 if (!IsIdentity || NumElts != NumScalars) {
15418 Value *V2 = nullptr;
15419 bool IsVNonPoisonous =
15420 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
15421 SmallVector<int> InsertMask(Mask);
15422 if (NumElts != NumScalars && Offset == 0) {
15423 // Follow all insert element instructions from the current buildvector
15424 // sequence.
15425 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15426 do {
15427 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15428 if (!InsertIdx)
15429 break;
15430 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15431 InsertMask[*InsertIdx] = *InsertIdx;
15432 if (!Ins->hasOneUse())
15433 break;
15434 Ins = dyn_cast_or_null<InsertElementInst>(
15435 Ins->getUniqueUndroppableUser());
15436 } while (Ins);
15437 SmallBitVector UseMask =
15438 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15439 SmallBitVector IsFirstPoison =
15440 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15441 SmallBitVector IsFirstUndef =
15442 isUndefVector(FirstInsert->getOperand(0), UseMask);
15443 if (!IsFirstPoison.all()) {
15444 unsigned Idx = 0;
15445 for (unsigned I = 0; I < NumElts; I++) {
15446 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15447 IsFirstUndef.test(I)) {
15448 if (IsVNonPoisonous) {
15449 InsertMask[I] = I < NumScalars ? I : 0;
15450 continue;
15451 }
15452 if (!V2)
15453 V2 = UndefValue::get(V->getType());
15454 if (Idx >= NumScalars)
15455 Idx = NumScalars - 1;
15456 InsertMask[I] = NumScalars + Idx;
15457 ++Idx;
15458 } else if (InsertMask[I] != PoisonMaskElem &&
15459 Mask[I] == PoisonMaskElem) {
15460 InsertMask[I] = PoisonMaskElem;
15461 }
15462 }
15463 } else {
15464 InsertMask = Mask;
15465 }
15466 }
15467 if (!V2)
15468 V2 = PoisonValue::get(V->getType());
15469 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15470 if (auto *I = dyn_cast<Instruction>(V)) {
15471 GatherShuffleExtractSeq.insert(I);
15472 CSEBlocks.insert(I->getParent());
15473 }
15474 }
15475
15476 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15477 for (unsigned I = 0; I < NumElts; I++) {
15478 if (Mask[I] != PoisonMaskElem)
15479 InsertMask[Offset + I] = I;
15480 }
15481 SmallBitVector UseMask =
15482 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15483 SmallBitVector IsFirstUndef =
15484 isUndefVector(FirstInsert->getOperand(0), UseMask);
15485 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15486 NumElts != NumScalars) {
15487 if (IsFirstUndef.all()) {
15488 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15489 SmallBitVector IsFirstPoison =
15490 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15491 if (!IsFirstPoison.all()) {
15492 for (unsigned I = 0; I < NumElts; I++) {
15493 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15494 InsertMask[I] = I + NumElts;
15495 }
15496 }
15497 V = Builder.CreateShuffleVector(
15498 V,
15499 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15500 : FirstInsert->getOperand(0),
15501 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15502 if (auto *I = dyn_cast<Instruction>(V)) {
15503 GatherShuffleExtractSeq.insert(I);
15504 CSEBlocks.insert(I->getParent());
15505 }
15506 }
15507 } else {
15508 SmallBitVector IsFirstPoison =
15509 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15510 for (unsigned I = 0; I < NumElts; I++) {
15511 if (InsertMask[I] == PoisonMaskElem)
15512 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15513 else
15514 InsertMask[I] += NumElts;
15515 }
15516 V = Builder.CreateShuffleVector(
15517 FirstInsert->getOperand(0), V, InsertMask,
15518 cast<Instruction>(E->Scalars.back())->getName());
15519 if (auto *I = dyn_cast<Instruction>(V)) {
15520 GatherShuffleExtractSeq.insert(I);
15521 CSEBlocks.insert(I->getParent());
15522 }
15523 }
15524 }
15525
15526 ++NumVectorInstructions;
15527 E->VectorizedValue = V;
15528 return V;
15529 }
15530 case Instruction::ZExt:
15531 case Instruction::SExt:
15532 case Instruction::FPToUI:
15533 case Instruction::FPToSI:
15534 case Instruction::FPExt:
15535 case Instruction::PtrToInt:
15536 case Instruction::IntToPtr:
15537 case Instruction::SIToFP:
15538 case Instruction::UIToFP:
15539 case Instruction::Trunc:
15540 case Instruction::FPTrunc:
15541 case Instruction::BitCast: {
15542 setInsertPointAfterBundle(E);
15543
15544 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15545 if (E->VectorizedValue) {
15546 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15547 return E->VectorizedValue;
15548 }
15549
15550 auto *CI = cast<CastInst>(VL0);
15551 Instruction::CastOps VecOpcode = CI->getOpcode();
15552 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15553 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15554 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15555 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15556 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15557 // Check if the values are candidates to demote.
15558 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15559 if (SrcIt != MinBWs.end())
15560 SrcBWSz = SrcIt->second.first;
15561 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15562 if (BWSz == SrcBWSz) {
15563 VecOpcode = Instruction::BitCast;
15564 } else if (BWSz < SrcBWSz) {
15565 VecOpcode = Instruction::Trunc;
15566 } else if (It != MinBWs.end()) {
15567 assert(BWSz > SrcBWSz && "Invalid cast!");
15568 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15569 } else if (SrcIt != MinBWs.end()) {
15570 assert(BWSz > SrcBWSz && "Invalid cast!");
15571 VecOpcode =
15572 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15573 }
15574 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15575 !SrcIt->second.second) {
15576 VecOpcode = Instruction::UIToFP;
15577 }
15578 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15579 ? InVec
15580 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15581 V = FinalShuffle(V, E);
15582
15583 E->VectorizedValue = V;
15584 ++NumVectorInstructions;
15585 return V;
15586 }
15587 case Instruction::FCmp:
15588 case Instruction::ICmp: {
15589 setInsertPointAfterBundle(E);
15590
15591 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15592 if (E->VectorizedValue) {
15593 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15594 return E->VectorizedValue;
15595 }
15596 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15597 if (E->VectorizedValue) {
15598 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15599 return E->VectorizedValue;
15600 }
15601 if (L->getType() != R->getType()) {
15602 assert((getOperandEntry(E, 0)->isGather() ||
15603 getOperandEntry(E, 1)->isGather() ||
15604 MinBWs.contains(getOperandEntry(E, 0)) ||
15605 MinBWs.contains(getOperandEntry(E, 1))) &&
15606 "Expected item in MinBWs.");
15607 if (cast<VectorType>(L->getType())
15608 ->getElementType()
15609 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15610 ->getElementType()
15611 ->getIntegerBitWidth()) {
15612 Type *CastTy = R->getType();
15613 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15614 } else {
15615 Type *CastTy = L->getType();
15616 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15617 }
15618 }
15619
15620 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15621 Value *V = Builder.CreateCmp(P0, L, R);
15622 propagateIRFlags(V, E->Scalars, VL0);
15623 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15624 ICmp->setSameSign(/*B=*/false);
15625 // Do not cast for cmps.
15626 VecTy = cast<FixedVectorType>(V->getType());
15627 V = FinalShuffle(V, E);
15628
15629 E->VectorizedValue = V;
15630 ++NumVectorInstructions;
15631 return V;
15632 }
15633 case Instruction::Select: {
15634 setInsertPointAfterBundle(E);
15635
15636 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15637 if (E->VectorizedValue) {
15638 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15639 return E->VectorizedValue;
15640 }
15641 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15642 if (E->VectorizedValue) {
15643 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15644 return E->VectorizedValue;
15645 }
15646 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15647 if (E->VectorizedValue) {
15648 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15649 return E->VectorizedValue;
15650 }
15651 if (True->getType() != VecTy || False->getType() != VecTy) {
15652 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15653 getOperandEntry(E, 2)->isGather() ||
15654 MinBWs.contains(getOperandEntry(E, 1)) ||
15655 MinBWs.contains(getOperandEntry(E, 2))) &&
15656 "Expected item in MinBWs.");
15657 if (True->getType() != VecTy)
15658 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15659 if (False->getType() != VecTy)
15660 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15661 }
15662
15663 unsigned CondNumElements = getNumElements(Cond->getType());
15664 unsigned TrueNumElements = getNumElements(True->getType());
15665 assert(TrueNumElements >= CondNumElements &&
15666 TrueNumElements % CondNumElements == 0 &&
15667 "Cannot vectorize Instruction::Select");
15668 assert(TrueNumElements == getNumElements(False->getType()) &&
15669 "Cannot vectorize Instruction::Select");
15670 if (CondNumElements != TrueNumElements) {
15671 // When the return type is i1 but the source is a fixed vector type, we
15672 // need to duplicate the condition value.
15673 Cond = Builder.CreateShuffleVector(
15674 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15675 CondNumElements));
15676 }
15677 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15678 "Cannot vectorize Instruction::Select");
15679 Value *V = Builder.CreateSelect(Cond, True, False);
15680 V = FinalShuffle(V, E);
15681
15682 E->VectorizedValue = V;
15683 ++NumVectorInstructions;
15684 return V;
15685 }
15686 case Instruction::FNeg: {
15687 setInsertPointAfterBundle(E);
15688
15689 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15690
15691 if (E->VectorizedValue) {
15692 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15693 return E->VectorizedValue;
15694 }
15695
15696 Value *V = Builder.CreateUnOp(
15697 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15698 propagateIRFlags(V, E->Scalars, VL0);
15699 if (auto *I = dyn_cast<Instruction>(V))
15700 V = ::propagateMetadata(I, E->Scalars);
15701
15702 V = FinalShuffle(V, E);
15703
15704 E->VectorizedValue = V;
15705 ++NumVectorInstructions;
15706
15707 return V;
15708 }
15709 case Instruction::Freeze: {
15710 setInsertPointAfterBundle(E);
15711
15712 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15713
15714 if (E->VectorizedValue) {
15715 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15716 return E->VectorizedValue;
15717 }
15718
15719 if (Op->getType() != VecTy) {
15720 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15721 MinBWs.contains(getOperandEntry(E, 0))) &&
15722 "Expected item in MinBWs.");
15723 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15724 }
15725 Value *V = Builder.CreateFreeze(Op);
15726 V = FinalShuffle(V, E);
15727
15728 E->VectorizedValue = V;
15729 ++NumVectorInstructions;
15730
15731 return V;
15732 }
15733 case Instruction::Add:
15734 case Instruction::FAdd:
15735 case Instruction::Sub:
15736 case Instruction::FSub:
15737 case Instruction::Mul:
15738 case Instruction::FMul:
15739 case Instruction::UDiv:
15740 case Instruction::SDiv:
15741 case Instruction::FDiv:
15742 case Instruction::URem:
15743 case Instruction::SRem:
15744 case Instruction::FRem:
15745 case Instruction::Shl:
15746 case Instruction::LShr:
15747 case Instruction::AShr:
15748 case Instruction::And:
15749 case Instruction::Or:
15750 case Instruction::Xor: {
15751 setInsertPointAfterBundle(E);
15752
15753 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15754 if (E->VectorizedValue) {
15755 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15756 return E->VectorizedValue;
15757 }
15758 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15759 if (E->VectorizedValue) {
15760 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15761 return E->VectorizedValue;
15762 }
15763 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15764 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15765 ArrayRef<Value *> Ops = E->getOperand(I);
15766 if (all_of(Ops, [&](Value *Op) {
15767 auto *CI = dyn_cast<ConstantInt>(Op);
15768 return CI && CI->getValue().countr_one() >= It->second.first;
15769 })) {
15770 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15771 E->VectorizedValue = V;
15772 ++NumVectorInstructions;
15773 return V;
15774 }
15775 }
15776 }
15777 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15778 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15779 getOperandEntry(E, 1)->isGather() ||
15780 MinBWs.contains(getOperandEntry(E, 0)) ||
15781 MinBWs.contains(getOperandEntry(E, 1))) &&
15782 "Expected item in MinBWs.");
15783 if (LHS->getType() != VecTy)
15784 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15785 if (RHS->getType() != VecTy)
15786 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15787 }
15788
15789 Value *V = Builder.CreateBinOp(
15790 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15791 RHS);
15792 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15793 if (auto *I = dyn_cast<Instruction>(V)) {
15794 V = ::propagateMetadata(I, E->Scalars);
15795 // Drop nuw flags for abs(sub(commutative), true).
15796 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15797 any_of(E->Scalars, [](Value *V) {
15798 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15799 }))
15800 I->setHasNoUnsignedWrap(/*b=*/false);
15801 }
15802
15803 V = FinalShuffle(V, E);
15804
15805 E->VectorizedValue = V;
15806 ++NumVectorInstructions;
15807
15808 return V;
15809 }
15810 case Instruction::Load: {
15811 // Loads are inserted at the head of the tree because we don't want to
15812 // sink them all the way down past store instructions.
15813 setInsertPointAfterBundle(E);
15814
15815 LoadInst *LI = cast<LoadInst>(VL0);
15816 Instruction *NewLI;
15817 Value *PO = LI->getPointerOperand();
15818 if (E->State == TreeEntry::Vectorize) {
15819 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15820 } else if (E->State == TreeEntry::StridedVectorize) {
15821 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15822 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15823 PO = IsReverseOrder ? PtrN : Ptr0;
15824 std::optional<int> Diff = getPointersDiff(
15825 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15826 Type *StrideTy = DL->getIndexType(PO->getType());
15827 Value *StrideVal;
15828 if (Diff) {
15829 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15830 StrideVal =
15831 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15832 DL->getTypeAllocSize(ScalarTy));
15833 } else {
15834 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15835 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15836 return cast<LoadInst>(V)->getPointerOperand();
15837 });
15838 OrdersType Order;
15839 std::optional<Value *> Stride =
15840 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15841 &*Builder.GetInsertPoint());
15842 Value *NewStride =
15843 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15844 StrideVal = Builder.CreateMul(
15845 NewStride,
15846 ConstantInt::get(
15847 StrideTy,
15848 (IsReverseOrder ? -1 : 1) *
15849 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15850 }
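// Emit the strided access as an llvm.experimental.vp.strided.load: the stride
// is expressed in bytes, negated for reversed orders, and the common alignment
// of the scalar loads is attached to the pointer argument.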
15851 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15852 auto *Inst = Builder.CreateIntrinsic(
15853 Intrinsic::experimental_vp_strided_load,
15854 {VecTy, PO->getType(), StrideTy},
15855 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15856 Builder.getInt32(E->Scalars.size())});
15857 Inst->addParamAttr(
15858 /*ArgNo=*/0,
15859 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15860 NewLI = Inst;
15861 } else {
15862 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15863 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15864 if (E->VectorizedValue) {
15865 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15866 return E->VectorizedValue;
15867 }
15868 if (isa<FixedVectorType>(ScalarTy)) {
15869 assert(SLPReVec && "FixedVectorType is not expected.");
15870 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
15871 // to expand VecPtr if ScalarTy is a vector type.
15872 unsigned ScalarTyNumElements =
15873 cast<FixedVectorType>(ScalarTy)->getNumElements();
15874 unsigned VecTyNumElements =
15875 cast<FixedVectorType>(VecTy)->getNumElements();
15876 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15877 "Cannot expand getelementptr.");
15878 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15879 SmallVector<Constant *> Indices(VecTyNumElements);
15880 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15881 return Builder.getInt64(I % ScalarTyNumElements);
15882 });
15883 VecPtr = Builder.CreateGEP(
15884 VecTy->getElementType(),
15885 Builder.CreateShuffleVector(
15886 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15887 ConstantVector::get(Indices));
15888 }
15889 // Use the minimum alignment of the gathered loads.
15890 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15891 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15892 }
15893 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15894
15895 V = FinalShuffle(V, E);
15896 E->VectorizedValue = V;
15897 ++NumVectorInstructions;
15898 return V;
15899 }
15900 case Instruction::Store: {
15901 auto *SI = cast<StoreInst>(VL0);
15902
15903 setInsertPointAfterBundle(E);
15904
15905 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15906 if (VecValue->getType() != VecTy)
15907 VecValue =
15908 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15909 VecValue = FinalShuffle(VecValue, E);
15910
15911 Value *Ptr = SI->getPointerOperand();
15912 Instruction *ST;
15913 if (E->State == TreeEntry::Vectorize) {
15914 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15915 } else {
15916 assert(E->State == TreeEntry::StridedVectorize &&
15917 "Expected either strided or consecutive stores.");
15918 if (!E->ReorderIndices.empty()) {
15919 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15920 Ptr = SI->getPointerOperand();
15921 }
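// Strided stores are emitted as llvm.experimental.vp.strided.store with a
// negative byte stride, starting from the pointer of the first store in the
// reordered sequence, with the common alignment attached to the pointer
// argument.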
15922 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15923 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15924 auto *Inst = Builder.CreateIntrinsic(
15925 Intrinsic::experimental_vp_strided_store,
15926 {VecTy, Ptr->getType(), StrideTy},
15927 {VecValue, Ptr,
15928 ConstantInt::get(
15929 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15930 Builder.getAllOnesMask(VecTy->getElementCount()),
15931 Builder.getInt32(E->Scalars.size())});
15932 Inst->addParamAttr(
15933 /*ArgNo=*/1,
15934 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15935 ST = Inst;
15936 }
15937
15938 Value *V = ::propagateMetadata(ST, E->Scalars);
15939
15940 E->VectorizedValue = V;
15941 ++NumVectorInstructions;
15942 return V;
15943 }
15944 case Instruction::GetElementPtr: {
15945 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15946 setInsertPointAfterBundle(E);
15947
15948 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15949 if (E->VectorizedValue) {
15950 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15951 return E->VectorizedValue;
15952 }
15953
15954 SmallVector<Value *> OpVecs;
15955 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
15956 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
15957 if (E->VectorizedValue) {
15958 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15959 return E->VectorizedValue;
15960 }
15961 OpVecs.push_back(OpVec);
15962 }
15963
15964 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
15965 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
15966 SmallVector<Value *> GEPs;
15967 for (Value *V : E->Scalars) {
15968 if (isa<GetElementPtrInst>(V))
15969 GEPs.push_back(V);
15970 }
15971 V = ::propagateMetadata(I, GEPs);
15972 }
15973
15974 V = FinalShuffle(V, E);
15975
15976 E->VectorizedValue = V;
15977 ++NumVectorInstructions;
15978
15979 return V;
15980 }
15981 case Instruction::Call: {
15982 CallInst *CI = cast<CallInst>(VL0);
15983 setInsertPointAfterBundle(E);
15984
15985 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15986
15987 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15988 CI, ID, VecTy->getNumElements(),
15989 It != MinBWs.end() ? It->second.first : 0, TTI);
15990 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15991 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
15992 VecCallCosts.first <= VecCallCosts.second;
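// Prefer the intrinsic form when its cost does not exceed the library-call
// cost; otherwise a vectorized library function is looked up through
// VFDatabase below.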
15993
15994 Value *ScalarArg = nullptr;
15995 SmallVector<Value *> OpVecs;
15996 SmallVector<Type *, 2> TysForDecl;
15997 // Add return type if intrinsic is overloaded on it.
15998 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
15999 TysForDecl.push_back(VecTy);
16000 auto *CEI = cast<CallInst>(VL0);
16001 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16002 ValueList OpVL;
16003 // Some intrinsics have scalar arguments. This argument should not be
16004 // vectorized.
16005 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16006 ScalarArg = CEI->getArgOperand(I);
16007 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16008 // argument must be set to false (do not return poison if the value is signed min).
16009 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16010 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16011 ScalarArg = Builder.getFalse();
16012 OpVecs.push_back(ScalarArg);
16013 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16014 TysForDecl.push_back(ScalarArg->getType());
16015 continue;
16016 }
16017
16018 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16019 if (E->VectorizedValue) {
16020 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16021 return E->VectorizedValue;
16022 }
16023 ScalarArg = CEI->getArgOperand(I);
16024 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16025 ScalarArg->getType()->getScalarType() &&
16026 It == MinBWs.end()) {
16027 auto *CastTy =
16028 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16029 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16030 } else if (It != MinBWs.end()) {
16031 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16032 }
16033 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16034 OpVecs.push_back(OpVec);
16035 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16036 TysForDecl.push_back(OpVec->getType());
16037 }
16038
16039 Function *CF;
16040 if (!UseIntrinsic) {
16041 VFShape Shape =
16042 VFShape::get(CI->getFunctionType(),
16043 ElementCount::getFixed(
16044 static_cast<unsigned>(VecTy->getNumElements())),
16045 false /*HasGlobalPred*/);
16046 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16047 } else {
16048 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16049 }
16050
16051 SmallVector<OperandBundleDef, 1> OpBundles;
16052 CI->getOperandBundlesAsDefs(OpBundles);
16053 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16054
16055 propagateIRFlags(V, E->Scalars, VL0);
16056 V = FinalShuffle(V, E);
16057
16058 E->VectorizedValue = V;
16059 ++NumVectorInstructions;
16060 return V;
16061 }
16062 case Instruction::ShuffleVector: {
16063 Value *V;
16064 if (SLPReVec && !E->isAltShuffle()) {
16065 setInsertPointAfterBundle(E);
16066 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16067 if (E->VectorizedValue) {
16068 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16069 return E->VectorizedValue;
16070 }
16071 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16072 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16073 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16074 "Not supported shufflevector usage.");
16075 SmallVector<int> NewMask(ThisMask.size());
16076 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16077 return SVSrc->getShuffleMask()[Mask];
16078 });
16079 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16080 } else {
16081 V = Builder.CreateShuffleVector(Src, ThisMask);
16082 }
16083 propagateIRFlags(V, E->Scalars, VL0);
16084 if (auto *I = dyn_cast<Instruction>(V))
16085 V = ::propagateMetadata(I, E->Scalars);
16086 V = FinalShuffle(V, E);
16087 } else {
16088 assert(E->isAltShuffle() &&
16089 ((Instruction::isBinaryOp(E->getOpcode()) &&
16090 Instruction::isBinaryOp(E->getAltOpcode())) ||
16091 (Instruction::isCast(E->getOpcode()) &&
16092 Instruction::isCast(E->getAltOpcode())) ||
16093 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16094 "Invalid Shuffle Vector Operand");
16095
16096 Value *LHS = nullptr, *RHS = nullptr;
16097 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16098 setInsertPointAfterBundle(E);
16099 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16100 if (E->VectorizedValue) {
16101 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16102 return E->VectorizedValue;
16103 }
16104 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16105 } else {
16106 setInsertPointAfterBundle(E);
16107 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16108 }
16109 if (E->VectorizedValue) {
16110 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16111 return E->VectorizedValue;
16112 }
16113 if (LHS && RHS &&
16114 ((Instruction::isBinaryOp(E->getOpcode()) &&
16115 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16116 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16117 assert((It != MinBWs.end() ||
16118 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16119 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16120 MinBWs.contains(getOperandEntry(E, 0)) ||
16121 MinBWs.contains(getOperandEntry(E, 1))) &&
16122 "Expected item in MinBWs.");
16123 Type *CastTy = VecTy;
16124 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16125 if (cast<VectorType>(LHS->getType())
16126 ->getElementType()
16127 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16128 ->getElementType()
16129 ->getIntegerBitWidth())
16130 CastTy = RHS->getType();
16131 else
16132 CastTy = LHS->getType();
16133 }
16134 if (LHS->getType() != CastTy)
16135 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16136 if (RHS->getType() != CastTy)
16137 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16138 }
16139
16140 Value *V0, *V1;
16141 if (Instruction::isBinaryOp(E->getOpcode())) {
16142 V0 = Builder.CreateBinOp(
16143 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16144 V1 = Builder.CreateBinOp(
16145 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16146 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16147 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16148 auto *AltCI = cast<CmpInst>(E->getAltOp());
16149 CmpInst::Predicate AltPred = AltCI->getPredicate();
16150 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16151 } else {
16152 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16153 unsigned SrcBWSz = DL->getTypeSizeInBits(
16154 cast<VectorType>(LHS->getType())->getElementType());
16155 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16156 if (BWSz <= SrcBWSz) {
16157 if (BWSz < SrcBWSz)
16158 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16159 assert(LHS->getType() == VecTy &&
16160 "Expected same type as operand.");
16161 if (auto *I = dyn_cast<Instruction>(LHS))
16162 LHS = ::propagateMetadata(I, E->Scalars);
16163 LHS = FinalShuffle(LHS, E);
16164 E->VectorizedValue = LHS;
16165 ++NumVectorInstructions;
16166 return LHS;
16167 }
16168 }
16169 V0 = Builder.CreateCast(
16170 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16171 V1 = Builder.CreateCast(
16172 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16173 }
16174 // Add V0 and V1 to later analysis to try to find and remove matching
16175 // instruction, if any.
16176 for (Value *V : {V0, V1}) {
16177 if (auto *I = dyn_cast<Instruction>(V)) {
16178 GatherShuffleExtractSeq.insert(I);
16179 CSEBlocks.insert(I->getParent());
16180 }
16181 }
16182
16183 // Create shuffle to take alternate operations from the vector.
16184 // Also, gather up main and alt scalar ops to propagate IR flags to
16185 // each vector operation.
16186 ValueList OpScalars, AltScalars;
16187 SmallVector<int> Mask;
16188 E->buildAltOpShuffleMask(
16189 [E, this](Instruction *I) {
16190 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16191 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16192 *TLI);
16193 },
16194 Mask, &OpScalars, &AltScalars);
16195
16196 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16197 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16198 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16199 // Drop nuw flags for abs(sub(commutative), true).
16200 if (auto *I = dyn_cast<Instruction>(Vec);
16201 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16202 any_of(E->Scalars, [](Value *V) {
16203 if (isa<PoisonValue>(V))
16204 return false;
16205 auto *IV = cast<Instruction>(V);
16206 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16207 }))
16208 I->setHasNoUnsignedWrap(/*b=*/false);
16209 };
16210 DropNuwFlag(V0, E->getOpcode());
16211 DropNuwFlag(V1, E->getAltOpcode());
16212
16213 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16214 assert(SLPReVec && "FixedVectorType is not expected.");
16215 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16216 }
16217 V = Builder.CreateShuffleVector(V0, V1, Mask);
16218 if (auto *I = dyn_cast<Instruction>(V)) {
16219 V = ::propagateMetadata(I, E->Scalars);
16220 GatherShuffleExtractSeq.insert(I);
16221 CSEBlocks.insert(I->getParent());
16222 }
16223 }
16224
16225 E->VectorizedValue = V;
16226 ++NumVectorInstructions;
16227
16228 return V;
16229 }
16230 default:
16231 llvm_unreachable("unknown inst");
16232 }
16233 return nullptr;
16234}
16235
16236Value *BoUpSLP::vectorizeTree() {
16237 ExtraValueToDebugLocsMap ExternallyUsedValues;
16238 return vectorizeTree(ExternallyUsedValues);
16239}
16240
16241Value *
16242BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16243 Instruction *ReductionRoot) {
16244 // All blocks must be scheduled before any instructions are inserted.
16245 for (auto &BSIter : BlocksSchedules) {
16246 scheduleBlock(BSIter.second.get());
16247 }
16248 // Clean the Entry-to-LastInstruction table. It can be invalidated by
16249 // scheduling, so it needs to be rebuilt.
16250 EntryToLastInstruction.clear();
16251
16252 if (ReductionRoot)
16253 Builder.SetInsertPoint(ReductionRoot->getParent(),
16254 ReductionRoot->getIterator());
16255 else
16256 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16257
16258 // Emit gathered loads first to emit better code for the users of those
16259 // gathered loads.
16260 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16261 if (GatheredLoadsEntriesFirst.has_value() &&
16262 TE->Idx >= *GatheredLoadsEntriesFirst &&
16263 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16264 assert((!TE->UserTreeIndices.empty() ||
16265 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16266 "Expected gathered load node.");
16267 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16268 }
16269 }
16270 // Postpone emission of PHI operands to avoid cyclic dependency issues.
16271 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16272 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16273 if (TE->State == TreeEntry::Vectorize &&
16274 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16275 TE->VectorizedValue)
16276 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16277 // Run through the list of postponed gathers and emit them, replacing the
16278 // temporarily emitted allocas with actual vector instructions.
16279 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16280 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16281 for (const TreeEntry *E : PostponedNodes) {
16282 auto *TE = const_cast<TreeEntry *>(E);
16283 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16284 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16285 TE->UserTreeIndices.front().EdgeIdx)) &&
16286 VecTE->isSame(TE->Scalars))
16287 // Found a gather node which is exactly the same as one of the
16288 // vectorized nodes. It may happen after reordering.
16289 continue;
16290 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16291 TE->VectorizedValue = nullptr;
16292 auto *UserI =
16293 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16294 // If the user is a PHI node, its vector code has to be inserted right
16295 // before the block terminator. Since the node was delayed, there were some
16296 // unresolved dependencies at the moment the stub instruction was emitted.
16297 // If any of these dependencies turn out to be an operand of another PHI
16298 // coming from this same block, the position of the stub instruction becomes
16299 // invalid. This is because the source vector that is supposed to feed this
16300 // gather node was inserted at the end of the block [after the stub
16301 // instruction]. So we need to adjust the insertion point again to the end of the block.
16302 if (isa<PHINode>(UserI)) {
16303 // Insert before all users.
16304 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16305 for (User *U : PrevVec->users()) {
16306 if (U == UserI)
16307 continue;
16308 auto *UI = dyn_cast<Instruction>(U);
16309 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16310 continue;
16311 if (UI->comesBefore(InsertPt))
16312 InsertPt = UI;
16313 }
16314 Builder.SetInsertPoint(InsertPt);
16315 } else {
16316 Builder.SetInsertPoint(PrevVec);
16317 }
16318 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16319 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16320 if (auto *VecI = dyn_cast<Instruction>(Vec);
16321 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16322 Builder.GetInsertPoint()->comesBefore(VecI))
16323 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16324 Builder.GetInsertPoint());
16325 if (Vec->getType() != PrevVec->getType()) {
16326 assert(Vec->getType()->isIntOrIntVectorTy() &&
16327 PrevVec->getType()->isIntOrIntVectorTy() &&
16328 "Expected integer vector types only.");
16329 std::optional<bool> IsSigned;
16330 for (Value *V : TE->Scalars) {
16331 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16332 auto It = MinBWs.find(BaseTE);
16333 if (It != MinBWs.end()) {
16334 IsSigned = IsSigned.value_or(false) || It->second.second;
16335 if (*IsSigned)
16336 break;
16337 }
16338 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16339 auto It = MinBWs.find(MNTE);
16340 if (It != MinBWs.end()) {
16341 IsSigned = IsSigned.value_or(false) || It->second.second;
16342 if (*IsSigned)
16343 break;
16344 }
16345 }
16346 if (IsSigned.value_or(false))
16347 break;
16348 // Scan through gather nodes.
16349 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16350 auto It = MinBWs.find(BVE);
16351 if (It != MinBWs.end()) {
16352 IsSigned = IsSigned.value_or(false) || It->second.second;
16353 if (*IsSigned)
16354 break;
16355 }
16356 }
16357 if (IsSigned.value_or(false))
16358 break;
16359 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16360 IsSigned =
16361 IsSigned.value_or(false) ||
16362 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16363 continue;
16364 }
16365 if (IsSigned.value_or(false))
16366 break;
16367 }
16368 }
16369 if (IsSigned.value_or(false)) {
16370 // Final attempt - check user node.
16371 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16372 if (It != MinBWs.end())
16373 IsSigned = It->second.second;
16374 }
16375 assert(IsSigned &&
16376 "Expected user node or perfect diamond match in MinBWs.");
16377 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16378 }
16379 PrevVec->replaceAllUsesWith(Vec);
16380 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16381 // Replace the stub vector node if it was already used for one of the
16382 // buildvector nodes.
16383 auto It = PostponedValues.find(PrevVec);
16384 if (It != PostponedValues.end()) {
16385 for (TreeEntry *VTE : It->getSecond())
16386 VTE->VectorizedValue = Vec;
16387 }
16388 eraseInstruction(PrevVec);
16389 }
16390
16391 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16392 << " values .\n");
16393
16395 // Maps vector instruction to original insertelement instruction
16396 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16397 // Maps extract Scalar to the corresponding extractelement instruction in the
16398 // basic block. Only one extractelement per block should be emitted.
16399 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16400 ScalarToEEs;
16401 SmallDenseSet<Value *, 4> UsedInserts;
16403 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16404 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16405 // Extract all of the elements with the external uses.
16406 for (const auto &ExternalUse : ExternalUses) {
16407 Value *Scalar = ExternalUse.Scalar;
16408 llvm::User *User = ExternalUse.User;
16409
16410 // Skip users that we already RAUW. This happens when one instruction
16411 // has multiple uses of the same value.
16412 if (User && !is_contained(Scalar->users(), User))
16413 continue;
16414 TreeEntry *E = getTreeEntry(Scalar);
16415 assert(E && "Invalid scalar");
16416 assert(!E->isGather() && "Extracting from a gather list");
16417 // Non-instruction pointers are not deleted, just skip them.
16418 if (E->getOpcode() == Instruction::GetElementPtr &&
16419 !isa<GetElementPtrInst>(Scalar))
16420 continue;
16421
16422 Value *Vec = E->VectorizedValue;
16423 assert(Vec && "Can't find vectorizable value");
16424
16425 Value *Lane = Builder.getInt32(ExternalUse.Lane);
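// ExtractAndExtendIfNeeded emits (or reuses, at most one per basic block) an
// extractelement for this scalar and, if the scalar was operated on in a
// narrower type, widens the extracted value back to the original scalar type.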
16426 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16427 if (Scalar->getType() != Vec->getType()) {
16428 Value *Ex = nullptr;
16429 Value *ExV = nullptr;
16430 auto *Inst = dyn_cast<Instruction>(Scalar);
16431 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16432 auto It = ScalarToEEs.find(Scalar);
16433 if (It != ScalarToEEs.end()) {
16434 // No need to emit many extracts, just move the only one into the
16435 // current block.
16436 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16437 : Builder.GetInsertBlock());
16438 if (EEIt != It->second.end()) {
16439 Value *PrevV = EEIt->second.first;
16440 if (auto *I = dyn_cast<Instruction>(PrevV);
16441 I && !ReplaceInst &&
16442 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16443 Builder.GetInsertPoint()->comesBefore(I)) {
16444 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16445 Builder.GetInsertPoint());
16446 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16447 CI->moveAfter(I);
16448 }
16449 Ex = PrevV;
16450 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16451 }
16452 }
16453 if (!Ex) {
16454 // "Reuse" the existing extract to improve final codegen.
16455 if (ReplaceInst) {
16456            // Leave the instruction as is if extracting it is cheaper and all
16457            // operands are scalar.
16458 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16459 IgnoredExtracts.insert(EE);
16460 Ex = EE;
16461 } else {
16462 auto *CloneInst = Inst->clone();
16463 CloneInst->insertBefore(Inst);
16464 if (Inst->hasName())
16465 CloneInst->takeName(Inst);
16466 Ex = CloneInst;
16467 }
16468 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16469 ES && isa<Instruction>(Vec)) {
16470 Value *V = ES->getVectorOperand();
16471 auto *IVec = cast<Instruction>(Vec);
16472 if (const TreeEntry *ETE = getTreeEntry(V))
16473 V = ETE->VectorizedValue;
16474 if (auto *IV = dyn_cast<Instruction>(V);
16475 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16476 IV->comesBefore(IVec))
16477 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16478 else
16479 Ex = Builder.CreateExtractElement(Vec, Lane);
16480 } else if (auto *VecTy =
16481 dyn_cast<FixedVectorType>(Scalar->getType())) {
16482 assert(SLPReVec && "FixedVectorType is not expected.");
16483 unsigned VecTyNumElements = VecTy->getNumElements();
16484 // When REVEC is enabled, we need to extract a vector.
16485 // Note: The element size of Scalar may be different from the
16486 // element size of Vec.
16487 Ex = Builder.CreateExtractVector(
16489 VecTyNumElements),
16490 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
16491 } else {
16492 Ex = Builder.CreateExtractElement(Vec, Lane);
16493 }
16494            // If necessary, sign-extend or zero-extend the extracted value
16495            // to the larger scalar type.
16496 ExV = Ex;
16497 if (Scalar->getType() != Ex->getType())
16498 ExV = Builder.CreateIntCast(
16499 Ex, Scalar->getType(),
16500 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16501 auto *I = dyn_cast<Instruction>(Ex);
16502 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16503 : &F->getEntryBlock(),
16504 std::make_pair(Ex, ExV));
16505 }
16506 // The then branch of the previous if may produce constants, since 0
16507 // operand might be a constant.
16508 if (auto *ExI = dyn_cast<Instruction>(Ex);
16509 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16510 GatherShuffleExtractSeq.insert(ExI);
16511 CSEBlocks.insert(ExI->getParent());
16512 }
16513 return ExV;
16514 }
16515 assert(isa<FixedVectorType>(Scalar->getType()) &&
16516 isa<InsertElementInst>(Scalar) &&
16517 "In-tree scalar of vector type is not insertelement?");
16518 auto *IE = cast<InsertElementInst>(Scalar);
16519 VectorToInsertElement.try_emplace(Vec, IE);
16520 return Vec;
16521 };
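    // Editorial note (not part of the original source): ExtractAndExtendIfNeeded
    // reuses at most one extractelement per (scalar, basic block) pair via
    // ScalarToEEs, and only appends an IntCast when the extracted element type
    // differs from the scalar type (typically because the tree was vectorized
    // at a narrower MinBWs type). For example, extracting lane 2 of a <4 x i8>
    // vector for an i32 user yields one extractelement plus one sext/zext,
    // with the signedness picked by isKnownNonNegative.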
16522 // If User == nullptr, the Scalar remains as scalar in vectorized
16523 // instructions or is used as extra arg. Generate ExtractElement instruction
16524 // and update the record for this scalar in ExternallyUsedValues.
16525 if (!User) {
16526 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16527 continue;
16528 assert((ExternallyUsedValues.count(Scalar) ||
16529 Scalar->hasNUsesOrMore(UsesLimit) ||
16530 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16531 any_of(Scalar->users(),
16532 [&](llvm::User *U) {
16533 if (ExternalUsesAsOriginalScalar.contains(U))
16534 return true;
16535 TreeEntry *UseEntry = getTreeEntry(U);
16536 return UseEntry &&
16537 (UseEntry->State == TreeEntry::Vectorize ||
16538 UseEntry->State ==
16539 TreeEntry::StridedVectorize) &&
16540 (E->State == TreeEntry::Vectorize ||
16541 E->State == TreeEntry::StridedVectorize) &&
16542 doesInTreeUserNeedToExtract(
16543 Scalar, getRootEntryInstruction(*UseEntry),
16544 TLI, TTI);
16545 })) &&
16546 "Scalar with nullptr User must be registered in "
16547 "ExternallyUsedValues map or remain as scalar in vectorized "
16548 "instructions");
16549 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16550 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16551 if (PHI->getParent()->isLandingPad())
16552 Builder.SetInsertPoint(
16553 PHI->getParent(),
16554 std::next(
16555 PHI->getParent()->getLandingPadInst()->getIterator()));
16556 else
16557 Builder.SetInsertPoint(PHI->getParent(),
16558 PHI->getParent()->getFirstNonPHIIt());
16559 } else {
16560 Builder.SetInsertPoint(VecI->getParent(),
16561 std::next(VecI->getIterator()));
16562 }
16563 } else {
16564 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16565 }
16566 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16567 // Required to update internally referenced instructions.
16568 if (Scalar != NewInst) {
16569 assert((!isa<ExtractElementInst>(Scalar) ||
16570 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16571 "Extractelements should not be replaced.");
16572 Scalar->replaceAllUsesWith(NewInst);
16573 }
16574 continue;
16575 }
16576
16577 if (auto *VU = dyn_cast<InsertElementInst>(User);
16578 VU && VU->getOperand(1) == Scalar) {
16579 // Skip if the scalar is another vector op or Vec is not an instruction.
16580 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16581 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16582 if (!UsedInserts.insert(VU).second)
16583 continue;
16584 // Need to use original vector, if the root is truncated.
16585 auto BWIt = MinBWs.find(E);
16586 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16587 auto *ScalarTy = FTy->getElementType();
16588 auto Key = std::make_pair(Vec, ScalarTy);
16589 auto VecIt = VectorCasts.find(Key);
16590 if (VecIt == VectorCasts.end()) {
16591 IRBuilderBase::InsertPointGuard Guard(Builder);
16592 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16593 if (IVec->getParent()->isLandingPad())
16594 Builder.SetInsertPoint(IVec->getParent(),
16595 std::next(IVec->getParent()
16596 ->getLandingPadInst()
16597 ->getIterator()));
16598 else
16599 Builder.SetInsertPoint(
16600 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16601 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16602 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16603 }
16604 Vec = Builder.CreateIntCast(
16605 Vec,
16607 ScalarTy,
16608 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16609 BWIt->second.second);
16610 VectorCasts.try_emplace(Key, Vec);
16611 } else {
16612 Vec = VecIt->second;
16613 }
16614 }
16615
16616 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16617 if (InsertIdx) {
16618 auto *It = find_if(
16619 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16620 // Checks if 2 insertelements are from the same buildvector.
16621 InsertElementInst *VecInsert = Data.InsertElements.front();
16623 VU, VecInsert,
16624 [](InsertElementInst *II) { return II->getOperand(0); });
16625 });
16626 unsigned Idx = *InsertIdx;
16627 if (It == ShuffledInserts.end()) {
16628 (void)ShuffledInserts.emplace_back();
16629 It = std::next(ShuffledInserts.begin(),
16630 ShuffledInserts.size() - 1);
16631 }
16632 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16633 if (Mask.empty())
16634 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16635 Mask[Idx] = ExternalUse.Lane;
16636 It->InsertElements.push_back(cast<InsertElementInst>(User));
16637 continue;
16638 }
16639 }
16640 }
16641 }
16642
16643 // Generate extracts for out-of-tree users.
16644 // Find the insertion point for the extractelement lane.
16645 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16646 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16647 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16648 if (PH->getIncomingValue(I) == Scalar) {
16649 Instruction *IncomingTerminator =
16650 PH->getIncomingBlock(I)->getTerminator();
16651 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16652 Builder.SetInsertPoint(VecI->getParent(),
16653 std::next(VecI->getIterator()));
16654 } else {
16655 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16656 }
16657 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16658 PH->setOperand(I, NewInst);
16659 }
16660 }
16661 } else {
16662 Builder.SetInsertPoint(cast<Instruction>(User));
16663 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16664 User->replaceUsesOfWith(Scalar, NewInst);
16665 }
16666 } else {
16667 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16668 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16669 User->replaceUsesOfWith(Scalar, NewInst);
16670 }
16671
16672 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16673 }
16674
16675 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16676 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16677 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16678 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16679 for (int I = 0, E = Mask.size(); I < E; ++I) {
16680 if (Mask[I] < VF)
16681 CombinedMask1[I] = Mask[I];
16682 else
16683 CombinedMask2[I] = Mask[I] - VF;
16684 }
16685 ShuffleInstructionBuilder ShuffleBuilder(
16686 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16687 ShuffleBuilder.add(V1, CombinedMask1);
16688 if (V2)
16689 ShuffleBuilder.add(V2, CombinedMask2);
16690 return ShuffleBuilder.finalize({}, {}, {});
16691 };
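  // Editorial example (hypothetical masks, for illustration only): for two
  // 4-wide sources V1 and V2, CreateShuffle splits a combined mask
  // <0, 5, 1, 7> into
  //   CombinedMask1 = <0, poison, 1, poison>   ; indices <  VF select from V1
  //   CombinedMask2 = <poison, 1, poison, 3>   ; indices >= VF, rebased by -VF
  // before handing both halves to the ShuffleInstructionBuilder.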
16692
16693 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16694 bool ForSingleMask) {
16695 unsigned VF = Mask.size();
16696 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16697 if (VF != VecVF) {
16698 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16699 Vec = CreateShuffle(Vec, nullptr, Mask);
16700 return std::make_pair(Vec, true);
16701 }
16702 if (!ForSingleMask) {
16703 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16704 for (unsigned I = 0; I < VF; ++I) {
16705 if (Mask[I] != PoisonMaskElem)
16706 ResizeMask[Mask[I]] = Mask[I];
16707 }
16708 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16709 }
16710 }
16711
16712 return std::make_pair(Vec, false);
16713 };
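  // Editorial note: ResizeToVF above only rewrites Vec when its element count
  // differs from the mask size. E.g. for a 2-element Vec and the 4-element
  // mask <0, 1, poison, poison> (hypothetical values), the non-single-mask
  // path builds ResizeMask = <0, 1, poison, poison> and emits one widening
  // shuffle, so later shuffles can treat Vec as a 4-wide value.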
16714  // Perform shuffling of the vectorized tree entries for better handling of
16715 // external extracts.
16716 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16717 // Find the first and the last instruction in the list of insertelements.
16718 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16719 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16720 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16721 Builder.SetInsertPoint(LastInsert);
16722 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16723 Value *NewInst = performExtractsShuffleAction<Value>(
16724 MutableArrayRef(Vector.data(), Vector.size()),
16725 FirstInsert->getOperand(0),
16726 [](Value *Vec) {
16727 return cast<VectorType>(Vec->getType())
16728 ->getElementCount()
16729 .getKnownMinValue();
16730 },
16731 ResizeToVF,
16732 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16733 ArrayRef<Value *> Vals) {
16734 assert((Vals.size() == 1 || Vals.size() == 2) &&
16735 "Expected exactly 1 or 2 input values.");
16736 if (Vals.size() == 1) {
16737 // Do not create shuffle if the mask is a simple identity
16738 // non-resizing mask.
16739 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16740 ->getNumElements() ||
16741 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16742 return CreateShuffle(Vals.front(), nullptr, Mask);
16743 return Vals.front();
16744 }
16745 return CreateShuffle(Vals.front() ? Vals.front()
16746 : FirstInsert->getOperand(0),
16747 Vals.back(), Mask);
16748 });
16749 auto It = ShuffledInserts[I].InsertElements.rbegin();
16750 // Rebuild buildvector chain.
16751 InsertElementInst *II = nullptr;
16752 if (It != ShuffledInserts[I].InsertElements.rend())
16753 II = *It;
16754    SmallVector<Instruction *> Inserts;
16755    while (It != ShuffledInserts[I].InsertElements.rend()) {
16756 assert(II && "Must be an insertelement instruction.");
16757 if (*It == II)
16758 ++It;
16759 else
16760 Inserts.push_back(cast<Instruction>(II));
16761 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16762 }
16763 for (Instruction *II : reverse(Inserts)) {
16764 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16765 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16766 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16767 II->moveAfter(NewI);
16768 NewInst = II;
16769 }
16770 LastInsert->replaceAllUsesWith(NewInst);
16771 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16772 IE->replaceUsesOfWith(IE->getOperand(0),
16773 PoisonValue::get(IE->getOperand(0)->getType()));
16774 IE->replaceUsesOfWith(IE->getOperand(1),
16775 PoisonValue::get(IE->getOperand(1)->getType()));
16776 eraseInstruction(IE);
16777 }
16778 CSEBlocks.insert(LastInsert->getParent());
16779 }
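  // Editorial sketch of the loop above (assumed shapes, not from a real test):
  // for a buildvector chain
  //   %i0 = insertelement <4 x i32> poison, i32 %a, i32 0
  //   %i1 = insertelement <4 x i32> %i0,    i32 %b, i32 1
  // whose scalars were vectorized, users of the last insert are redirected to
  // the shuffled vectorized value, inserts that are not part of the vectorized
  // group are re-chained on top of it, and the dead inserts get poison
  // operands and are erased.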
16780
16781 SmallVector<Instruction *> RemovedInsts;
16782 // For each vectorized value:
16783 for (auto &TEPtr : VectorizableTree) {
16784 TreeEntry *Entry = TEPtr.get();
16785
16786 // No need to handle users of gathered values.
16787 if (Entry->isGather())
16788 continue;
16789
16790 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16791
16792 // For each lane:
16793 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16794 Value *Scalar = Entry->Scalars[Lane];
16795
16796 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16797 !isa<GetElementPtrInst>(Scalar))
16798 continue;
16799 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16800 EE && IgnoredExtracts.contains(EE))
16801 continue;
16802 if (isa<PoisonValue>(Scalar))
16803 continue;
16804#ifndef NDEBUG
16805 Type *Ty = Scalar->getType();
16806 if (!Ty->isVoidTy()) {
16807 for (User *U : Scalar->users()) {
16808 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16809
16810 // It is legal to delete users in the ignorelist.
16811 assert((getTreeEntry(U) ||
16812 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16813 (isa_and_nonnull<Instruction>(U) &&
16814 isDeleted(cast<Instruction>(U)))) &&
16815 "Deleting out-of-tree value");
16816 }
16817 }
16818#endif
16819 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16820 auto *I = cast<Instruction>(Scalar);
16821 RemovedInsts.push_back(I);
16822 }
16823 }
16824
16825 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16826 // new vector instruction.
16827 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16828 V->mergeDIAssignID(RemovedInsts);
16829
16830 // Clear up reduction references, if any.
16831 if (UserIgnoreList) {
16832 for (Instruction *I : RemovedInsts) {
16833 const TreeEntry *IE = getTreeEntry(I);
16834 if (IE->Idx != 0 &&
16835 !(VectorizableTree.front()->isGather() &&
16836 !IE->UserTreeIndices.empty() &&
16837 (ValueToGatherNodes.lookup(I).contains(
16838 VectorizableTree.front().get()) ||
16839 any_of(IE->UserTreeIndices,
16840 [&](const EdgeInfo &EI) {
16841 return EI.UserTE == VectorizableTree.front().get() &&
16842 EI.EdgeIdx == UINT_MAX;
16843 }))) &&
16844 !(GatheredLoadsEntriesFirst.has_value() &&
16845 IE->Idx >= *GatheredLoadsEntriesFirst &&
16846 VectorizableTree.front()->isGather() &&
16847 is_contained(VectorizableTree.front()->Scalars, I)))
16848 continue;
16849 SmallVector<SelectInst *> LogicalOpSelects;
16850 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16851 // Do not replace condition of the logical op in form select <cond>.
16852 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16853 (match(U.getUser(), m_LogicalAnd()) ||
16854 match(U.getUser(), m_LogicalOr())) &&
16855 U.getOperandNo() == 0;
16856 if (IsPoisoningLogicalOp) {
16857 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16858 return false;
16859 }
16860 return UserIgnoreList->contains(U.getUser());
16861 });
16862 // Replace conditions of the poisoning logical ops with the non-poison
16863 // constant value.
16864 for (SelectInst *SI : LogicalOpSelects)
16865 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16866 }
16867 }
16868 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16869 // cache correctness.
16870 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
16871 // - instructions are not deleted until later.
16872 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16873
16874 Builder.ClearInsertionPoint();
16875 InstrElementSize.clear();
16876
16877 const TreeEntry &RootTE = *VectorizableTree.front();
16878 Value *Vec = RootTE.VectorizedValue;
16879 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16880 It != MinBWs.end() &&
16881 ReductionBitWidth != It->second.first) {
16882 IRBuilder<>::InsertPointGuard Guard(Builder);
16883 Builder.SetInsertPoint(ReductionRoot->getParent(),
16884 ReductionRoot->getIterator());
16885 Vec = Builder.CreateIntCast(
16886 Vec,
16887 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16888 cast<VectorType>(Vec->getType())->getElementCount()),
16889 It->second.second);
16890 }
16891 return Vec;
16892}
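// Editorial note: the final block above re-casts the root vector when the
// root node was analyzed at a MinBWs width different from ReductionBitWidth;
// e.g. a root materialized as <4 x i8> may be sign- or zero-extended to
// <4 x i16> right before the reduction root, with the signedness taken from
// the MinBWs entry.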
16893
16894void BoUpSLP::optimizeGatherSequence() {
16895  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16896 << " gather sequences instructions.\n");
16897 // LICM InsertElementInst sequences.
16898 for (Instruction *I : GatherShuffleExtractSeq) {
16899 if (isDeleted(I))
16900 continue;
16901
16902 // Check if this block is inside a loop.
16903 Loop *L = LI->getLoopFor(I->getParent());
16904 if (!L)
16905 continue;
16906
16907 // Check if it has a preheader.
16908 BasicBlock *PreHeader = L->getLoopPreheader();
16909 if (!PreHeader)
16910 continue;
16911
16912 // If the vector or the element that we insert into it are
16913 // instructions that are defined in this basic block then we can't
16914 // hoist this instruction.
16915 if (any_of(I->operands(), [L](Value *V) {
16916 auto *OpI = dyn_cast<Instruction>(V);
16917 return OpI && L->contains(OpI);
16918 }))
16919 continue;
16920
16921 // We can hoist this instruction. Move it to the pre-header.
16922 I->moveBefore(PreHeader->getTerminator());
16923 CSEBlocks.insert(PreHeader);
16924 }
16925
16926 // Make a list of all reachable blocks in our CSE queue.
16927  SmallVector<const DomTreeNode *> CSEWorkList;
16928  CSEWorkList.reserve(CSEBlocks.size());
16929 for (BasicBlock *BB : CSEBlocks)
16930 if (DomTreeNode *N = DT->getNode(BB)) {
16932 CSEWorkList.push_back(N);
16933 }
16934
16935 // Sort blocks by domination. This ensures we visit a block after all blocks
16936 // dominating it are visited.
16937 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16938 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16939 "Different nodes should have different DFS numbers");
16940 return A->getDFSNumIn() < B->getDFSNumIn();
16941 });
16942
16943 // Less defined shuffles can be replaced by the more defined copies.
16944 // Between two shuffles one is less defined if it has the same vector operands
16945  // and its mask indices are the same as in the first one or are undefs. E.g.
16946 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
16947 // poison, <0, 0, 0, 0>.
16948 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
16949 SmallVectorImpl<int> &NewMask) {
16950 if (I1->getType() != I2->getType())
16951 return false;
16952 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
16953 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
16954 if (!SI1 || !SI2)
16955 return I1->isIdenticalTo(I2);
16956 if (SI1->isIdenticalTo(SI2))
16957 return true;
16958 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
16959 if (SI1->getOperand(I) != SI2->getOperand(I))
16960 return false;
16961 // Check if the second instruction is more defined than the first one.
16962 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
16963 ArrayRef<int> SM1 = SI1->getShuffleMask();
16964 // Count trailing undefs in the mask to check the final number of used
16965 // registers.
16966 unsigned LastUndefsCnt = 0;
16967 for (int I = 0, E = NewMask.size(); I < E; ++I) {
16968 if (SM1[I] == PoisonMaskElem)
16969 ++LastUndefsCnt;
16970 else
16971 LastUndefsCnt = 0;
16972 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
16973 NewMask[I] != SM1[I])
16974 return false;
16975 if (NewMask[I] == PoisonMaskElem)
16976 NewMask[I] = SM1[I];
16977 }
16978 // Check if the last undefs actually change the final number of used vector
16979 // registers.
16980 return SM1.size() - LastUndefsCnt > 1 &&
16981 TTI->getNumberOfParts(SI1->getType()) ==
16983 getWidenedType(SI1->getType()->getElementType(),
16984 SM1.size() - LastUndefsCnt));
16985 };
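  // Editorial example (hypothetical shuffles): with identical operands,
  //   SI1: shufflevector %v, poison, <0, undef, 2, undef>
  //   SI2: shufflevector %v, poison, <0, 1, 2, undef>
  // SI1 is identical-or-less-defined than SI2, and NewMask becomes
  // <0, 1, 2, undef> (poison lanes of SI2's mask filled in from SI1's),
  // provided the trailing undefs do not change the number of vector registers
  // used.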
16986 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
16987 // instructions. TODO: We can further optimize this scan if we split the
16988 // instructions into different buckets based on the insert lane.
16989  SmallVector<Instruction *> Visited;
16990  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
16991 assert(*I &&
16992 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
16993 "Worklist not sorted properly!");
16994 BasicBlock *BB = (*I)->getBlock();
16995 // For all instructions in blocks containing gather sequences:
16996 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
16997 if (isDeleted(&In))
16998 continue;
16999 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17000 !GatherShuffleExtractSeq.contains(&In))
17001 continue;
17002
17003 // Check if we can replace this instruction with any of the
17004 // visited instructions.
17005 bool Replaced = false;
17006 for (Instruction *&V : Visited) {
17007 SmallVector<int> NewMask;
17008 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17009 DT->dominates(V->getParent(), In.getParent())) {
17010 In.replaceAllUsesWith(V);
17011 eraseInstruction(&In);
17012 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17013 if (!NewMask.empty())
17014 SI->setShuffleMask(NewMask);
17015 Replaced = true;
17016 break;
17017 }
17018 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17019 GatherShuffleExtractSeq.contains(V) &&
17020 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17021 DT->dominates(In.getParent(), V->getParent())) {
17022 In.moveAfter(V);
17023 V->replaceAllUsesWith(&In);
17024          eraseInstruction(V);
17025          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17026 if (!NewMask.empty())
17027 SI->setShuffleMask(NewMask);
17028 V = &In;
17029 Replaced = true;
17030 break;
17031 }
17032 }
17033 if (!Replaced) {
17034 assert(!is_contained(Visited, &In));
17035 Visited.push_back(&In);
17036 }
17037 }
17038 }
17039 CSEBlocks.clear();
17040 GatherShuffleExtractSeq.clear();
17041}
17042
17043BoUpSLP::ScheduleData *
17044BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17045 ScheduleData *Bundle = nullptr;
17046 ScheduleData *PrevInBundle = nullptr;
17047 for (Value *V : VL) {
17048    if (doesNotNeedToBeScheduled(V))
17049      continue;
17050 ScheduleData *BundleMember = getScheduleData(V);
17051 assert(BundleMember &&
17052 "no ScheduleData for bundle member "
17053 "(maybe not in same basic block)");
17054 assert(BundleMember->isSchedulingEntity() &&
17055 "bundle member already part of other bundle");
17056 if (PrevInBundle) {
17057 PrevInBundle->NextInBundle = BundleMember;
17058 } else {
17059 Bundle = BundleMember;
17060 }
17061
17062 // Group the instructions to a bundle.
17063 BundleMember->FirstInBundle = Bundle;
17064 PrevInBundle = BundleMember;
17065 }
17066 assert(Bundle && "Failed to find schedule bundle");
17067 return Bundle;
17068}
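// Editorial sketch of the bundle layout built above (assuming a 3-wide bundle
// over instructions A, B and C):
//   A.FirstInBundle = A   A.NextInBundle = B
//   B.FirstInBundle = A   B.NextInBundle = C
//   C.FirstInBundle = A   C.NextInBundle = nullptr
// i.e. every member points back at the head, and the head acts as the single
// scheduling entity for the whole bundle.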
17069
17070// Groups the instructions to a bundle (which is then a single scheduling entity)
17071// and schedules instructions until the bundle gets ready.
17072std::optional<BoUpSLP::ScheduleData *>
17073BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17074 const InstructionsState &S) {
17075 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17076 // instructions.
17077 if (isa<PHINode>(S.getMainOp()) ||
17078      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
17079    return nullptr;
17080
17081 // Initialize the instruction bundle.
17082 Instruction *OldScheduleEnd = ScheduleEnd;
17083 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17084
17085 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17086 ScheduleData *Bundle) {
17087 // The scheduling region got new instructions at the lower end (or it is a
17088 // new region for the first bundle). This makes it necessary to
17089 // recalculate all dependencies.
17090 // It is seldom that this needs to be done a second time after adding the
17091 // initial bundle to the region.
17092 if (ScheduleEnd != OldScheduleEnd) {
17093 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17094 if (ScheduleData *SD = getScheduleData(I))
17095 SD->clearDependencies();
17096 ReSchedule = true;
17097 }
17098 if (Bundle) {
17099 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17100 << " in block " << BB->getName() << "\n");
17101 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17102 }
17103
17104 if (ReSchedule) {
17105 resetSchedule();
17106 initialFillReadyList(ReadyInsts);
17107 }
17108
17109 // Now try to schedule the new bundle or (if no bundle) just calculate
17110 // dependencies. As soon as the bundle is "ready" it means that there are no
17111    // cyclic dependencies and we can schedule it. Note that it's important that we
17112 // don't "schedule" the bundle yet (see cancelScheduling).
17113 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17114 !ReadyInsts.empty()) {
17115 ScheduleData *Picked = ReadyInsts.pop_back_val();
17116 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17117 "must be ready to schedule");
17118 schedule(Picked, ReadyInsts);
17119 }
17120 };
17121
17122 // Make sure that the scheduling region contains all
17123 // instructions of the bundle.
17124 for (Value *V : VL) {
17125    if (doesNotNeedToBeScheduled(V))
17126      continue;
17127 if (!extendSchedulingRegion(V, S)) {
17128      // If the scheduling region got new instructions at the lower end (or it
17129      // is a new region for the first bundle), it is necessary to recalculate
17130      // all dependencies.
17131      // Otherwise the compiler may crash trying to calculate dependencies
17132      // incorrectly and emit instructions in the wrong order at the actual
17133      // scheduling.
17134 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17135 return std::nullopt;
17136 }
17137 }
17138
17139 bool ReSchedule = false;
17140 for (Value *V : VL) {
17141    if (doesNotNeedToBeScheduled(V))
17142      continue;
17143 ScheduleData *BundleMember = getScheduleData(V);
17144 assert(BundleMember &&
17145 "no ScheduleData for bundle member (maybe not in same basic block)");
17146
17147 // Make sure we don't leave the pieces of the bundle in the ready list when
17148    // the whole bundle might not be ready.
17149 ReadyInsts.remove(BundleMember);
17150
17151 if (!BundleMember->IsScheduled)
17152 continue;
17153 // A bundle member was scheduled as single instruction before and now
17154 // needs to be scheduled as part of the bundle. We just get rid of the
17155 // existing schedule.
17156 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17157 << " was already scheduled\n");
17158 ReSchedule = true;
17159 }
17160
17161 auto *Bundle = buildBundle(VL);
17162 TryScheduleBundleImpl(ReSchedule, Bundle);
17163 if (!Bundle->isReady()) {
17164 cancelScheduling(VL, S.getMainOp());
17165 return std::nullopt;
17166 }
17167 return Bundle;
17168}
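// Editorial note: callers of tryScheduleBundle see three outcomes: a null
// ScheduleData* when the bundle does not need scheduling at all (PHIs and
// vector-like instructions with constant operands), std::nullopt when the
// scheduling region could not be extended or the bundle never became ready
// (scheduling is cancelled), and a non-null bundle pointer on success.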
17169
17170void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17171 Value *OpValue) {
17172 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17173      doesNotNeedToSchedule(VL))
17174    return;
17175
17176 if (doesNotNeedToBeScheduled(OpValue))
17177 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17178 ScheduleData *Bundle = getScheduleData(OpValue);
17179 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17180 assert(!Bundle->IsScheduled &&
17181 "Can't cancel bundle which is already scheduled");
17182 assert(Bundle->isSchedulingEntity() &&
17183 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17184 "tried to unbundle something which is not a bundle");
17185
17186 // Remove the bundle from the ready list.
17187 if (Bundle->isReady())
17188 ReadyInsts.remove(Bundle);
17189
17190 // Un-bundle: make single instructions out of the bundle.
17191 ScheduleData *BundleMember = Bundle;
17192 while (BundleMember) {
17193 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17194 BundleMember->FirstInBundle = BundleMember;
17195 ScheduleData *Next = BundleMember->NextInBundle;
17196 BundleMember->NextInBundle = nullptr;
17197 BundleMember->TE = nullptr;
17198 if (BundleMember->unscheduledDepsInBundle() == 0) {
17199 ReadyInsts.insert(BundleMember);
17200 }
17201 BundleMember = Next;
17202 }
17203}
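// Editorial note: un-bundling in cancelScheduling restores every member to a
// standalone scheduling entity (FirstInBundle pointing at itself, no
// NextInBundle, no TreeEntry) and re-inserts members with no remaining
// in-bundle unscheduled dependencies into the ready list, so a rejected
// bundle leaves the scheduler roughly as if the bundle had never been formed.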
17204
17205BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17206 // Allocate a new ScheduleData for the instruction.
17207 if (ChunkPos >= ChunkSize) {
17208 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17209 ChunkPos = 0;
17210 }
17211 return &(ScheduleDataChunks.back()[ChunkPos++]);
17212}
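// Editorial note: ScheduleData objects are carved out of fixed-size arrays
// (ScheduleDataChunks) instead of being allocated one by one, so pointers to
// them stay stable while the scheduling region grows; a fresh chunk is only
// allocated once ChunkPos reaches ChunkSize.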
17213
17214bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17215 Value *V, const InstructionsState &S) {
17216 Instruction *I = dyn_cast<Instruction>(V);
17217 assert(I && "bundle member must be an instruction");
17218 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17219         !doesNotNeedToBeScheduled(I) &&
17220         "phi nodes/insertelements/extractelements/extractvalues don't need to "
17221 "be scheduled");
17222 if (getScheduleData(I))
17223 return true;
17224 if (!ScheduleStart) {
17225 // It's the first instruction in the new region.
17226 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17227 ScheduleStart = I;
17228 ScheduleEnd = I->getNextNode();
17229 assert(ScheduleEnd && "tried to vectorize a terminator?");
17230 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17231 return true;
17232 }
17233 // Search up and down at the same time, because we don't know if the new
17234 // instruction is above or below the existing scheduling region.
17235  // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
17236  // counted against the budget. Otherwise debug info could affect codegen.
17238 ++ScheduleStart->getIterator().getReverse();
17239 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17240 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17241 BasicBlock::iterator LowerEnd = BB->end();
17242 auto IsAssumeLikeIntr = [](const Instruction &I) {
17243 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17244 return II->isAssumeLikeIntrinsic();
17245 return false;
17246 };
17247 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17248 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17249 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17250 &*DownIter != I) {
17251 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17252 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17253 return false;
17254 }
17255
17256 ++UpIter;
17257 ++DownIter;
17258
17259 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17260 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17261 }
17262 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17263 assert(I->getParent() == ScheduleStart->getParent() &&
17264 "Instruction is in wrong basic block.");
17265 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17266 ScheduleStart = I;
17267 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17268 << "\n");
17269 return true;
17270 }
17271 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17272 "Expected to reach top of the basic block or instruction down the "
17273 "lower end.");
17274 assert(I->getParent() == ScheduleEnd->getParent() &&
17275 "Instruction is in wrong basic block.");
17276 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17277 nullptr);
17278 ScheduleEnd = I->getNextNode();
17279 assert(ScheduleEnd && "tried to vectorize a terminator?");
17280 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17281 return true;
17282}
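// Editorial sketch: extendSchedulingRegion walks upwards and downwards from
// the current window at the same time, roughly
//   block top ... <-UpIter- [ScheduleStart ... ScheduleEnd) -DownIter-> ... block end
// and grows the window on whichever side reaches the requested instruction
// (or the block boundary) first, skipping assume-like intrinsics so that they
// do not count against ScheduleRegionSizeLimit.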
17283
17284void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17285 Instruction *ToI,
17286 ScheduleData *PrevLoadStore,
17287 ScheduleData *NextLoadStore) {
17288 ScheduleData *CurrentLoadStore = PrevLoadStore;
17289 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17290 // No need to allocate data for non-schedulable instructions.
17291    if (doesNotNeedToBeScheduled(I))
17292      continue;
17293 ScheduleData *SD = ScheduleDataMap.lookup(I);
17294 if (!SD) {
17295 SD = allocateScheduleDataChunks();
17296 ScheduleDataMap[I] = SD;
17297 }
17298 assert(!isInSchedulingRegion(SD) &&
17299 "new ScheduleData already in scheduling region");
17300 SD->init(SchedulingRegionID, I);
17301
17302 if (I->mayReadOrWriteMemory() &&
17303 (!isa<IntrinsicInst>(I) ||
17304 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17305 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17306 Intrinsic::pseudoprobe))) {
17307 // Update the linked list of memory accessing instructions.
17308 if (CurrentLoadStore) {
17309 CurrentLoadStore->NextLoadStore = SD;
17310 } else {
17311 FirstLoadStoreInRegion = SD;
17312 }
17313 CurrentLoadStore = SD;
17314 }
17315
17316 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17317 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17318 RegionHasStackSave = true;
17319 }
17320 if (NextLoadStore) {
17321 if (CurrentLoadStore)
17322 CurrentLoadStore->NextLoadStore = NextLoadStore;
17323 } else {
17324 LastLoadStoreInRegion = CurrentLoadStore;
17325 }
17326}
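// Editorial note: besides allocating ScheduleData, initScheduleData threads
// every memory-reading or memory-writing instruction (except llvm.sideeffect
// and llvm.pseudoprobe) onto the FirstLoadStoreInRegion/NextLoadStore chain
// that calculateDependencies later walks for memory dependencies, and records
// whether the region contains a stacksave/stackrestore in RegionHasStackSave.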
17327
17328void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17329 bool InsertInReadyList,
17330 BoUpSLP *SLP) {
17331 assert(SD->isSchedulingEntity());
17332
17333  SmallVector<ScheduleData *> WorkList;
17334  WorkList.push_back(SD);
17335
17336 while (!WorkList.empty()) {
17337 ScheduleData *SD = WorkList.pop_back_val();
17338 for (ScheduleData *BundleMember = SD; BundleMember;
17339 BundleMember = BundleMember->NextInBundle) {
17340 assert(isInSchedulingRegion(BundleMember));
17341 if (BundleMember->hasValidDependencies())
17342 continue;
17343
17344 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17345 << "\n");
17346 BundleMember->Dependencies = 0;
17347 BundleMember->resetUnscheduledDeps();
17348
17349 // Handle def-use chain dependencies.
17350 for (User *U : BundleMember->Inst->users()) {
17351 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17352 BundleMember->Dependencies++;
17353 ScheduleData *DestBundle = UseSD->FirstInBundle;
17354 if (!DestBundle->IsScheduled)
17355 BundleMember->incrementUnscheduledDeps(1);
17356 if (!DestBundle->hasValidDependencies())
17357 WorkList.push_back(DestBundle);
17358 }
17359 }
17360
17361 auto MakeControlDependent = [&](Instruction *I) {
17362 auto *DepDest = getScheduleData(I);
17363 assert(DepDest && "must be in schedule window");
17364 DepDest->ControlDependencies.push_back(BundleMember);
17365 BundleMember->Dependencies++;
17366 ScheduleData *DestBundle = DepDest->FirstInBundle;
17367 if (!DestBundle->IsScheduled)
17368 BundleMember->incrementUnscheduledDeps(1);
17369 if (!DestBundle->hasValidDependencies())
17370 WorkList.push_back(DestBundle);
17371 };
17372
17373 // Any instruction which isn't safe to speculate at the beginning of the
17374      // block is control dependent on any early exit or non-willreturn call
17375      // which precedes it.
17376 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17377 for (Instruction *I = BundleMember->Inst->getNextNode();
17378 I != ScheduleEnd; I = I->getNextNode()) {
17379 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17380 continue;
17381
17382 // Add the dependency
17383 MakeControlDependent(I);
17384
17385          if (!isGuaranteedToTransferExecutionToSuccessor(I))
17386            // Everything past here must be control dependent on I.
17387 break;
17388 }
17389 }
17390
17391 if (RegionHasStackSave) {
17392        // If we have an inalloca alloca instruction, it needs to be scheduled
17393        // after any preceding stacksave. We also need to prevent any alloca
17394        // from reordering above a preceding stackrestore.
17395 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17396 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17397 for (Instruction *I = BundleMember->Inst->getNextNode();
17398 I != ScheduleEnd; I = I->getNextNode()) {
17399 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17400 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17401 // Any allocas past here must be control dependent on I, and I
17402              // must be memory dependent on BundleMember->Inst.
17403 break;
17404
17405 if (!isa<AllocaInst>(I))
17406 continue;
17407
17408 // Add the dependency
17409 MakeControlDependent(I);
17410 }
17411 }
17412
17413        // In addition to the cases handled just above, we need to prevent
17414        // allocas and loads/stores from moving below a stacksave or a
17415        // stackrestore. Avoiding moving allocas below stackrestore is currently
17416        // thought to be conservative. Moving loads/stores below a stackrestore
17417 // can lead to incorrect code.
17418 if (isa<AllocaInst>(BundleMember->Inst) ||
17419 BundleMember->Inst->mayReadOrWriteMemory()) {
17420 for (Instruction *I = BundleMember->Inst->getNextNode();
17421 I != ScheduleEnd; I = I->getNextNode()) {
17422 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17423 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17424 continue;
17425
17426 // Add the dependency
17427 MakeControlDependent(I);
17428 break;
17429 }
17430 }
17431 }
17432
17433 // Handle the memory dependencies (if any).
17434 ScheduleData *DepDest = BundleMember->NextLoadStore;
17435 if (!DepDest)
17436 continue;
17437 Instruction *SrcInst = BundleMember->Inst;
17438 assert(SrcInst->mayReadOrWriteMemory() &&
17439             "NextLoadStore list for non-memory-affecting bundle?");
17440 MemoryLocation SrcLoc = getLocation(SrcInst);
17441 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17442 unsigned NumAliased = 0;
17443 unsigned DistToSrc = 1;
17444
17445 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17446 assert(isInSchedulingRegion(DepDest));
17447
17448 // We have two limits to reduce the complexity:
17449 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17450 // SLP->isAliased (which is the expensive part in this loop).
17451 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17452 // the whole loop (even if the loop is fast, it's quadratic).
17453 // It's important for the loop break condition (see below) to
17454 // check this limit even between two read-only instructions.
17455 if (DistToSrc >= MaxMemDepDistance ||
17456 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17457 (NumAliased >= AliasedCheckLimit ||
17458 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17459
17460 // We increment the counter only if the locations are aliased
17461 // (instead of counting all alias checks). This gives a better
17462 // balance between reduced runtime and accurate dependencies.
17463 NumAliased++;
17464
17465 DepDest->MemoryDependencies.push_back(BundleMember);
17466 BundleMember->Dependencies++;
17467 ScheduleData *DestBundle = DepDest->FirstInBundle;
17468 if (!DestBundle->IsScheduled) {
17469 BundleMember->incrementUnscheduledDeps(1);
17470 }
17471 if (!DestBundle->hasValidDependencies()) {
17472 WorkList.push_back(DestBundle);
17473 }
17474 }
17475
17476 // Example, explaining the loop break condition: Let's assume our
17477 // starting instruction is i0 and MaxMemDepDistance = 3.
17478 //
17479 // +--------v--v--v
17480 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17481 // +--------^--^--^
17482 //
17483 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17484 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17485 // Previously we already added dependencies from i3 to i6,i7,i8
17486 // (because of MaxMemDepDistance). As we added a dependency from
17487 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17488 // and we can abort this loop at i6.
17489 if (DistToSrc >= 2 * MaxMemDepDistance)
17490 break;
17491 DistToSrc++;
17492 }
17493 }
17494 if (InsertInReadyList && SD->isReady()) {
17495 ReadyInsts.insert(SD);
17496 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17497 << "\n");
17498 }
17499 }
17500}
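// Editorial summary of the dependency kinds recorded above: def-use edges via
// the users of each bundled instruction, control edges added through
// MakeControlDependent (instructions that may not transfer execution to their
// successor, stacksave/stackrestore versus allocas and memory accesses), and
// memory edges along the NextLoadStore chain, bounded by AliasedCheckLimit
// and MaxMemDepDistance.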
17501
17502void BoUpSLP::BlockScheduling::resetSchedule() {
17503 assert(ScheduleStart &&
17504 "tried to reset schedule on block which has not been scheduled");
17505 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17506 if (ScheduleData *SD = getScheduleData(I)) {
17507 assert(isInSchedulingRegion(SD) &&
17508 "ScheduleData not in scheduling region");
17509 SD->IsScheduled = false;
17510 SD->resetUnscheduledDeps();
17511 }
17512 }
17513 ReadyInsts.clear();
17514}
17515
17516void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17517 if (!BS->ScheduleStart)
17518 return;
17519
17520 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17521
17522 // A key point - if we got here, pre-scheduling was able to find a valid
17523 // scheduling of the sub-graph of the scheduling window which consists
17524 // of all vector bundles and their transitive users. As such, we do not
17525 // need to reschedule anything *outside of* that subgraph.
17526
17527 BS->resetSchedule();
17528
17529 // For the real scheduling we use a more sophisticated ready-list: it is
17530 // sorted by the original instruction location. This lets the final schedule
17531 // be as close as possible to the original instruction order.
17532 // WARNING: If changing this order causes a correctness issue, that means
17533 // there is some missing dependence edge in the schedule data graph.
17534 struct ScheduleDataCompare {
17535 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17536 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17537 }
17538 };
17539 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17540
17541 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17542 // and fill the ready-list with initial instructions.
17543 int Idx = 0;
17544 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17545 I = I->getNextNode()) {
17546 if (ScheduleData *SD = BS->getScheduleData(I)) {
17547 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17548 (void)SDTE;
17549      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17550              SD->isPartOfBundle() ==
17551 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17552 "scheduler and vectorizer bundle mismatch");
17553 SD->FirstInBundle->SchedulingPriority = Idx++;
17554
17555 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17556 BS->calculateDependencies(SD, false, this);
17557 }
17558 }
17559 BS->initialFillReadyList(ReadyInsts);
17560
17561 Instruction *LastScheduledInst = BS->ScheduleEnd;
17562
17563 // Do the "real" scheduling.
17564 while (!ReadyInsts.empty()) {
17565 ScheduleData *Picked = *ReadyInsts.begin();
17566 ReadyInsts.erase(ReadyInsts.begin());
17567
17568 // Move the scheduled instruction(s) to their dedicated places, if not
17569 // there yet.
17570 for (ScheduleData *BundleMember = Picked; BundleMember;
17571 BundleMember = BundleMember->NextInBundle) {
17572 Instruction *PickedInst = BundleMember->Inst;
17573 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17574 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17575 LastScheduledInst = PickedInst;
17576 }
17577
17578 BS->schedule(Picked, ReadyInsts);
17579 }
17580
17581 // Check that we didn't break any of our invariants.
17582#ifdef EXPENSIVE_CHECKS
17583 BS->verify();
17584#endif
17585
17586#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17587 // Check that all schedulable entities got scheduled
17588 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17589 ScheduleData *SD = BS->getScheduleData(I);
17590 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17591 assert(SD->IsScheduled && "must be scheduled at this point");
17592 }
17593#endif
17594
17595 // Avoid duplicate scheduling of the block.
17596 BS->ScheduleStart = nullptr;
17597}
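// Editorial note: the scheduling loop above physically reorders the block to
// match the chosen schedule; each picked bundle member is moved so it ends up
// immediately before the previously placed instruction (LastScheduledInst
// starts out as BS->ScheduleEnd), and instructions outside the scheduling
// region are never touched.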
17598
17599unsigned BoUpSLP::getVectorElementSize(Value *V) {
17600  // If V is a store, just return the width of the stored value (or value
17601 // truncated just before storing) without traversing the expression tree.
17602 // This is the common case.
17603 if (auto *Store = dyn_cast<StoreInst>(V))
17604 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17605
17606 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17607 return getVectorElementSize(IEI->getOperand(1));
17608
17609 auto E = InstrElementSize.find(V);
17610 if (E != InstrElementSize.end())
17611 return E->second;
17612
17613 // If V is not a store, we can traverse the expression tree to find loads
17614 // that feed it. The type of the loaded value may indicate a more suitable
17615 // width than V's type. We want to base the vector element size on the width
17616 // of memory operations where possible.
17619 if (auto *I = dyn_cast<Instruction>(V)) {
17620 Worklist.emplace_back(I, I->getParent(), 0);
17621 Visited.insert(I);
17622 }
17623
17624 // Traverse the expression tree in bottom-up order looking for loads. If we
17625 // encounter an instruction we don't yet handle, we give up.
17626 auto Width = 0u;
17627 Value *FirstNonBool = nullptr;
17628 while (!Worklist.empty()) {
17629 auto [I, Parent, Level] = Worklist.pop_back_val();
17630
17631 // We should only be looking at scalar instructions here. If the current
17632 // instruction has a vector type, skip.
17633 auto *Ty = I->getType();
17634 if (isa<VectorType>(Ty))
17635 continue;
17636 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17637 FirstNonBool = I;
17638 if (Level > RecursionMaxDepth)
17639 continue;
17640
17641    // If the current instruction is a load, update Width to reflect the
17642 // width of the loaded value.
17643 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17644 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17645
17646 // Otherwise, we need to visit the operands of the instruction. We only
17647 // handle the interesting cases from buildTree here. If an operand is an
17648 // instruction we haven't yet visited and from the same basic block as the
17649 // user or the use is a PHI node, we add it to the worklist.
17652 for (Use &U : I->operands()) {
17653 if (auto *J = dyn_cast<Instruction>(U.get()))
17654 if (Visited.insert(J).second &&
17655 (isa<PHINode>(I) || J->getParent() == Parent)) {
17656 Worklist.emplace_back(J, J->getParent(), Level + 1);
17657 continue;
17658 }
17659 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17660 FirstNonBool = U.get();
17661 }
17662 } else {
17663 break;
17664 }
17665 }
17666
17667 // If we didn't encounter a memory access in the expression tree, or if we
17668 // gave up for some reason, just return the width of V. Otherwise, return the
17669 // maximum width we found.
17670 if (!Width) {
17671 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17672 V = FirstNonBool;
17673 Width = DL->getTypeSizeInBits(V->getType());
17674 }
17675
17676 for (Instruction *I : Visited)
17677 InstrElementSize[I] = Width;
17678
17679 return Width;
17680}
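// Editorial example (hypothetical IR, for illustration only): given
//   %l = load i16, ptr %p
//   store i64 %v, ptr %q
// querying the store returns 64 straight from the stored value's type, while
// querying %l hits the load case of the walk above and returns 16; the result
// is then cached in InstrElementSize for every visited instruction.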
17681
17682bool BoUpSLP::collectValuesToDemote(
17683 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17685 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17686 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17687 // We can always demote constants.
17688 if (all_of(E.Scalars, IsaPred<Constant>))
17689 return true;
17690
17691 unsigned OrigBitWidth =
17692 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17693 if (OrigBitWidth == BitWidth) {
17694 MaxDepthLevel = 1;
17695 return true;
17696 }
17697
17698 // Check if the node was analyzed already and must keep its original bitwidth.
17699 if (NodesToKeepBWs.contains(E.Idx))
17700 return false;
17701
17702 // If the value is not a vectorized instruction in the expression and not used
17703 // by the insertelement instruction and not used in multiple vector nodes, it
17704 // cannot be demoted.
17705 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17706 if (isa<PoisonValue>(R))
17707 return false;
17708 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17709 });
17710 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17711 if (isa<PoisonValue>(V))
17712 return true;
17713 if (MultiNodeScalars.contains(V))
17714 return false;
17715    // For the last shuffle of sext/zext with many uses we need to check the
17716    // extra bit for unsigned values, otherwise we may have incorrect casting
17717    // for reused scalars.
17718 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17719 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17720 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17721 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17722 return true;
17723 }
17724 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17725 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17726 if (IsSignedNode)
17727 ++BitWidth1;
17728 if (auto *I = dyn_cast<Instruction>(V)) {
17729 APInt Mask = DB->getDemandedBits(I);
17730 unsigned BitWidth2 =
17731 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17732 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17733 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17734 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17735 break;
17736 BitWidth2 *= 2;
17737 }
17738 BitWidth1 = std::min(BitWidth1, BitWidth2);
17739 }
17740 BitWidth = std::max(BitWidth, BitWidth1);
17741 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17742 };
17743 using namespace std::placeholders;
17744 auto FinalAnalysis = [&]() {
17745 if (!IsProfitableToDemote)
17746 return false;
17747 bool Res = all_of(
17748 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17749 // Demote gathers.
17750 if (Res && E.isGather()) {
17751 // Check possible extractelement instructions bases and final vector
17752 // length.
17753 SmallPtrSet<Value *, 4> UniqueBases;
17754 for (Value *V : E.Scalars) {
17755 auto *EE = dyn_cast<ExtractElementInst>(V);
17756 if (!EE)
17757 continue;
17758 UniqueBases.insert(EE->getVectorOperand());
17759 }
17760 const unsigned VF = E.Scalars.size();
17761 Type *OrigScalarTy = E.Scalars.front()->getType();
17762 if (UniqueBases.size() <= 2 ||
17763 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17765 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17766 ToDemote.push_back(E.Idx);
17767 }
17768 return Res;
17769 };
17770 if (E.isGather() || !Visited.insert(&E).second ||
17771 any_of(E.Scalars, [&](Value *V) {
17772 return all_of(V->users(), [&](User *U) {
17773 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17774 });
17775 }))
17776 return FinalAnalysis();
17777
17778 if (any_of(E.Scalars, [&](Value *V) {
17779 return !all_of(V->users(), [=](User *U) {
17780 return getTreeEntry(U) ||
17781 (E.Idx == 0 && UserIgnoreList &&
17782 UserIgnoreList->contains(U)) ||
17783 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17784 !U->getType()->isScalableTy() &&
17785 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17786 }) && !IsPotentiallyTruncated(V, BitWidth);
17787 }))
17788 return false;
17789
17790 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17791 bool &NeedToExit) {
17792 NeedToExit = false;
17793 unsigned InitLevel = MaxDepthLevel;
17794 for (const TreeEntry *Op : Operands) {
17795 unsigned Level = InitLevel;
17796 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17797 ToDemote, Visited, NodesToKeepBWs, Level,
17798 IsProfitableToDemote, IsTruncRoot)) {
17799 if (!IsProfitableToDemote)
17800 return false;
17801 NeedToExit = true;
17802 if (!FinalAnalysis())
17803 return false;
17804 continue;
17805 }
17806 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17807 }
17808 return true;
17809 };
17810 auto AttemptCheckBitwidth =
17811 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17812 // Try all bitwidth < OrigBitWidth.
17813 NeedToExit = false;
17814 unsigned BestFailBitwidth = 0;
17815 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17816 if (Checker(BitWidth, OrigBitWidth))
17817 return true;
17818 if (BestFailBitwidth == 0 && FinalAnalysis())
17819 BestFailBitwidth = BitWidth;
17820 }
17821 if (BitWidth >= OrigBitWidth) {
17822 if (BestFailBitwidth == 0) {
17823 BitWidth = OrigBitWidth;
17824 return false;
17825 }
17826 MaxDepthLevel = 1;
17827 BitWidth = BestFailBitwidth;
17828 NeedToExit = true;
17829 return true;
17830 }
17831 return false;
17832 };
17833 auto TryProcessInstruction =
17834 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17835 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17836 if (Operands.empty()) {
17837 if (!IsTruncRoot)
17838 MaxDepthLevel = 1;
17839 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17840 std::ref(BitWidth)));
17841 } else {
17842 // Several vectorized uses? Check if we can truncate it, otherwise -
17843 // exit.
17844 if (E.UserTreeIndices.size() > 1 &&
17845 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17846 std::ref(BitWidth))))
17847 return false;
17848 bool NeedToExit = false;
17849 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17850 return false;
17851 if (NeedToExit)
17852 return true;
17853 if (!ProcessOperands(Operands, NeedToExit))
17854 return false;
17855 if (NeedToExit)
17856 return true;
17857 }
17858
17859 ++MaxDepthLevel;
17860 // Record the entry that we can demote.
17861 ToDemote.push_back(E.Idx);
17862 return IsProfitableToDemote;
17863 };
17864 switch (E.getOpcode()) {
17865
17866 // We can always demote truncations and extensions. Since truncations can
17867 // seed additional demotion, we save the truncated value.
17868 case Instruction::Trunc:
17869 if (IsProfitableToDemoteRoot)
17870 IsProfitableToDemote = true;
17871 return TryProcessInstruction(BitWidth);
17872 case Instruction::ZExt:
17873 case Instruction::SExt:
17874 IsProfitableToDemote = true;
17875 return TryProcessInstruction(BitWidth);
17876
17877 // We can demote certain binary operations if we can demote both of their
17878 // operands.
17879 case Instruction::Add:
17880 case Instruction::Sub:
17881 case Instruction::Mul:
17882 case Instruction::And:
17883 case Instruction::Or:
17884 case Instruction::Xor: {
17885 return TryProcessInstruction(
17886 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17887 }
17888 case Instruction::Freeze:
17889 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17890 case Instruction::Shl: {
17891 // If we are truncating the result of this SHL, and if it's a shift of an
17892    // in-range amount, we can always perform a SHL in a smaller type.
17893 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17894 return all_of(E.Scalars, [&](Value *V) {
17895 if (isa<PoisonValue>(V))
17896 return true;
17897 auto *I = cast<Instruction>(V);
17898 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17899 return AmtKnownBits.getMaxValue().ult(BitWidth);
17900 });
17901 };
17902 return TryProcessInstruction(
17903 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17904 }
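  // Editorial example: for a scalar such as %s = shl i32 %x, 3 that the
  // bit-width analysis wants to narrow to i8, the checker above only has to
  // prove that the known shift amount (3) is strictly smaller than the target
  // bit width (8), i.e. AmtKnownBits.getMaxValue().ult(BitWidth), so the shl
  // itself is no obstacle to demotion.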
17905 case Instruction::LShr: {
17906 // If this is a truncate of a logical shr, we can truncate it to a smaller
17907 // lshr iff we know that the bits we would otherwise be shifting in are
17908 // already zeros.
17909 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17910 return all_of(E.Scalars, [&](Value *V) {
17911 if (isa<PoisonValue>(V))
17912 return true;
17913 auto *I = cast<Instruction>(V);
17914 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17915 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17916 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17917 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17918 SimplifyQuery(*DL));
17919 });
17920 };
17921 return TryProcessInstruction(
17922 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17923 LShrChecker);
17924 }
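  // Editorial example: %y = lshr i32 %x, 4 can only be narrowed to i8 if the
  // checker above proves both that the shift amount is smaller than 8 and
  // that bits [8, 32) of %x are already zero (MaskedValueIsZero on the
  // shifted-in range), so no set bits would be shifted into the narrow value.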
17925 case Instruction::AShr: {
17926 // If this is a truncate of an arithmetic shr, we can truncate it to a
17927 // smaller ashr iff we know that all the bits from the sign bit of the
17928 // original type and the sign bit of the truncate type are similar.
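// Illustrative example (not from the original source): truncating
//   %t = ashr i32 %x, 2
// to i16 is safe when the known shift amount is below 16 and %x has at least
// 17 sign bits (ShiftedBits = 32 - 16 = 16 < 17), since the bits shifted in
// then match the sign bit of the narrow value.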
17929 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17930 return all_of(E.Scalars, [&](Value *V) {
17931 if (isa<PoisonValue>(V))
17932 return true;
17933 auto *I = cast<Instruction>(V);
17934 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17935 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17936 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17937 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17938 nullptr, DT);
17939 });
17940 };
17941 return TryProcessInstruction(
17942 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17943 AShrChecker);
17944 }
17945 case Instruction::UDiv:
17946 case Instruction::URem: {
17947 // UDiv and URem can be truncated if all the truncated bits are zero.
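// Illustrative example (not from the original source): "udiv i32 %a, %b" can
// be performed as an i16 udiv when bits 16..31 of both %a and %b are known to
// be zero, since the result then fits in, and only depends on, the low bits.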
17948 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17949 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17950 return all_of(E.Scalars, [&](Value *V) {
17951 auto *I = cast<Instruction>(V);
17952 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17953 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
17954 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17955 });
17956 };
17957 return TryProcessInstruction(
17958 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
17959 }
17960
17961 // We can demote selects if we can demote their true and false values.
17962 case Instruction::Select: {
17963 return TryProcessInstruction(
17964 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
17965 }
17966
17967 // We can demote phis if we can demote all their incoming operands. Note that
17968 // we don't need to worry about cycles since we ensure single use above.
17969 case Instruction::PHI: {
17970 const unsigned NumOps = E.getNumOperands();
17971 SmallVector<const TreeEntry *> Ops(NumOps);
17972 transform(seq<unsigned>(0, NumOps), Ops.begin(),
17973 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
17974
17975 return TryProcessInstruction(BitWidth, Ops);
17976 }
17977
17978 case Instruction::Call: {
17979 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
17980 if (!IC)
17981 break;
17983 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
17984 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
17985 break;
17986 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
17987 function_ref<bool(unsigned, unsigned)> CallChecker;
17988 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17989 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17990 return all_of(E.Scalars, [&](Value *V) {
17991 auto *I = cast<Instruction>(V);
17992 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
17993 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17994 return MaskedValueIsZero(I->getOperand(0), Mask,
17995 SimplifyQuery(*DL)) &&
17996 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17997 }
17998 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
17999 "Expected min/max intrinsics only.");
18000 unsigned SignBits = OrigBitWidth - BitWidth;
18001 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18002 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18003 nullptr, DT);
18004 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18005 nullptr, DT);
18006 return SignBits <= Op0SignBits &&
18007 ((SignBits != Op0SignBits &&
18008 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18009 MaskedValueIsZero(I->getOperand(0), Mask,
18010 SimplifyQuery(*DL))) &&
18011 SignBits <= Op1SignBits &&
18012 ((SignBits != Op1SignBits &&
18013 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18014 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18015 });
18016 };
18017 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18018 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18019 return all_of(E.Scalars, [&](Value *V) {
18020 auto *I = cast<Instruction>(V);
18021 unsigned SignBits = OrigBitWidth - BitWidth;
18022 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18023 unsigned Op0SignBits =
18024 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18025 return SignBits <= Op0SignBits &&
18026 ((SignBits != Op0SignBits &&
18027 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18028 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18029 });
18030 };
18031 if (ID != Intrinsic::abs) {
18032 Operands.push_back(getOperandEntry(&E, 1));
18033 CallChecker = CompChecker;
18034 } else {
18035 CallChecker = AbsChecker;
18036 }
18037 InstructionCost BestCost =
18038 std::numeric_limits<InstructionCost::CostType>::max();
18039 unsigned BestBitWidth = BitWidth;
18040 unsigned VF = E.Scalars.size();
18041 // Choose the best bitwidth based on cost estimations.
18042 auto Checker = [&](unsigned BitWidth, unsigned) {
18043 unsigned MinBW = PowerOf2Ceil(BitWidth);
18044 SmallVector<Type *> ArgTys =
18045 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18046 auto VecCallCosts = getVectorCallCosts(
18047 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18048 TTI, TLI, ArgTys);
18049 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18050 if (Cost < BestCost) {
18051 BestCost = Cost;
18052 BestBitWidth = BitWidth;
18053 }
18054 return false;
18055 };
18056 [[maybe_unused]] bool NeedToExit;
18057 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18058 BitWidth = BestBitWidth;
18059 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18060 }
18061
18062 // Otherwise, conservatively give up.
18063 default:
18064 break;
18065 }
18066 MaxDepthLevel = 1;
18067 return FinalAnalysis();
18068}
18069
18070static RecurKind getRdxKind(Value *V);
18071
18072 void BoUpSLP::computeMinimumValueSizes() {
18073 // We only attempt to truncate integer expressions.
18074 bool IsStoreOrInsertElt =
18075 VectorizableTree.front()->getOpcode() == Instruction::Store ||
18076 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
18077 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18078 ExtraBitWidthNodes.size() <= 1 &&
18079 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18080 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18081 return;
18082
18083 unsigned NodeIdx = 0;
18084 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18085 NodeIdx = 1;
18086
18087 // Ensure the roots of the vectorizable tree don't form a cycle.
18088 if (VectorizableTree[NodeIdx]->isGather() ||
18089 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18090 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18091 [NodeIdx](const EdgeInfo &EI) {
18092 return EI.UserTE->Idx > NodeIdx;
18093 })))
18094 return;
18095
18096 // If the first value node for a store/insertelement is a sext/zext/trunc, skip
18097 // it and resize to the final type.
18098 bool IsTruncRoot = false;
18099 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18100 SmallVector<unsigned> RootDemotes;
18101 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18102 if (NodeIdx != 0 &&
18103 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18104 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18105 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18106 IsTruncRoot = true;
18107 RootDemotes.push_back(NodeIdx);
18108 IsProfitableToDemoteRoot = true;
18109 ++NodeIdx;
18110 }
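// Illustrative example (not from the original source): for a graph seeded by
//   store i16 %t, ptr %p   ; %t = trunc i32 %x to i16
// the trunc node becomes a demotion root recorded in RootDemotes, and the
// analysis continues from its operand node below.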
18111
18112 // If the reduction was already analyzed and found not profitable - exit.
18113 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18114 return;
18115
18116 SmallVector<unsigned> ToDemote;
18117 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
18118 bool IsProfitableToDemoteRoot, unsigned Opcode,
18119 unsigned Limit, bool IsTruncRoot,
18120 bool IsSignedCmp) -> unsigned {
18121 ToDemote.clear();
18122 // If the root is a trunc and the next node is a gather/buildvector, keep the
18123 // trunc in scalars, which is free in most cases.
18124 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18125 !NodesToKeepBWs.contains(E.Idx) &&
18126 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18127 all_of(E.Scalars, [&](Value *V) {
18128 return V->hasOneUse() || isa<Constant>(V) ||
18129 (!V->hasNUsesOrMore(UsesLimit) &&
18130 none_of(V->users(), [&](User *U) {
18131 const TreeEntry *TE = getTreeEntry(U);
18132 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18133 if (TE == UserTE || !TE)
18134 return false;
18135 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18136 SelectInst>(U) ||
18137 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18138 SelectInst>(UserTE->getMainOp()))
18139 return true;
18140 unsigned UserTESz = DL->getTypeSizeInBits(
18141 UserTE->Scalars.front()->getType());
18142 auto It = MinBWs.find(TE);
18143 if (It != MinBWs.end() && It->second.first > UserTESz)
18144 return true;
18145 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18146 }));
18147 })) {
18148 ToDemote.push_back(E.Idx);
18149 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18150 auto It = MinBWs.find(UserTE);
18151 if (It != MinBWs.end())
18152 return It->second.first;
18153 unsigned MaxBitWidth =
18154 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18155 MaxBitWidth = bit_ceil(MaxBitWidth);
18156 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18157 MaxBitWidth = 8;
18158 return MaxBitWidth;
18159 }
18160
18161 unsigned VF = E.getVectorFactor();
18162 Type *ScalarTy = E.Scalars.front()->getType();
18163 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18164 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18165 if (!TreeRootIT || !Opcode)
18166 return 0u;
18167
18168 if (any_of(E.Scalars,
18169 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18170 return 0u;
18171
18172 unsigned NumParts = TTI->getNumberOfParts(
18173 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18174
18175 // The maximum bit width required to represent all the values that can be
18176 // demoted without loss of precision. It would be safe to truncate the roots
18177 // of the expression to this width.
18178 unsigned MaxBitWidth = 1u;
18179
18180 // True if the roots can be zero-extended back to their original type,
18181 // rather than sign-extended. We know that if the leading bits are not
18182 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18183 // True.
18184 // Determine if the sign bit of all the roots is known to be zero. If not,
18185 // IsKnownPositive is set to False.
18186 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18187 if (isa<PoisonValue>(R))
18188 return true;
18189 KnownBits Known = computeKnownBits(R, *DL);
18190 return Known.isNonNegative();
18191 });
18192
18193 // We first check if all the bits of the roots are demanded. If they're not,
18194 // we can truncate the roots to this narrower type.
18195 for (Value *Root : E.Scalars) {
18196 if (isa<PoisonValue>(Root))
18197 continue;
18198 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18199 TypeSize NumTypeBits =
18200 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18201 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18202 // If we can't prove that the sign bit is zero, we must add one to the
18203 // maximum bit width to account for the unknown sign bit. This preserves
18204 // the existing sign bit so we can safely sign-extend the root back to the
18205 // original type. Otherwise, if we know the sign bit is zero, we will
18206 // zero-extend the root instead.
18207 //
18208 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18209 // one to the maximum bit width will yield a larger-than-necessary
18210 // type. In general, we need to add an extra bit only if we can't
18211 // prove that the upper bit of the original type is equal to the
18212 // upper bit of the proposed smaller type. If these two bits are
18213 // the same (either zero or one) we know that sign-extending from
18214 // the smaller type will result in the same value. Here, since we
18215 // can't yet prove this, we are just making the proposed smaller
18216 // type larger to ensure correctness.
18217 if (!IsKnownPositive)
18218 ++BitWidth1;
18219
18220 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18221 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18222 MaxBitWidth =
18223 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18224 }
18225
18226 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18227 MaxBitWidth = 8;
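// Worked example (illustrative, not from the original source): for an i32 root
// with 25 known sign bits and an unknown sign bit, BitWidth1 = 32 - 25 + 1 = 8;
// if DemandedBits reports only the low 6 bits demanded, BitWidth2 = 6, so this
// root contributes min(8, 6) = 6 bits, which the clamp above raises to 8.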
18228
18229 // If the original type is large, but the reduced type does not improve the
18230 // register use - ignore it.
18231 if (NumParts > 1 &&
18232 NumParts ==
18233 TTI->getNumberOfParts(getWidenedType(
18234 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18235 return 0u;
18236
18237 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18238 Opcode == Instruction::SExt ||
18239 Opcode == Instruction::ZExt || NumParts > 1;
18240 // Conservatively determine if we can actually truncate the roots of the
18241 // expression. Collect the values that can be demoted in ToDemote and
18242 // additional roots that require investigating in Roots.
18244 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18245 bool NeedToDemote = IsProfitableToDemote;
18246
18247 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18248 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18249 NeedToDemote, IsTruncRoot) ||
18250 (MaxDepthLevel <= Limit &&
18251 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18252 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18253 DL->getTypeSizeInBits(TreeRootIT) /
18254 DL->getTypeSizeInBits(
18255 E.getMainOp()->getOperand(0)->getType()) >
18256 2)))))
18257 return 0u;
18258 // Round MaxBitWidth up to the next power-of-two.
18259 MaxBitWidth = bit_ceil(MaxBitWidth);
18260
18261 return MaxBitWidth;
18262 };
18263
18264 // If we can truncate the root, we must collect additional values that might
18265 // be demoted as a result. That is, those seeded by truncations we will
18266 // modify.
18267 // Add reduction ops sizes, if any.
18268 if (UserIgnoreList &&
18269 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18270 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18271 // x i1> to iN)).
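// Illustrative IR (not from the original source):
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// The reduced value only counts set mask bits, so a reduction bit width of 1
// is recorded here; extending the result back to i32 is handled separately.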
18272 if (all_of(*UserIgnoreList,
18273 [](Value *V) {
18274 return isa<PoisonValue>(V) ||
18275 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18276 }) &&
18277 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18278 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18279 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18280 Builder.getInt1Ty()) {
18281 ReductionBitWidth = 1;
18282 } else {
18283 for (Value *V : *UserIgnoreList) {
18284 if (isa<PoisonValue>(V))
18285 continue;
18286 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18287 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18288 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18289 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18290 ++BitWidth1;
18291 unsigned BitWidth2 = BitWidth1;
18293 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18294 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18295 }
18296 ReductionBitWidth =
18297 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18298 }
18299 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18300 ReductionBitWidth = 8;
18301
18302 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18303 }
18304 }
18305 bool IsTopRoot = NodeIdx == 0;
18306 while (NodeIdx < VectorizableTree.size() &&
18307 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18308 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18309 RootDemotes.push_back(NodeIdx);
18310 ++NodeIdx;
18311 IsTruncRoot = true;
18312 }
18313 bool IsSignedCmp = false;
18314 while (NodeIdx < VectorizableTree.size()) {
18315 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18316 unsigned Limit = 2;
18317 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
18318 if (IsTopRoot &&
18319 ReductionBitWidth ==
18320 DL->getTypeSizeInBits(
18321 VectorizableTree.front()->Scalars.front()->getType()))
18322 Limit = 3;
18323 unsigned MaxBitWidth = ComputeMaxBitWidth(
18324 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
18325 Limit, IsTruncRoot, IsSignedCmp);
18326 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18327 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18328 ReductionBitWidth = bit_ceil(MaxBitWidth);
18329 else if (MaxBitWidth == 0)
18330 ReductionBitWidth = 0;
18331 }
18332
18333 for (unsigned Idx : RootDemotes) {
18334 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18335 uint32_t OrigBitWidth =
18336 DL->getTypeSizeInBits(V->getType()->getScalarType());
18337 if (OrigBitWidth > MaxBitWidth) {
18338 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18339 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18340 }
18341 return false;
18342 }))
18343 ToDemote.push_back(Idx);
18344 }
18345 RootDemotes.clear();
18346 IsTopRoot = false;
18347 IsProfitableToDemoteRoot = true;
18348
18349 if (ExtraBitWidthNodes.empty()) {
18350 NodeIdx = VectorizableTree.size();
18351 } else {
18352 unsigned NewIdx = 0;
18353 do {
18354 NewIdx = *ExtraBitWidthNodes.begin();
18355 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18356 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18357 NodeIdx = NewIdx;
18358 IsTruncRoot =
18359 NodeIdx < VectorizableTree.size() &&
18360 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18361 [](const EdgeInfo &EI) {
18362 return EI.EdgeIdx == 0 &&
18363 EI.UserTE->getOpcode() == Instruction::Trunc &&
18364 !EI.UserTE->isAltShuffle();
18365 });
18366 IsSignedCmp =
18367 NodeIdx < VectorizableTree.size() &&
18368 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18369 [&](const EdgeInfo &EI) {
18370 return EI.UserTE->getOpcode() == Instruction::ICmp &&
18371 any_of(EI.UserTE->Scalars, [&](Value *V) {
18372 auto *IC = dyn_cast<ICmpInst>(V);
18373 return IC &&
18374 (IC->isSigned() ||
18375 !isKnownNonNegative(IC->getOperand(0),
18376 SimplifyQuery(*DL)) ||
18377 !isKnownNonNegative(IC->getOperand(1),
18378 SimplifyQuery(*DL)));
18379 });
18380 });
18381 }
18382
18383 // If the maximum bit width we compute is less than the width of the roots'
18384 // type, we can proceed with the narrowing. Otherwise, do nothing.
18385 if (MaxBitWidth == 0 ||
18386 MaxBitWidth >=
18387 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18388 ->getBitWidth()) {
18389 if (UserIgnoreList)
18390 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18391 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18392 continue;
18393 }
18394
18395 // Finally, map the values we can demote to the maximum bit width we
18396 // computed.
18397 for (unsigned Idx : ToDemote) {
18398 TreeEntry *TE = VectorizableTree[Idx].get();
18399 if (MinBWs.contains(TE))
18400 continue;
18401 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18402 if (isa<PoisonValue>(R))
18403 return false;
18404 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18405 });
18406 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18407 }
18408 }
18409}
18410
18411 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18412 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18413 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18414 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18415 auto *AA = &AM.getResult<AAManager>(F);
18416 auto *LI = &AM.getResult<LoopAnalysis>(F);
18417 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18418 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18419 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18420 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18421
18422 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18423 if (!Changed)
18424 return PreservedAnalyses::all();
18425
18426 PreservedAnalyses PA;
18427 PA.preserveSet<CFGAnalyses>();
18428 return PA;
18429}
18430
18431 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18432 TargetTransformInfo *TTI_,
18433 TargetLibraryInfo *TLI_, AAResults *AA_,
18434 LoopInfo *LI_, DominatorTree *DT_,
18435 AssumptionCache *AC_, DemandedBits *DB_,
18436 OptimizationRemarkEmitter *ORE_) {
18437 if (!RunSLPVectorization)
18438 return false;
18439 SE = SE_;
18440 TTI = TTI_;
18441 TLI = TLI_;
18442 AA = AA_;
18443 LI = LI_;
18444 DT = DT_;
18445 AC = AC_;
18446 DB = DB_;
18447 DL = &F.getDataLayout();
18448
18449 Stores.clear();
18450 GEPs.clear();
18451 bool Changed = false;
18452
18453 // If the target claims to have no vector registers don't attempt
18454 // vectorization.
18455 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18456 LLVM_DEBUG(
18457 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18458 return false;
18459 }
18460
18461 // Don't vectorize when the attribute NoImplicitFloat is used.
18462 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18463 return false;
18464
18465 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18466
18467 // Use the bottom up slp vectorizer to construct chains that start with
18468 // store instructions.
18469 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18470
18471 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18472 // delete instructions.
18473
18474 // Update DFS numbers now so that we can use them for ordering.
18475 DT->updateDFSNumbers();
18476
18477 // Scan the blocks in the function in post order.
18478 for (auto *BB : post_order(&F.getEntryBlock())) {
18479 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18480 continue;
18481
18482 // Start new block - clear the list of reduction roots.
18483 R.clearReductionData();
18484 collectSeedInstructions(BB);
18485
18486 // Vectorize trees that end at stores.
18487 if (!Stores.empty()) {
18488 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18489 << " underlying objects.\n");
18490 Changed |= vectorizeStoreChains(R);
18491 }
18492
18493 // Vectorize trees that end at reductions.
18494 Changed |= vectorizeChainsInBlock(BB, R);
18495
18496 // Vectorize the index computations of getelementptr instructions. This
18497 // is primarily intended to catch gather-like idioms ending at
18498 // non-consecutive loads.
18499 if (!GEPs.empty()) {
18500 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18501 << " underlying objects.\n");
18502 Changed |= vectorizeGEPIndices(BB, R);
18503 }
18504 }
18505
18506 if (Changed) {
18507 R.optimizeGatherSequence();
18508 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18509 }
18510 return Changed;
18511}
18512
18513std::optional<bool>
18514SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18515 unsigned Idx, unsigned MinVF,
18516 unsigned &Size) {
18517 Size = 0;
18518 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18519 << "\n");
18520 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18521 unsigned VF = Chain.size();
18522
18523 if (!has_single_bit(Sz) ||
18524 !hasFullVectorsOrPowerOf2(
18525 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18526 VF) ||
18527 VF < 2 || VF < MinVF) {
18528 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18529 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18530 // all vector lanes are used.
18531 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18532 return false;
18533 }
18534
18535 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18536 << "\n");
18537
18538 SetVector<Value *> ValOps;
18539 for (Value *V : Chain)
18540 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18541 // If the operands do not share the same/alternate opcode, or the number of unique values is not a power of 2 - exit.
18542 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18543 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18544 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18545 bool IsAllowedSize =
18546 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18547 ValOps.size()) ||
18548 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18549 if ((!IsAllowedSize && S.getOpcode() &&
18550 S.getOpcode() != Instruction::Load &&
18551 (!S.getMainOp()->isSafeToRemove() ||
18552 any_of(ValOps.getArrayRef(),
18553 [&](Value *V) {
18554 return !isa<ExtractElementInst>(V) &&
18555 (V->getNumUses() > Chain.size() ||
18556 any_of(V->users(), [&](User *U) {
18557 return !Stores.contains(U);
18558 }));
18559 }))) ||
18560 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
18561 Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
18562 return false;
18563 }
18564 }
18565 if (R.isLoadCombineCandidate(Chain))
18566 return true;
18567 R.buildTree(Chain);
18568 // Check if the tree is tiny and the store itself or its value is not vectorized.
18569 if (R.isTreeTinyAndNotFullyVectorizable()) {
18570 if (R.isGathered(Chain.front()) ||
18571 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18572 return std::nullopt;
18573 Size = R.getCanonicalGraphSize();
18574 return false;
18575 }
18576 R.reorderTopToBottom();
18577 R.reorderBottomToTop();
18578 R.transformNodes();
18579 R.buildExternalUses();
18580
18581 R.computeMinimumValueSizes();
18582
18583 Size = R.getCanonicalGraphSize();
18584 if (S.getOpcode() == Instruction::Load)
18585 Size = 2; // cut off masked gather small trees
18586 InstructionCost Cost = R.getTreeCost();
18587
18588 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
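// Note (assuming the default -slp-threshold of 0): the check below requires a
// strictly negative cost, i.e. the vectorized tree must be estimated as
// cheaper than the scalar code it replaces.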
18589 if (Cost < -SLPCostThreshold) {
18590 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18591
18592 using namespace ore;
18593
18594 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18595 cast<StoreInst>(Chain[0]))
18596 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18597 << " and with tree size "
18598 << NV("TreeSize", R.getTreeSize()));
18599
18600 R.vectorizeTree();
18601 return true;
18602 }
18603
18604 return false;
18605}
18606
18607/// Checks if the quadratic mean deviation is less than 90% of the mean size.
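/// Illustrative example (not from the original source): sizes {4, 4, 5} give
/// Mean = 4 and, with integer division, Dev = 1/3 = 0, so the check passes;
/// sizes {2, 8} give Mean = 5 and Dev = 9, and 9 * 81 / 25 != 0, so it fails.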
18608static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18609 bool First) {
18610 unsigned Num = 0;
18611 uint64_t Sum = std::accumulate(
18612 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18613 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18614 unsigned Size = First ? Val.first : Val.second;
18615 if (Size == 1)
18616 return V;
18617 ++Num;
18618 return V + Size;
18619 });
18620 if (Num == 0)
18621 return true;
18622 uint64_t Mean = Sum / Num;
18623 if (Mean == 0)
18624 return true;
18625 uint64_t Dev = std::accumulate(
18626 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18627 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18628 unsigned P = First ? Val.first : Val.second;
18629 if (P == 1)
18630 return V;
18631 return V + (P - Mean) * (P - Mean);
18632 }) /
18633 Num;
18634 return Dev * 81 / (Mean * Mean) == 0;
18635}
18636
18637bool SLPVectorizerPass::vectorizeStores(
18638 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18639 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18640 &Visited) {
18641 // We may run into multiple chains that merge into a single chain. We mark the
18642 // stores that we vectorized so that we don't visit the same store twice.
18643 BoUpSLP::ValueSet VectorizedStores;
18644 bool Changed = false;
18645
18646 struct StoreDistCompare {
18647 bool operator()(const std::pair<unsigned, int> &Op1,
18648 const std::pair<unsigned, int> &Op2) const {
18649 return Op1.second < Op2.second;
18650 }
18651 };
18652 // A set of pairs (index of store in Stores array ref, Distance of the store
18653 // address relative to base store address in units).
18654 using StoreIndexToDistSet =
18655 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18656 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18657 int PrevDist = -1;
18658 BoUpSLP::ValueList Operands;
18659 // Collect the chain into a list.
18660 for (auto [Idx, Data] : enumerate(Set)) {
18661 if (Operands.empty() || Data.second - PrevDist == 1) {
18662 Operands.push_back(Stores[Data.first]);
18663 PrevDist = Data.second;
18664 if (Idx != Set.size() - 1)
18665 continue;
18666 }
18667 auto E = make_scope_exit([&, &DataVar = Data]() {
18668 Operands.clear();
18669 Operands.push_back(Stores[DataVar.first]);
18670 PrevDist = DataVar.second;
18671 });
18672
18673 if (Operands.size() <= 1 ||
18674 !Visited
18675 .insert({Operands.front(),
18676 cast<StoreInst>(Operands.front())->getValueOperand(),
18677 Operands.back(),
18678 cast<StoreInst>(Operands.back())->getValueOperand(),
18679 Operands.size()})
18680 .second)
18681 continue;
18682
18683 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18684 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18685 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18686
18687 unsigned MaxVF =
18688 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18689 auto *Store = cast<StoreInst>(Operands[0]);
18690 Type *StoreTy = Store->getValueOperand()->getType();
18691 Type *ValueTy = StoreTy;
18692 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18693 ValueTy = Trunc->getSrcTy();
18694 unsigned MinVF = std::max<unsigned>(
18695 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18696 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18697 ValueTy)));
18698
18699 if (MaxVF < MinVF) {
18700 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18701 << ") < "
18702 << "MinVF (" << MinVF << ")\n");
18703 continue;
18704 }
18705
18706 unsigned NonPowerOf2VF = 0;
18707 if (VectorizeNonPowerOf2) {
18708 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18709 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18710 // lanes are used.
18711 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18712 if (has_single_bit(CandVF + 1)) {
18713 NonPowerOf2VF = CandVF;
18714 assert(NonPowerOf2VF != MaxVF &&
18715 "Non-power-of-2 VF should not be equal to MaxVF");
18716 }
18717 }
18718
18719 unsigned MaxRegVF = MaxVF;
18720 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18721 if (MaxVF < MinVF) {
18722 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18723 << ") < "
18724 << "MinVF (" << MinVF << ")\n");
18725 continue;
18726 }
18727
18728 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18729 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18730 unsigned Size = MinVF;
18731 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18732 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18733 Size *= 2;
18734 });
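// Illustrative example (not from the original source): with MinVF = 2 and
// MaxVF = 16 this fills CandidateVFs with {16, 8, 4, 2}; if a non-power-of-2
// VF such as 15 was selected above, it occupies the extra leading slot,
// giving {15, 16, 8, 4, 2}.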
18735 unsigned End = Operands.size();
18736 unsigned Repeat = 0;
18737 constexpr unsigned MaxAttempts = 4;
18739 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18740 P.first = P.second = 1;
18741 });
18743 auto IsNotVectorized = [](bool First,
18744 const std::pair<unsigned, unsigned> &P) {
18745 return First ? P.first > 0 : P.second > 0;
18746 };
18747 auto IsVectorized = [](bool First,
18748 const std::pair<unsigned, unsigned> &P) {
18749 return First ? P.first == 0 : P.second == 0;
18750 };
18751 auto VFIsProfitable = [](bool First, unsigned Size,
18752 const std::pair<unsigned, unsigned> &P) {
18753 return First ? Size >= P.first : Size >= P.second;
18754 };
18755 auto FirstSizeSame = [](unsigned Size,
18756 const std::pair<unsigned, unsigned> &P) {
18757 return Size == P.first;
18758 };
18759 while (true) {
18760 ++Repeat;
18761 bool RepeatChanged = false;
18762 bool AnyProfitableGraph = false;
18763 for (unsigned Size : CandidateVFs) {
18764 AnyProfitableGraph = false;
18765 unsigned StartIdx = std::distance(
18766 RangeSizes.begin(),
18767 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18768 std::placeholders::_1)));
18769 while (StartIdx < End) {
18770 unsigned EndIdx =
18771 std::distance(RangeSizes.begin(),
18772 find_if(RangeSizes.drop_front(StartIdx),
18773 std::bind(IsVectorized, Size >= MaxRegVF,
18774 std::placeholders::_1)));
18775 unsigned Sz = EndIdx >= End ? End : EndIdx;
18776 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18777 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18778 Size >= MaxRegVF)) {
18779 ++Cnt;
18780 continue;
18781 }
18782 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18783 assert(all_of(Slice,
18784 [&](Value *V) {
18785 return cast<StoreInst>(V)
18786 ->getValueOperand()
18787 ->getType() ==
18788 cast<StoreInst>(Slice.front())
18789 ->getValueOperand()
18790 ->getType();
18791 }) &&
18792 "Expected all operands of same type.");
18793 if (!NonSchedulable.empty()) {
18794 auto [NonSchedSizeMax, NonSchedSizeMin] =
18795 NonSchedulable.lookup(Slice.front());
18796 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18797 Cnt += NonSchedSizeMax;
18798 continue;
18799 }
18800 }
18801 unsigned TreeSize;
18802 std::optional<bool> Res =
18803 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18804 if (!Res) {
18805 NonSchedulable
18806 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18807 .first->getSecond()
18808 .second = Size;
18809 } else if (*Res) {
18810 // Mark the vectorized stores so that we don't vectorize them
18811 // again.
18812 VectorizedStores.insert(Slice.begin(), Slice.end());
18815 AnyProfitableGraph = RepeatChanged = Changed = true;
18816 // If we vectorized initial block, no need to try to vectorize
18817 // it again.
18818 for_each(RangeSizes.slice(Cnt, Size),
18819 [](std::pair<unsigned, unsigned> &P) {
18820 P.first = P.second = 0;
18821 });
18822 if (Cnt < StartIdx + MinVF) {
18823 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18824 [](std::pair<unsigned, unsigned> &P) {
18825 P.first = P.second = 0;
18826 });
18827 StartIdx = Cnt + Size;
18828 }
18829 if (Cnt > Sz - Size - MinVF) {
18830 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18831 [](std::pair<unsigned, unsigned> &P) {
18832 P.first = P.second = 0;
18833 });
18834 if (Sz == End)
18835 End = Cnt;
18836 Sz = Cnt;
18837 }
18838 Cnt += Size;
18839 continue;
18840 }
18841 if (Size > 2 && Res &&
18842 !all_of(RangeSizes.slice(Cnt, Size),
18843 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18844 std::placeholders::_1))) {
18845 Cnt += Size;
18846 continue;
18847 }
18848 // For very big VFs, check that we are not rebuilding the same trees,
18849 // just with a larger number of elements.
18850 if (Size > MaxRegVF && TreeSize > 1 &&
18851 all_of(RangeSizes.slice(Cnt, Size),
18852 std::bind(FirstSizeSame, TreeSize,
18853 std::placeholders::_1))) {
18854 Cnt += Size;
18855 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18856 ++Cnt;
18857 continue;
18858 }
18859 if (TreeSize > 1)
18860 for_each(RangeSizes.slice(Cnt, Size),
18861 [&](std::pair<unsigned, unsigned> &P) {
18862 if (Size >= MaxRegVF)
18863 P.second = std::max(P.second, TreeSize);
18864 else
18865 P.first = std::max(P.first, TreeSize);
18866 });
18867 ++Cnt;
18868 AnyProfitableGraph = true;
18869 }
18870 if (StartIdx >= End)
18871 break;
18872 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18873 AnyProfitableGraph = true;
18874 StartIdx = std::distance(
18875 RangeSizes.begin(),
18876 find_if(RangeSizes.drop_front(Sz),
18877 std::bind(IsNotVectorized, Size >= MaxRegVF,
18878 std::placeholders::_1)));
18879 }
18880 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18881 break;
18882 }
18883 // All values vectorized - exit.
18884 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18885 return P.first == 0 && P.second == 0;
18886 }))
18887 break;
18888 // Check if we have exhausted all attempts, or if no further attempts are needed at all.
18889 if (Repeat >= MaxAttempts ||
18890 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18891 break;
18892 constexpr unsigned StoresLimit = 64;
18893 const unsigned MaxTotalNum = std::min<unsigned>(
18894 Operands.size(),
18895 static_cast<unsigned>(
18896 End -
18897 std::distance(
18898 RangeSizes.begin(),
18899 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18900 std::placeholders::_1))) +
18901 1));
18902 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18903 unsigned Limit =
18904 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18905 CandidateVFs.clear();
18906 if (bit_floor(Limit) == VF)
18907 CandidateVFs.push_back(Limit);
18908 if (VF > MaxTotalNum || VF >= StoresLimit)
18909 break;
18910 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18911 if (P.first != 0)
18912 P.first = std::max(P.second, P.first);
18913 });
18914 // Make a last attempt to vectorize the maximum number of elements, if all
18915 // previous attempts were unsuccessful because of cost issues.
18916 CandidateVFs.push_back(VF);
18917 }
18918 }
18919 };
18920
18921 // Stores a pair (first: index of the store in the Stores array ref, the
18922 // address of which is taken as the base; second: a sorted set of pairs {index,
18923 // dist}, which are indices of stores in the set and their store location
18924 // distances relative to the base address).
18925
18926 // Need to store the index of the very first store separately, since the set
18927 // may be reordered after the insertion and the first store may be moved. This
18928 // container allows us to reduce the number of calls to the getPointersDiff() function.
18929 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
18930 // Inserts the specified store SI with the given index Idx to the set of the
18931 // stores. If the store with the same distance is found already - stop
18932 // insertion, try to vectorize already found stores. If some stores from this
18933 // sequence were not vectorized - try to vectorize them with the new store
18934 // later. But this logic is applied only to the stores that come before the
18935 // previous store with the same distance.
18936 // Example:
18937 // 1. store x, %p
18938 // 2. store y, %p+1
18939 // 3. store z, %p+2
18940 // 4. store a, %p
18941 // 5. store b, %p+3
18942 // - Scan this from the last to first store. The very first bunch of stores is
18943 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
18944 // vector).
18945 // - The next store in the list - #1 - has the same distance from store #5 as
18946 // the store #4.
18947 // - Try to vectorize sequence of stores 4,2,3,5.
18948 // - If all these stores are vectorized - just drop them.
18949 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
18950 // - Start new stores sequence.
18951 // The new bunch of stores is {1, {1, 0}}.
18952 // - Add the stores from previous sequence, that were not vectorized.
18953 // Here we consider the stores in reversed order, rather than the order in which
18954 // they are used in the IR (Stores are reversed already, see the vectorizeStoreChains() function).
18955 // Store #3 can be added -> comes after store #4 with the same distance as
18956 // store #1.
18957 // Store #5 cannot be added - comes before store #4.
18958 // This logic helps to improve compile time: we assume that the stores after a
18959 // previous store with the same distance most likely have memory dependencies,
18960 // so there is no need to waste compile time trying to vectorize them.
18961 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
18962 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
18963 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
18964 std::optional<int> Diff = getPointersDiff(
18965 Stores[Set.first]->getValueOperand()->getType(),
18966 Stores[Set.first]->getPointerOperand(),
18967 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
18968 /*StrictCheck=*/true);
18969 if (!Diff)
18970 continue;
18971 auto It = Set.second.find(std::make_pair(Idx, *Diff));
18972 if (It == Set.second.end()) {
18973 Set.second.emplace(Idx, *Diff);
18974 return;
18975 }
18976 // Try to vectorize the first found set to avoid duplicate analysis.
18977 TryToVectorize(Set.second);
18978 unsigned ItIdx = It->first;
18979 int ItDist = It->second;
18980 StoreIndexToDistSet PrevSet;
18981 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
18982 [&](const std::pair<unsigned, int> &Pair) {
18983 return Pair.first > ItIdx;
18984 });
18985 Set.second.clear();
18986 Set.first = Idx;
18987 Set.second.emplace(Idx, 0);
18988 // Insert stores that followed previous match to try to vectorize them
18989 // with this store.
18990 unsigned StartIdx = ItIdx + 1;
18991 SmallBitVector UsedStores(Idx - StartIdx);
18992 // Distances to previously found dup store (or this store, since they
18993 // store to the same addresses).
18994 SmallVector<int> Dists(Idx - StartIdx, 0);
18995 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
18996 // Do not try to vectorize sequences that we have already tried.
18997 if (VectorizedStores.contains(Stores[Pair.first]))
18998 break;
18999 unsigned BI = Pair.first - StartIdx;
19000 UsedStores.set(BI);
19001 Dists[BI] = Pair.second - ItDist;
19002 }
19003 for (unsigned I = StartIdx; I < Idx; ++I) {
19004 unsigned BI = I - StartIdx;
19005 if (UsedStores.test(BI))
19006 Set.second.emplace(I, Dists[BI]);
19007 }
19008 return;
19009 }
19010 auto &Res = SortedStores.emplace_back();
19011 Res.first = Idx;
19012 Res.second.emplace(Idx, 0);
19013 };
19014 Type *PrevValTy = nullptr;
19015 for (auto [I, SI] : enumerate(Stores)) {
19016 if (R.isDeleted(SI))
19017 continue;
19018 if (!PrevValTy)
19019 PrevValTy = SI->getValueOperand()->getType();
19020 // Check that we do not try to vectorize stores of different types.
19021 if (PrevValTy != SI->getValueOperand()->getType()) {
19022 for (auto &Set : SortedStores)
19023 TryToVectorize(Set.second);
19024 SortedStores.clear();
19025 PrevValTy = SI->getValueOperand()->getType();
19026 }
19027 FillStoresSet(I, SI);
19028 }
19029
19030 // Final vectorization attempt.
19031 for (auto &Set : SortedStores)
19032 TryToVectorize(Set.second);
19033
19034 return Changed;
19035}
19036
19037void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19038 // Initialize the collections. We will make a single pass over the block.
19039 Stores.clear();
19040 GEPs.clear();
19041
19042 // Visit the store and getelementptr instructions in BB and organize them in
19043 // Stores and GEPs according to the underlying objects of their pointer
19044 // operands.
19045 for (Instruction &I : *BB) {
19046 // Ignore store instructions that are volatile or have a pointer operand
19047 // that doesn't point to a scalar type.
19048 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19049 if (!SI->isSimple())
19050 continue;
19051 if (!isValidElementType(SI->getValueOperand()->getType()))
19052 continue;
19053 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19054 }
19055
19056 // Ignore getelementptr instructions that have more than one index, a
19057 // constant index, or a pointer operand that doesn't point to a scalar
19058 // type.
19059 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19060 if (GEP->getNumIndices() != 1)
19061 continue;
19062 Value *Idx = GEP->idx_begin()->get();
19063 if (isa<Constant>(Idx))
19064 continue;
19065 if (!isValidElementType(Idx->getType()))
19066 continue;
19067 if (GEP->getType()->isVectorTy())
19068 continue;
19069 GEPs[GEP->getPointerOperand()].push_back(GEP);
19070 }
19071 }
19072}
19073
19074bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19075 bool MaxVFOnly) {
19076 if (VL.size() < 2)
19077 return false;
19078
19079 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19080 << VL.size() << ".\n");
19081
19082 // Check that all of the parts are instructions of the same type,
19083 // we permit an alternate opcode via InstructionsState.
19084 InstructionsState S = getSameOpcode(VL, *TLI);
19085 if (!S.getOpcode())
19086 return false;
19087
19088 Instruction *I0 = S.getMainOp();
19089 // Make sure invalid types (including vector type) are rejected before
19090 // determining vectorization factor for scalar instructions.
19091 for (Value *V : VL) {
19092 Type *Ty = V->getType();
19093 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19094 // NOTE: the following will give the user an internal LLVM type name, which
19095 // may not be useful.
19096 R.getORE()->emit([&]() {
19097 std::string TypeStr;
19098 llvm::raw_string_ostream rso(TypeStr);
19099 Ty->print(rso);
19100 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19101 << "Cannot SLP vectorize list: type "
19102 << TypeStr + " is unsupported by vectorizer";
19103 });
19104 return false;
19105 }
19106 }
19107
19108 unsigned Sz = R.getVectorElementSize(I0);
19109 unsigned MinVF = R.getMinVF(Sz);
19110 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19111 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
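// Illustrative example (assuming a 128-bit minimum vector register width): for
// six i32 values, Sz = 32, MinVF = 4 and MaxVF = max(bit_floor(6), 4) = 4,
// further clamped by the target's getMaximumVF(), so only VF = 4 is tried.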
19112 if (MaxVF < 2) {
19113 R.getORE()->emit([&]() {
19114 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19115 << "Cannot SLP vectorize list: vectorization factor "
19116 << "less than 2 is not supported";
19117 });
19118 return false;
19119 }
19120
19121 bool Changed = false;
19122 bool CandidateFound = false;
19123 InstructionCost MinCost = SLPCostThreshold.getValue();
19124 Type *ScalarTy = getValueType(VL[0]);
19125
19126 unsigned NextInst = 0, MaxInst = VL.size();
19127 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19128 // No actual vectorization should happen if the number of parts is the same
19129 // as the provided vectorization factor (i.e. the scalar type is used for
19130 // vector code during codegen).
19131 auto *VecTy = getWidenedType(ScalarTy, VF);
19132 if (TTI->getNumberOfParts(VecTy) == VF)
19133 continue;
19134 for (unsigned I = NextInst; I < MaxInst; ++I) {
19135 unsigned ActualVF = std::min(MaxInst - I, VF);
19136
19137 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19138 continue;
19139
19140 if (MaxVFOnly && ActualVF < MaxVF)
19141 break;
19142 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
19143 break;
19144
19145 SmallVector<Value *> Ops(ActualVF, nullptr);
19146 unsigned Idx = 0;
19147 for (Value *V : VL.drop_front(I)) {
19148 // Check that a previous iteration of this loop did not delete the
19149 // Value.
19150 if (auto *Inst = dyn_cast<Instruction>(V);
19151 !Inst || !R.isDeleted(Inst)) {
19152 Ops[Idx] = V;
19153 ++Idx;
19154 if (Idx == ActualVF)
19155 break;
19156 }
19157 }
19158 // Not enough vectorizable instructions - exit.
19159 if (Idx != ActualVF)
19160 break;
19161
19162 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19163 << "\n");
19164
19165 R.buildTree(Ops);
19166 if (R.isTreeTinyAndNotFullyVectorizable())
19167 continue;
19168 R.reorderTopToBottom();
19169 R.reorderBottomToTop(
19170 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19171 !R.doesRootHaveInTreeUses());
19172 R.transformNodes();
19173 R.buildExternalUses();
19174
19175 R.computeMinimumValueSizes();
19176 InstructionCost Cost = R.getTreeCost();
19177 CandidateFound = true;
19178 MinCost = std::min(MinCost, Cost);
19179
19180 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19181 << " for VF=" << ActualVF << "\n");
19182 if (Cost < -SLPCostThreshold) {
19183 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19184 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19185 cast<Instruction>(Ops[0]))
19186 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19187 << " and with tree size "
19188 << ore::NV("TreeSize", R.getTreeSize()));
19189
19190 R.vectorizeTree();
19191 // Move to the next bundle.
19192 I += VF - 1;
19193 NextInst = I + 1;
19194 Changed = true;
19195 }
19196 }
19197 }
19198
19199 if (!Changed && CandidateFound) {
19200 R.getORE()->emit([&]() {
19201 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19202 << "List vectorization was possible but not beneficial with cost "
19203 << ore::NV("Cost", MinCost) << " >= "
19204 << ore::NV("Treshold", -SLPCostThreshold);
19205 });
19206 } else if (!Changed) {
19207 R.getORE()->emit([&]() {
19208 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19209 << "Cannot SLP vectorize list: vectorization was impossible"
19210 << " with available vectorization factors";
19211 });
19212 }
19213 return Changed;
19214}
19215
19216bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19217 if (!I)
19218 return false;
19219
19220 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19221 return false;
19222
19223 Value *P = I->getParent();
19224
19225 // Vectorize in current basic block only.
19226 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19227 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19228 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19229 R.isDeleted(Op0) || R.isDeleted(Op1))
19230 return false;
19231
19232 // First collect all possible candidates.
19233 SmallVector<std::pair<Value *, Value *>> Candidates;
19234 Candidates.emplace_back(Op0, Op1);
19235
19236 auto *A = dyn_cast<BinaryOperator>(Op0);
19237 auto *B = dyn_cast<BinaryOperator>(Op1);
19238 // Try to skip B.
19239 if (A && B && B->hasOneUse()) {
19240 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19241 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19242 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19243 Candidates.emplace_back(A, B0);
19244 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19245 Candidates.emplace_back(A, B1);
19246 }
19247 // Try to skip A.
19248 if (B && A && A->hasOneUse()) {
19249 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19250 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19251 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19252 Candidates.emplace_back(A0, B);
19253 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19254 Candidates.emplace_back(A1, B);
19255 }
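// Illustrative example (not from the original source): for I = A + B with
// A = x * y and B = (z * w) * (u * v), where B has a single use, the candidate
// pairs are {A, B}, {A, z * w} and {A, u * v}; the best-scoring pair is then
// tried below.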
19256
19257 if (Candidates.size() == 1)
19258 return tryToVectorizeList({Op0, Op1}, R);
19259
19260 // We have multiple options. Try to pick the single best.
19261 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19262 if (!BestCandidate)
19263 return false;
19264 return tryToVectorizeList(
19265 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19266}
19267
19268namespace {
19269
19270/// Model horizontal reductions.
19271///
19272/// A horizontal reduction is a tree of reduction instructions that has values
19273/// that can be put into a vector as its leaves. For example:
19274///
19275/// mul mul mul mul
19276/// \ / \ /
19277/// + +
19278/// \ /
19279/// +
19280/// This tree has "mul" as its leaf values and "+" as its reduction
19281/// instructions. A reduction can feed into a store or a binary operation
19282/// feeding a phi.
19283/// ...
19284/// \ /
19285/// +
19286/// |
19287/// phi +=
19288///
19289/// Or:
19290/// ...
19291/// \ /
19292/// +
19293/// |
19294/// *p =
19295///
19296class HorizontalReduction {
19297 using ReductionOpsType = SmallVector<Value *, 16>;
19298 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19299 ReductionOpsListType ReductionOps;
19300 /// List of possibly reduced values.
19302 /// Maps reduced value to the corresponding reduction operation.
19304 WeakTrackingVH ReductionRoot;
19305 /// The type of reduction operation.
19306 RecurKind RdxKind;
19307 /// Checks if the optimization of original scalar identity operations on
19308 /// matched horizontal reductions is enabled and allowed.
19309 bool IsSupportedHorRdxIdentityOp = false;
19310
19311 static bool isCmpSelMinMax(Instruction *I) {
19312 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19313 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
19314 }
19315
19316 // And/or are potentially poison-safe logical patterns like:
19317 // select x, y, false
19318 // select x, true, y
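// e.g. (illustrative) "%r = select i1 %x, i1 true, i1 %y" behaves like a
// poison-safe "or i1 %x, %y" and is matched by m_LogicalOr() below.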
19319 static bool isBoolLogicOp(Instruction *I) {
19320 return isa<SelectInst>(I) &&
19321 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19322 }
19323
19324 /// Checks if instruction is associative and can be vectorized.
19325 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19326 if (Kind == RecurKind::None)
19327 return false;
19328
19329 // Integer ops that map to select instructions or intrinsics are fine.
19330 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19331 isBoolLogicOp(I))
19332 return true;
19333
19334 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19335 // FP min/max are associative except for NaN and -0.0. We do not
19336 // have to rule out -0.0 here because the intrinsic semantics do not
19337 // specify a fixed result for it.
19338 return I->getFastMathFlags().noNaNs();
19339 }
19340
19341 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19342 return true;
19343
19344 return I->isAssociative();
19345 }
19346
19347 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19348 // Poison-safe 'or' takes the form: select X, true, Y
19349 // To make that work with the normal operand processing, we skip the
19350 // true value operand.
19351 // TODO: Change the code and data structures to handle this without a hack.
19352 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19353 return I->getOperand(2);
19354 return I->getOperand(Index);
19355 }
19356
19357 /// Creates reduction operation with the current opcode.
19358 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19359 Value *RHS, const Twine &Name, bool UseSelect) {
19360 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19361 switch (Kind) {
19362 case RecurKind::Or:
19363 if (UseSelect &&
19365 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19366 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19367 Name);
19368 case RecurKind::And:
19369 if (UseSelect &&
19371 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19372 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19373 Name);
19374 case RecurKind::Add:
19375 case RecurKind::Mul:
19376 case RecurKind::Xor:
19377 case RecurKind::FAdd:
19378 case RecurKind::FMul:
19379 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19380 Name);
19381 case RecurKind::FMax:
19382 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
19383 case RecurKind::FMin:
19384 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
19385 case RecurKind::FMaximum:
19386 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
19387 case RecurKind::FMinimum:
19388 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
19389 case RecurKind::SMax:
19390 if (UseSelect) {
19391 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
19392 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19393 }
19394 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
19395 case RecurKind::SMin:
19396 if (UseSelect) {
19397 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
19398 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19399 }
19400 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
19401 case RecurKind::UMax:
19402 if (UseSelect) {
19403 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
19404 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19405 }
19406 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
19407 case RecurKind::UMin:
19408 if (UseSelect) {
19409 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
19410 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19411 }
19412 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
19413 default:
19414 llvm_unreachable("Unknown reduction operation.");
19415 }
19416 }
19417
19418 /// Creates reduction operation with the current opcode with the IR flags
19419 /// from \p ReductionOps, dropping nuw/nsw flags.
19420 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19421 Value *RHS, const Twine &Name,
19422 const ReductionOpsListType &ReductionOps) {
19423 bool UseSelect = ReductionOps.size() == 2 ||
19424 // Logical or/and.
19425 (ReductionOps.size() == 1 &&
19426 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19427 assert((!UseSelect || ReductionOps.size() != 2 ||
19428 isa<SelectInst>(ReductionOps[1][0])) &&
19429 "Expected cmp + select pairs for reduction");
19430 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19431 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19432 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19433 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19434 /*IncludeWrapFlags=*/false);
19435 propagateIRFlags(Op, ReductionOps[1], nullptr,
19436 /*IncludeWrapFlags=*/false);
19437 return Op;
19438 }
19439 }
19440 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19441 return Op;
19442 }
19443
19444public:
19445 static RecurKind getRdxKind(Value *V) {
19446 auto *I = dyn_cast<Instruction>(V);
19447 if (!I)
19448 return RecurKind::None;
19449 if (match(I, m_Add(m_Value(), m_Value())))
19450 return RecurKind::Add;
19451 if (match(I, m_Mul(m_Value(), m_Value())))
19452 return RecurKind::Mul;
19453 if (match(I, m_And(m_Value(), m_Value())) ||
19454 match(I, m_LogicalAnd(m_Value(), m_Value())))
19455 return RecurKind::And;
19456 if (match(I, m_Or(m_Value(), m_Value())) ||
19457 match(I, m_LogicalOr(m_Value(), m_Value())))
19458 return RecurKind::Or;
19459 if (match(I, m_Xor(m_Value(), m_Value())))
19460 return RecurKind::Xor;
19461 if (match(I, m_FAdd(m_Value(), m_Value())))
19462 return RecurKind::FAdd;
19463 if (match(I, m_FMul(m_Value(), m_Value())))
19464 return RecurKind::FMul;
19465
19466 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19467 return RecurKind::FMax;
19468 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19469 return RecurKind::FMin;
19470
19471 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19472 return RecurKind::FMaximum;
19473 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19474 return RecurKind::FMinimum;
19475 // This matches either cmp+select or intrinsics. SLP is expected to handle
19476 // either form.
19477 // TODO: If we are canonicalizing to intrinsics, we can remove several
19478 // special-case paths that deal with selects.
19479 if (match(I, m_SMax(m_Value(), m_Value())))
19480 return RecurKind::SMax;
19481 if (match(I, m_SMin(m_Value(), m_Value())))
19482 return RecurKind::SMin;
19483 if (match(I, m_UMax(m_Value(), m_Value())))
19484 return RecurKind::UMax;
19485 if (match(I, m_UMin(m_Value(), m_Value())))
19486 return RecurKind::UMin;
19487
19488 if (auto *Select = dyn_cast<SelectInst>(I)) {
19489 // Try harder: look for min/max pattern based on instructions producing
19490 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19491 // During the intermediate stages of SLP, it's very common to have
19492 // a pattern like this (since optimizeGatherSequence is run only once
19493 // at the end):
19494 // %1 = extractelement <2 x i32> %a, i32 0
19495 // %2 = extractelement <2 x i32> %a, i32 1
19496 // %cond = icmp sgt i32 %1, %2
19497 // %3 = extractelement <2 x i32> %a, i32 0
19498 // %4 = extractelement <2 x i32> %a, i32 1
19499 // %select = select i1 %cond, i32 %3, i32 %4
19500 CmpPredicate Pred;
19501 Instruction *L1;
19502 Instruction *L2;
19503
19504 Value *LHS = Select->getTrueValue();
19505 Value *RHS = Select->getFalseValue();
19506 Value *Cond = Select->getCondition();
19507
19508 // TODO: Support inverse predicates.
19509 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19510 if (!isa<ExtractElementInst>(RHS) ||
19511 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19512 return RecurKind::None;
19513 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19514 if (!isa<ExtractElementInst>(LHS) ||
19515 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19516 return RecurKind::None;
19517 } else {
19518 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19519 return RecurKind::None;
19520 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19521 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19522 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19523 return RecurKind::None;
19524 }
19525
19526 switch (Pred) {
19527 default:
19528 return RecurKind::None;
19529 case CmpInst::ICMP_SGT:
19530 case CmpInst::ICMP_SGE:
19531 return RecurKind::SMax;
19532 case CmpInst::ICMP_SLT:
19533 case CmpInst::ICMP_SLE:
19534 return RecurKind::SMin;
19535 case CmpInst::ICMP_UGT:
19536 case CmpInst::ICMP_UGE:
19537 return RecurKind::UMax;
19538 case CmpInst::ICMP_ULT:
19539 case CmpInst::ICMP_ULE:
19540 return RecurKind::UMin;
19541 }
19542 }
19543 return RecurKind::None;
19544 }
19545
19546 /// Get the index of the first operand.
19547 static unsigned getFirstOperandIndex(Instruction *I) {
19548 return isCmpSelMinMax(I) ? 1 : 0;
19549 }
19550
19551private:
19552 /// Total number of operands in the reduction operation.
19553 static unsigned getNumberOfOperands(Instruction *I) {
19554 return isCmpSelMinMax(I) ? 3 : 2;
19555 }
19556
19557 /// Checks if the instruction is in basic block \p BB.
19558 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19559 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19560 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19561 auto *Sel = cast<SelectInst>(I);
19562 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19563 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19564 }
19565 return I->getParent() == BB;
19566 }
19567
19568 /// Expected number of uses for reduction operations/reduced values.
19569 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19570 if (IsCmpSelMinMax) {
19571 // SelectInst must be used twice while the condition op must have a
19572 // single use only.
19573 if (auto *Sel = dyn_cast<SelectInst>(I))
19574 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19575 return I->hasNUses(2);
19576 }
19577
19578 // Arithmetic reduction operation must be used once only.
19579 return I->hasOneUse();
19580 }
19581
19582 /// Initializes the list of reduction operations.
19583 void initReductionOps(Instruction *I) {
19584 if (isCmpSelMinMax(I))
19585 ReductionOps.assign(2, ReductionOpsType());
19586 else
19587 ReductionOps.assign(1, ReductionOpsType());
19588 }
19589
19590 /// Add all reduction operations for the reduction instruction \p I.
19591 void addReductionOps(Instruction *I) {
19592 if (isCmpSelMinMax(I)) {
19593 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19594 ReductionOps[1].emplace_back(I);
19595 } else {
19596 ReductionOps[0].emplace_back(I);
19597 }
19598 }
19599
19600 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19601 int Sz = Data.size();
19602 auto *I = dyn_cast<Instruction>(Data.front());
19603 return Sz > 1 || isConstant(Data.front()) ||
19604 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19605 }
19606
19607public:
19608 HorizontalReduction() = default;
19609
19610 /// Try to find a reduction tree.
19611 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19612 ScalarEvolution &SE, const DataLayout &DL,
19613 const TargetLibraryInfo &TLI) {
19614 RdxKind = HorizontalReduction::getRdxKind(Root);
19615 if (!isVectorizable(RdxKind, Root))
19616 return false;
19617
19618 // Analyze "regular" integer/FP types for reductions - no target-specific
19619 // types or pointers.
19620 Type *Ty = Root->getType();
19621 if (!isValidElementType(Ty) || Ty->isPointerTy())
19622 return false;
19623
19624 // Though the ultimate reduction may have multiple uses, its condition must
19625 // have only a single use.
19626 if (auto *Sel = dyn_cast<SelectInst>(Root))
19627 if (!Sel->getCondition()->hasOneUse())
19628 return false;
19629
19630 ReductionRoot = Root;
19631
19632 // Iterate through all the operands of the possible reduction tree and
19633 // gather all the reduced values, sorting them by their value id.
19634 BasicBlock *BB = Root->getParent();
19635 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19636 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19637 1, std::make_pair(Root, 0));
19638 // Checks if the operands of the \p TreeN instruction are also reduction
19639 // operations or should be treated as reduced values or an extra argument,
19640 // which is not part of the reduction.
19641 auto CheckOperands = [&](Instruction *TreeN,
19642 SmallVectorImpl<Value *> &PossibleReducedVals,
19643 SmallVectorImpl<Instruction *> &ReductionOps,
19644 unsigned Level) {
19645 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19646 getNumberOfOperands(TreeN)))) {
19647 Value *EdgeVal = getRdxOperand(TreeN, I);
19648 ReducedValsToOps[EdgeVal].push_back(TreeN);
19649 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19650 // If the edge is not an instruction, or its opcode differs from the main
19651 // reduction opcode, or it has too many uses, treat it as a possible
19652 // reduced value. Also, do not try to reduce constant values if the
19653 // operation is not foldable.
19654 if (!EdgeInst || Level > RecursionMaxDepth ||
19655 getRdxKind(EdgeInst) != RdxKind ||
19656 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19657 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19658 !isVectorizable(RdxKind, EdgeInst) ||
19659 (R.isAnalyzedReductionRoot(EdgeInst) &&
19660 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19661 PossibleReducedVals.push_back(EdgeVal);
19662 continue;
19663 }
19664 ReductionOps.push_back(EdgeInst);
19665 }
19666 };
19667 // Try to regroup the reduced values so that it becomes more profitable to
19668 // reduce them. Values are grouped by their value ids, instructions by their
19669 // opcode and/or alternate opcode, plus extra analysis is done for loads
19670 // (grouping them by the distance between pointers) and cmp instructions
19671 // (grouping them by the predicate).
19672 SmallMapVector<size_t,
19673 SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19674 8>
19675 PossibleReducedVals;
19676 initReductionOps(Root);
19677 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
19678 SmallSet<size_t, 2> LoadKeyUsed;
19679
19680 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19681 Key = hash_combine(hash_value(LI->getParent()), Key);
19682 Value *Ptr =
19683 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
19684 if (!LoadKeyUsed.insert(Key).second) {
19685 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19686 if (LIt != LoadsMap.end()) {
19687 for (LoadInst *RLI : LIt->second) {
19688 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19689 LI->getType(), LI->getPointerOperand(), DL, SE,
19690 /*StrictCheck=*/true))
19691 return hash_value(RLI->getPointerOperand());
19692 }
19693 for (LoadInst *RLI : LIt->second) {
19694 if (arePointersCompatible(RLI->getPointerOperand(),
19695 LI->getPointerOperand(), TLI)) {
19696 hash_code SubKey = hash_value(RLI->getPointerOperand());
19697 return SubKey;
19698 }
19699 }
19700 if (LIt->second.size() > 2) {
19701 hash_code SubKey =
19702 hash_value(LIt->second.back()->getPointerOperand());
19703 return SubKey;
19704 }
19705 }
19706 }
19707 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19708 .first->second.push_back(LI);
19709 return hash_value(LI->getPointerOperand());
19710 };
19711
19712 while (!Worklist.empty()) {
19713 auto [TreeN, Level] = Worklist.pop_back_val();
19714 SmallVector<Value *> PossibleRedVals;
19715 SmallVector<Instruction *> PossibleReductionOps;
19716 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19717 addReductionOps(TreeN);
19718 // Add reduction values. The values are sorted for better vectorization
19719 // results.
19720 for (Value *V : PossibleRedVals) {
19721 size_t Key, Idx;
19722 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19723 /*AllowAlternate=*/false);
19724 ++PossibleReducedVals[Key][Idx]
19725 .insert(std::make_pair(V, 0))
19726 .first->second;
19727 }
19728 for (Instruction *I : reverse(PossibleReductionOps))
19729 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19730 }
19731 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19732 // Sort values by the total number of value kinds so that the reduction
19733 // starts from the longest possible sequences of reduced values.
19734 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19735 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19736 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19737 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19738 It != E; ++It) {
19739 PossibleRedValsVect.emplace_back();
19740 auto RedValsVect = It->second.takeVector();
19741 stable_sort(RedValsVect, llvm::less_second());
19742 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19743 PossibleRedValsVect.back().append(Data.second, Data.first);
19744 }
19745 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19746 return P1.size() > P2.size();
19747 });
19748 int NewIdx = -1;
19749 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19750 if (NewIdx < 0 ||
19751 (!isGoodForReduction(Data) &&
19752 (!isa<LoadInst>(Data.front()) ||
19753 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19754 getUnderlyingObject(
19755 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19756 getUnderlyingObject(
19757 cast<LoadInst>(ReducedVals[NewIdx].front())
19758 ->getPointerOperand())))) {
19759 NewIdx = ReducedVals.size();
19760 ReducedVals.emplace_back();
19761 }
19762 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19763 }
19764 }
19765 // Sort the reduced values by the size of the groups formed by the
19766 // same/alternate opcode and/or pointer operand.
19767 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19768 return P1.size() > P2.size();
19769 });
19770 return true;
19771 }
19772
19773 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19774 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19775 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19776 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19777 constexpr unsigned RegMaxNumber = 4;
19778 constexpr unsigned RedValsMaxNumber = 128;
19779 // If there are a sufficient number of reduction values, reduce
19780 // to a nearby power-of-2. We can safely generate oversized
19781 // vectors and rely on the backend to split them to legal sizes.
19782 if (unsigned NumReducedVals = std::accumulate(
19783 ReducedVals.begin(), ReducedVals.end(), 0,
19784 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19785 if (!isGoodForReduction(Vals))
19786 return Num;
19787 return Num + Vals.size();
19788 });
19789 NumReducedVals < ReductionLimit &&
19790 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19791 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19792 })) {
19793 for (ReductionOpsType &RdxOps : ReductionOps)
19794 for (Value *RdxOp : RdxOps)
19795 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19796 return nullptr;
19797 }
19798
19799 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19800 TargetFolder(DL));
19801 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19802
19803 // Track the reduced values in case they are replaced by extractelement
19804 // because of the vectorization.
19805 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19806 ReducedVals.front().size());
19807
19808 // The compare instruction of a min/max is the insertion point for new
19809 // instructions and may be replaced with a new compare instruction.
19810 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19811 assert(isa<SelectInst>(RdxRootInst) &&
19812 "Expected min/max reduction to have select root instruction");
19813 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19814 assert(isa<Instruction>(ScalarCond) &&
19815 "Expected min/max reduction to have compare condition");
19816 return cast<Instruction>(ScalarCond);
19817 };
19818
19819 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19820 return isBoolLogicOp(cast<Instruction>(V));
19821 });
19822 // Return new VectorizedTree, based on previous value.
19823 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19824 if (VectorizedTree) {
19825 // Update the final value in the reduction.
19826 Builder.SetCurrentDebugLocation(
19827 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19828 if (AnyBoolLogicOp) {
19829 auto It = ReducedValsToOps.find(VectorizedTree);
19830 auto It1 = ReducedValsToOps.find(Res);
19831 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19832 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19833 (It != ReducedValsToOps.end() &&
19834 any_of(It->getSecond(), [&](Instruction *I) {
19835 return isBoolLogicOp(I) &&
19836 getRdxOperand(I, 0) == VectorizedTree;
19837 }))) {
19838 ;
19839 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19840 (It1 != ReducedValsToOps.end() &&
19841 any_of(It1->getSecond(), [&](Instruction *I) {
19842 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19843 }))) {
19844 std::swap(VectorizedTree, Res);
19845 } else {
19846 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19847 }
19848 }
19849
19850 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19851 ReductionOps);
19852 }
19853 // Initialize the final value in the reduction.
19854 return Res;
19855 };
19856 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19857 ReductionOps.front().size());
19858 for (ReductionOpsType &RdxOps : ReductionOps)
19859 for (Value *RdxOp : RdxOps) {
19860 if (!RdxOp)
19861 continue;
19862 IgnoreList.insert(RdxOp);
19863 }
19864 // Intersect the fast-math-flags from all reduction operations.
19865 FastMathFlags RdxFMF;
19866 RdxFMF.set();
19867 for (Value *U : IgnoreList)
19868 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19869 RdxFMF &= FPMO->getFastMathFlags();
19870 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19871
19872 // Need to track the reduced values, as they may be changed during the
19873 // vectorization of subvectors.
19874 for (ArrayRef<Value *> Candidates : ReducedVals)
19875 for (Value *V : Candidates)
19876 TrackedVals.try_emplace(V, V);
19877
19878 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19879 Value *V) -> unsigned & {
19880 auto *It = MV.find(V);
19881 assert(It != MV.end() && "Unable to find given key.");
19882 return It->second;
19883 };
19884
19885 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19886 // List of the values that were reduced in other trees as part of gather
19887 // nodes and thus requiring extract if fully vectorized in other trees.
19888 SmallPtrSet<Value *, 4> RequiredExtract;
19889 WeakTrackingVH VectorizedTree = nullptr;
19890 bool CheckForReusedReductionOps = false;
19891 // Try to vectorize elements based on their type.
19892 SmallVector<InstructionsState> States;
19893 for (ArrayRef<Value *> RV : ReducedVals)
19894 States.push_back(getSameOpcode(RV, TLI));
19895 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19896 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19897 InstructionsState S = States[I];
19898 SmallVector<Value *> Candidates;
19899 Candidates.reserve(2 * OrigReducedVals.size());
19900 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19901 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19902 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19903 // Check if the reduction value was not overridden by the extractelement
19904 // instruction because of the vectorization and exclude it, if it is not
19905 // compatible with other values.
19906 // Also check if the instruction was folded to constant/other value.
19907 auto *Inst = dyn_cast<Instruction>(RdxVal);
19908 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19909 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
19910 (S.getOpcode() && !Inst))
19911 continue;
19912 Candidates.push_back(RdxVal);
19913 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19914 }
19915 bool ShuffledExtracts = false;
19916 // Try to handle shuffled extractelements.
19917 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
19918 I + 1 < E) {
19919 SmallVector<Value *> CommonCandidates(Candidates);
19920 for (Value *RV : ReducedVals[I + 1]) {
19921 Value *RdxVal = TrackedVals.at(RV);
19922 // Check if the reduction value was not overridden by the
19923 // extractelement instruction because of the vectorization and
19924 // exclude it, if it is not compatible with other values.
19925 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19926 if (!Inst)
19927 continue;
19928 CommonCandidates.push_back(RdxVal);
19929 TrackedToOrig.try_emplace(RdxVal, RV);
19930 }
19931 SmallVector<int> Mask;
19932 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19933 ++I;
19934 Candidates.swap(CommonCandidates);
19935 ShuffledExtracts = true;
19936 }
19937 }
19938
19939 // Emit code for constant values.
19940 if (Candidates.size() > 1 && allConstant(Candidates)) {
19941 Value *Res = Candidates.front();
19942 Value *OrigV = TrackedToOrig.at(Candidates.front());
19943 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19944 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19945 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19946 Value *OrigV = TrackedToOrig.at(VC);
19947 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19948 if (auto *ResI = dyn_cast<Instruction>(Res))
19949 V.analyzedReductionRoot(ResI);
19950 }
19951 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19952 continue;
19953 }
19954
19955 unsigned NumReducedVals = Candidates.size();
19956 if (NumReducedVals < ReductionLimit &&
19957 (NumReducedVals < 2 || !isSplat(Candidates)))
19958 continue;
19959
19960 // Check if we support processing of repeated scalar values (optimization of
19961 // original scalar identity operations on matched horizontal reductions).
19962 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
19963 RdxKind != RecurKind::FMul &&
19964 RdxKind != RecurKind::FMulAdd;
19965 // Gather same values.
19966 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
19967 if (IsSupportedHorRdxIdentityOp)
19968 for (Value *V : Candidates) {
19969 Value *OrigV = TrackedToOrig.at(V);
19970 ++SameValuesCounter.try_emplace(OrigV).first->second;
19971 }
19972 // Used to check if the reduced values are used the same number of times. In this
19973 // case the compiler may produce better code. E.g. if reduced values are
19974 // aabbccdd (8 x values), then the first node of the tree will have a node
19975 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
19976 // Plus, the final reduction will be performed on <8 x aabbccdd>.
19977 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
19978 // x abcd) * 2.
19979 // Currently it only handles add/fadd/xor. and/or/min/max do not require
19980 // this analysis, other operations may require an extra estimation of
19981 // the profitability.
19982 bool SameScaleFactor = false;
19983 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
19984 SameValuesCounter.size() != Candidates.size();
19985 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
19986 if (OptReusedScalars) {
19987 SameScaleFactor =
19988 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
19989 RdxKind == RecurKind::Xor) &&
19990 all_of(drop_begin(SameValuesCounter),
19991 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
19992 return P.second == SameValuesCounter.front().second;
19993 });
19994 Candidates.resize(SameValuesCounter.size());
19995 transform(SameValuesCounter, Candidates.begin(),
19996 [&](const auto &P) { return TrackedVals.at(P.first); });
19997 NumReducedVals = Candidates.size();
19998 // Have a reduction of the same element.
19999 if (NumReducedVals == 1) {
20000 Value *OrigV = TrackedToOrig.at(Candidates.front());
20001 unsigned Cnt = At(SameValuesCounter, OrigV);
20002 Value *RedVal =
20003 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20004 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20005 VectorizedVals.try_emplace(OrigV, Cnt);
20006 ExternallyUsedValues.insert(OrigV);
20007 continue;
20008 }
20009 }
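// Worked example (illustrative sketch, not from the original file): for an
// add reduction of the eight values a a b b c c d d, the same-scale-factor
// path reduces the four unique values once and multiplies by the common
// repeat count instead of building an 8-lane tree. In scalar form:
//   int reduceWithCommonScale(const std::vector<int> &Unique, int Repeats) {
//     int Sum = 0;
//     for (int V : Unique) // reduce the deduplicated values once
//       Sum += V;
//     return Sum * Repeats; // N copies of an add operand contribute N * value
//   }
// emitScaleForReusedOps below emits the IR counterpart of the final multiply.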
20010
20011 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20012 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20013 const unsigned MaxElts = std::clamp<unsigned>(
20014 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20015 RegMaxNumber * RedValsMaxNumber);
20016
20017 unsigned ReduxWidth = NumReducedVals;
20018 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20019 unsigned NumParts, NumRegs;
20020 Type *ScalarTy = Candidates.front()->getType();
20021 ReduxWidth =
20022 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20023 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20024 NumParts = TTI.getNumberOfParts(Tp);
20025 NumRegs =
20026 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20027 while (NumParts > NumRegs) {
20028 ReduxWidth = bit_floor(ReduxWidth - 1);
20029 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20030 NumParts = TTI.getNumberOfParts(Tp);
20031 NumRegs =
20032 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20033 }
20034 if (NumParts > NumRegs / 2)
20035 ReduxWidth = bit_floor(ReduxWidth);
20036 return ReduxWidth;
20037 };
20038 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20039 ReduxWidth = GetVectorFactor(ReduxWidth);
20040 ReduxWidth = std::min(ReduxWidth, MaxElts);
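// Worked example (illustrative): with MaxVecRegSize = 128 bits and 32-bit
// elements, bit_floor(128 / 32) = 4 and std::clamp(4, 128, 512) yields
// MaxElts = 128, so a single attempt may take up to 128 scalars and rely on
// the backend to split the oversized vector into legal register-sized
// pieces, as noted at the top of tryToReduce.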
20041
20042 unsigned Start = 0;
20043 unsigned Pos = Start;
20044 // Restarts vectorization attempt with lower vector factor.
20045 unsigned PrevReduxWidth = ReduxWidth;
20046 bool CheckForReusedReductionOpsLocal = false;
20047 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20048 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20049 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20050 // Check if any of the reduction ops are gathered. If so, it is worth
20051 // trying again with a smaller number of reduction ops.
20052 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20053 }
20054 ++Pos;
20055 if (Pos < NumReducedVals - ReduxWidth + 1)
20056 return IsAnyRedOpGathered;
20057 Pos = Start;
20058 --ReduxWidth;
20059 if (ReduxWidth > 1)
20060 ReduxWidth = GetVectorFactor(ReduxWidth);
20061 return IsAnyRedOpGathered;
20062 };
20063 bool AnyVectorized = false;
20064 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20065 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20066 ReduxWidth >= ReductionLimit) {
20067 // Dependency in tree of the reduction ops - drop this attempt, try
20068 // later.
20069 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20070 Start == 0) {
20071 CheckForReusedReductionOps = true;
20072 break;
20073 }
20074 PrevReduxWidth = ReduxWidth;
20075 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20076 // Been analyzed already - skip.
20077 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20078 (!has_single_bit(ReduxWidth) &&
20079 (IgnoredCandidates.contains(
20080 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20081 IgnoredCandidates.contains(
20082 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20083 bit_floor(ReduxWidth))))) ||
20084 V.areAnalyzedReductionVals(VL)) {
20085 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20086 continue;
20087 }
20088 // Early exit if any of the reduction values were deleted during
20089 // previous vectorization attempts.
20090 if (any_of(VL, [&V](Value *RedVal) {
20091 auto *RedValI = dyn_cast<Instruction>(RedVal);
20092 if (!RedValI)
20093 return false;
20094 return V.isDeleted(RedValI);
20095 }))
20096 break;
20097 V.buildTree(VL, IgnoreList);
20098 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20099 if (!AdjustReducedVals())
20100 V.analyzedReductionVals(VL);
20101 continue;
20102 }
20103 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20104 if (!AdjustReducedVals())
20105 V.analyzedReductionVals(VL);
20106 continue;
20107 }
20108 V.reorderTopToBottom();
20109 // No need to reorder the root node at all.
20110 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20111 // Keep the other extracted reduction values, if they are used in the
20112 // vectorization trees.
20113 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20114 ExternallyUsedValues);
20115 // The reduction root is used as the insertion point for new
20116 // instructions, so set it as externally used to prevent it from being
20117 // deleted.
20118 LocalExternallyUsedValues.insert(ReductionRoot);
20119 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20120 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20121 continue;
20122 for (Value *V : ReducedVals[Cnt])
20123 if (isa<Instruction>(V))
20124 LocalExternallyUsedValues.insert(TrackedVals[V]);
20125 }
20126 if (!IsSupportedHorRdxIdentityOp) {
20127 // Number of uses of the candidates in the vector of values.
20128 assert(SameValuesCounter.empty() &&
20129 "Reused values counter map is not empty");
20130 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20131 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20132 continue;
20133 Value *V = Candidates[Cnt];
20134 Value *OrigV = TrackedToOrig.at(V);
20135 ++SameValuesCounter.try_emplace(OrigV).first->second;
20136 }
20137 }
20138 V.transformNodes();
20139 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20140 // Gather externally used values.
20141 SmallPtrSet<Value *, 4> Visited;
20142 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20143 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20144 continue;
20145 Value *RdxVal = Candidates[Cnt];
20146 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20147 RdxVal = It->second;
20148 if (!Visited.insert(RdxVal).second)
20149 continue;
20150 // Check if the scalar was vectorized as part of the vectorization
20151 // tree but not the top node.
20152 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20153 LocalExternallyUsedValues.insert(RdxVal);
20154 continue;
20155 }
20156 Value *OrigV = TrackedToOrig.at(RdxVal);
20157 unsigned NumOps =
20158 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20159 if (NumOps != ReducedValsToOps.at(OrigV).size())
20160 LocalExternallyUsedValues.insert(RdxVal);
20161 }
20162 // Do not need the list of reused scalars in regular mode anymore.
20163 if (!IsSupportedHorRdxIdentityOp)
20164 SameValuesCounter.clear();
20165 for (Value *RdxVal : VL)
20166 if (RequiredExtract.contains(RdxVal))
20167 LocalExternallyUsedValues.insert(RdxVal);
20168 V.buildExternalUses(LocalExternallyUsedValues);
20169
20170 V.computeMinimumValueSizes();
20171
20172 // Estimate cost.
20173 InstructionCost TreeCost = V.getTreeCost(VL);
20174 InstructionCost ReductionCost =
20175 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20176 InstructionCost Cost = TreeCost + ReductionCost;
20177 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20178 << " for reduction\n");
20179 if (!Cost.isValid())
20180 break;
20181 if (Cost >= -SLPCostThreshold) {
20182 V.getORE()->emit([&]() {
20183 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20184 ReducedValsToOps.at(VL[0]).front())
20185 << "Vectorizing horizontal reduction is possible "
20186 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20187 << " and threshold "
20188 << ore::NV("Threshold", -SLPCostThreshold);
20189 });
20190 if (!AdjustReducedVals()) {
20191 V.analyzedReductionVals(VL);
20192 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20193 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20194 // Add subvectors of VL to the list of the analyzed values.
20195 for (unsigned VF = getFloorFullVectorNumberOfElements(
20196 *TTI, VL.front()->getType(), ReduxWidth - 1);
20197 VF >= ReductionLimit;
20198 VF = getFloorFullVectorNumberOfElements(
20199 *TTI, VL.front()->getType(), VF - 1)) {
20200 if (has_single_bit(VF) &&
20201 V.getCanonicalGraphSize() != V.getTreeSize())
20202 continue;
20203 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20204 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20205 }
20206 }
20207 }
20208 continue;
20209 }
20210
20211 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20212 << Cost << ". (HorRdx)\n");
20213 V.getORE()->emit([&]() {
20214 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20215 ReducedValsToOps.at(VL[0]).front())
20216 << "Vectorized horizontal reduction with cost "
20217 << ore::NV("Cost", Cost) << " and with tree size "
20218 << ore::NV("TreeSize", V.getTreeSize());
20219 });
20220
20221 Builder.setFastMathFlags(RdxFMF);
20222
20223 // Emit a reduction. If the root is a select (min/max idiom), the insert
20224 // point is the compare condition of that select.
20225 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20226 Instruction *InsertPt = RdxRootInst;
20227 if (IsCmpSelMinMax)
20228 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20229
20230 // Vectorize a tree.
20231 Value *VectorizedRoot =
20232 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20233 // Update TrackedToOrig mapping, since the tracked values might be
20234 // updated.
20235 for (Value *RdxVal : Candidates) {
20236 Value *OrigVal = TrackedToOrig.at(RdxVal);
20237 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20238 if (TransformedRdxVal != RdxVal)
20239 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20240 }
20241
20242 Builder.SetInsertPoint(InsertPt);
20243
20244 // To prevent poison from leaking across what used to be sequential,
20245 // safe, scalar boolean logic operations, the reduction operand must be
20246 // frozen.
20247 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20248 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20249
20250 // Emit code to correctly handle reused reduced values, if required.
20251 if (OptReusedScalars && !SameScaleFactor) {
20252 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20253 SameValuesCounter, TrackedToOrig);
20254 }
20255
20256 Value *ReducedSubTree;
20257 Type *ScalarTy = VL.front()->getType();
20258 if (isa<FixedVectorType>(ScalarTy)) {
20259 assert(SLPReVec && "FixedVectorType is not expected.");
20260 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20261 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20262 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20263 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20264 // Do reduction for each lane.
20265 // e.g., do reduce add for
20266 // VL[0] = <4 x Ty> <a, b, c, d>
20267 // VL[1] = <4 x Ty> <e, f, g, h>
20268 // Lane[0] = <2 x Ty> <a, e>
20269 // Lane[1] = <2 x Ty> <b, f>
20270 // Lane[2] = <2 x Ty> <c, g>
20271 // Lane[3] = <2 x Ty> <d, h>
20272 // result[0] = reduce add Lane[0]
20273 // result[1] = reduce add Lane[1]
20274 // result[2] = reduce add Lane[2]
20275 // result[3] = reduce add Lane[3]
20276 SmallVector<int, 16> Mask =
20277 createStrideMask(I, ScalarTyNumElements, VL.size());
20278 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20279 ReducedSubTree = Builder.CreateInsertElement(
20280 ReducedSubTree,
20281 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20282 }
20283 } else {
20284 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20285 RdxRootInst->getType());
20286 }
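// Illustrative sketch (hypothetical helper, not from the original file): the
// stride mask used above for lane I selects element I of every concatenated
// input vector, i.e. indices I, I + NumEls, I + 2 * NumEls, ...
//   std::vector<int> strideMaskSketch(unsigned Lane, unsigned NumEls,
//                                     unsigned NumVecs) {
//     std::vector<int> Mask;
//     for (unsigned J = 0; J < NumVecs; ++J)
//       Mask.push_back(Lane + J * NumEls); // lane `Lane` of input vector J
//     return Mask;
//   }
// For VL = {<a,b,c,d>, <e,f,g,h>} this gives {0, 4} for lane 0, producing the
// <a, e> sub-reduction shown in the comment above.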
20287 if (ReducedSubTree->getType() != VL.front()->getType()) {
20288 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20289 "Expected different reduction type.");
20290 ReducedSubTree =
20291 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20292 V.isSignedMinBitwidthRootNode());
20293 }
20294
20295 // Improved analysis for add/fadd/xor reductions with same scale factor
20296 // for all operands of reductions. We can emit scalar ops for them
20297 // instead.
20298 if (OptReusedScalars && SameScaleFactor)
20299 ReducedSubTree = emitScaleForReusedOps(
20300 ReducedSubTree, Builder, SameValuesCounter.front().second);
20301
20302 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20303 // Count vectorized reduced values to exclude them from final reduction.
20304 for (Value *RdxVal : VL) {
20305 Value *OrigV = TrackedToOrig.at(RdxVal);
20306 if (IsSupportedHorRdxIdentityOp) {
20307 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20308 continue;
20309 }
20310 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20311 if (!V.isVectorized(RdxVal))
20312 RequiredExtract.insert(RdxVal);
20313 }
20314 Pos += ReduxWidth;
20315 Start = Pos;
20316 ReduxWidth = NumReducedVals - Pos;
20317 if (ReduxWidth > 1)
20318 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20319 AnyVectorized = true;
20320 }
20321 if (OptReusedScalars && !AnyVectorized) {
20322 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20323 Value *RdxVal = TrackedVals.at(P.first);
20324 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20325 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20326 VectorizedVals.try_emplace(P.first, P.second);
20327 }
20328 continue;
20329 }
20330 }
20331 if (VectorizedTree) {
20332 // Reorder operands of bool logical op in the natural order to avoid
20333 // possible problems with poison propagation. If not possible to reorder
20334 // (both operands are originally RHS), emit an extra freeze instruction
20335 // for the LHS operand.
20336 // I.e., if we have original code like this:
20337 // RedOp1 = select i1 ?, i1 LHS, i1 false
20338 // RedOp2 = select i1 RHS, i1 ?, i1 false
20339
20340 // Then, we swap LHS/RHS to create a new op that matches the poison
20341 // semantics of the original code.
20342
20343 // If we have original code like this and both values could be poison:
20344 // RedOp1 = select i1 ?, i1 LHS, i1 false
20345 // RedOp2 = select i1 ?, i1 RHS, i1 false
20346
20347 // Then, we must freeze LHS in the new op.
20348 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20349 Instruction *RedOp1,
20350 Instruction *RedOp2,
20351 bool InitStep) {
20352 if (!AnyBoolLogicOp)
20353 return;
20354 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20355 getRdxOperand(RedOp1, 0) == LHS ||
20356 isGuaranteedNotToBePoison(LHS, AC)))
20357 return;
20358 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20359 getRdxOperand(RedOp2, 0) == RHS ||
20360 isGuaranteedNotToBePoison(RHS, AC))) {
20361 std::swap(LHS, RHS);
20362 return;
20363 }
20364 if (LHS != VectorizedTree)
20365 LHS = Builder.CreateFreeze(LHS);
20366 };
20367 // Finish the reduction.
20368 // Need to add the extra arguments and the possible reduction values that
20369 // were not vectorized.
20370 // Try to avoid dependencies between the scalar remainders after
20371 // reductions.
20372 auto FinalGen =
20373 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20374 bool InitStep) {
20375 unsigned Sz = InstVals.size();
20376 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20377 Sz % 2);
20378 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20379 Instruction *RedOp = InstVals[I + 1].first;
20380 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20381 Value *RdxVal1 = InstVals[I].second;
20382 Value *StableRdxVal1 = RdxVal1;
20383 auto It1 = TrackedVals.find(RdxVal1);
20384 if (It1 != TrackedVals.end())
20385 StableRdxVal1 = It1->second;
20386 Value *RdxVal2 = InstVals[I + 1].second;
20387 Value *StableRdxVal2 = RdxVal2;
20388 auto It2 = TrackedVals.find(RdxVal2);
20389 if (It2 != TrackedVals.end())
20390 StableRdxVal2 = It2->second;
20391 // To prevent poison from leaking across what used to be
20392 // sequential, safe, scalar boolean logic operations, the
20393 // reduction operand must be frozen.
20394 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20395 RedOp, InitStep);
20396 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20397 StableRdxVal2, "op.rdx", ReductionOps);
20398 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20399 }
20400 if (Sz % 2 == 1)
20401 ExtraReds[Sz / 2] = InstVals.back();
20402 return ExtraReds;
20403 };
20404 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20405 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20406 VectorizedTree);
20407 SmallPtrSet<Value *, 8> Visited;
20408 for (ArrayRef<Value *> Candidates : ReducedVals) {
20409 for (Value *RdxVal : Candidates) {
20410 if (!Visited.insert(RdxVal).second)
20411 continue;
20412 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20413 for (Instruction *RedOp :
20414 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20415 ExtraReductions.emplace_back(RedOp, RdxVal);
20416 }
20417 }
20418 // Iterate through all non-vectorized reduction values/extra arguments.
20419 bool InitStep = true;
20420 while (ExtraReductions.size() > 1) {
20421 SmallVector<std::pair<Instruction *, Value *>> NewReds =
20422 FinalGen(ExtraReductions, InitStep);
20423 ExtraReductions.swap(NewReds);
20424 InitStep = false;
20425 }
20426 VectorizedTree = ExtraReductions.front().second;
20427
20428 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20429
20430 // The original scalar reduction is expected to have no remaining
20431 // uses outside the reduction tree itself. Assert that we got this
20432 // correct, replace internal uses with poison, and mark for eventual
20433 // deletion.
20434#ifndef NDEBUG
20435 SmallSet<Value *, 4> IgnoreSet;
20436 for (ArrayRef<Value *> RdxOps : ReductionOps)
20437 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20438#endif
20439 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20440 for (Value *Ignore : RdxOps) {
20441 if (!Ignore)
20442 continue;
20443#ifndef NDEBUG
20444 for (auto *U : Ignore->users()) {
20445 assert(IgnoreSet.count(U) &&
20446 "All users must be either in the reduction ops list.");
20447 }
20448#endif
20449 if (!Ignore->use_empty()) {
20450 Value *P = PoisonValue::get(Ignore->getType());
20451 Ignore->replaceAllUsesWith(P);
20452 }
20453 }
20454 V.removeInstructionsAndOperands(RdxOps);
20455 }
20456 } else if (!CheckForReusedReductionOps) {
20457 for (ReductionOpsType &RdxOps : ReductionOps)
20458 for (Value *RdxOp : RdxOps)
20459 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20460 }
20461 return VectorizedTree;
20462 }
20463
20464private:
20465 /// Calculate the cost of a reduction.
20466 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20467 ArrayRef<Value *> ReducedVals,
20468 bool IsCmpSelMinMax, FastMathFlags FMF,
20469 const BoUpSLP &R) {
20470 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20471 Type *ScalarTy = ReducedVals.front()->getType();
20472 unsigned ReduxWidth = ReducedVals.size();
20473 FixedVectorType *VectorTy = R.getReductionType();
20474 InstructionCost VectorCost = 0, ScalarCost;
20475 // If all of the reduced values are constant, the vector cost is 0, since
20476 // the reduction value can be calculated at compile time.
20477 bool AllConsts = allConstant(ReducedVals);
20478 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20479 InstructionCost Cost = 0;
20480 // Scalar cost is repeated for N-1 elements.
20481 int Cnt = ReducedVals.size();
20482 for (Value *RdxVal : ReducedVals) {
20483 if (Cnt == 1)
20484 break;
20485 --Cnt;
20486 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20487 Cost += GenCostFn();
20488 continue;
20489 }
20490 InstructionCost ScalarCost = 0;
20491 for (User *U : RdxVal->users()) {
20492 auto *RdxOp = cast<Instruction>(U);
20493 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20494 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20495 continue;
20496 }
20497 ScalarCost = InstructionCost::getInvalid();
20498 break;
20499 }
20500 if (ScalarCost.isValid())
20501 Cost += ScalarCost;
20502 else
20503 Cost += GenCostFn();
20504 }
20505 return Cost;
20506 };
20507 switch (RdxKind) {
20508 case RecurKind::Add:
20509 case RecurKind::Mul:
20510 case RecurKind::Or:
20511 case RecurKind::And:
20512 case RecurKind::Xor:
20513 case RecurKind::FAdd:
20514 case RecurKind::FMul: {
20515 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20516 if (!AllConsts) {
20517 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20518 assert(SLPReVec && "FixedVectorType is not expected.");
20519 unsigned ScalarTyNumElements = VecTy->getNumElements();
20520 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20521 VectorCost += TTI->getShuffleCost(
20522 TTI::SK_PermuteSingleSrc, VectorTy,
20523 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20524 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20525 CostKind);
20526 }
20527 VectorCost += TTI->getScalarizationOverhead(
20528 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20529 /*Extract*/ false, TTI::TCK_RecipThroughput);
20530 } else {
20531 Type *RedTy = VectorTy->getElementType();
20532 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20533 std::make_pair(RedTy, true));
20534 if (RType == RedTy) {
20535 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20536 FMF, CostKind);
20537 } else {
20538 VectorCost = TTI->getExtendedReductionCost(
20539 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20540 FMF, CostKind);
20541 }
20542 }
20543 }
20544 ScalarCost = EvaluateScalarCost([&]() {
20545 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20546 });
20547 break;
20548 }
20549 case RecurKind::FMax:
20550 case RecurKind::FMin:
20551 case RecurKind::FMaximum:
20552 case RecurKind::FMinimum:
20553 case RecurKind::SMax:
20554 case RecurKind::SMin:
20555 case RecurKind::UMax:
20556 case RecurKind::UMin: {
20557 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
20558 if (!AllConsts)
20559 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20560 ScalarCost = EvaluateScalarCost([&]() {
20561 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20562 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20563 });
20564 break;
20565 }
20566 default:
20567 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20568 }
20569
20570 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20571 << " for reduction of " << shortBundleName(ReducedVals)
20572 << " (It is a splitting reduction)\n");
20573 return VectorCost - ScalarCost;
20574 }
20575
20576 /// Emit a horizontal reduction of the vectorized value.
20577 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20578 const TargetTransformInfo *TTI, Type *DestTy) {
20579 assert(VectorizedValue && "Need to have a vectorized tree node");
20580 assert(RdxKind != RecurKind::FMulAdd &&
20581 "A call to the llvm.fmuladd intrinsic is not handled yet");
20582
20583 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20584 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20585 RdxKind == RecurKind::Add &&
20586 DestTy->getScalarType() != FTy->getScalarType()) {
20587 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20588 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
20589 Value *V = Builder.CreateBitCast(
20590 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20591 ++NumVectorInstructions;
20592 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20593 }
20594 ++NumVectorInstructions;
20595 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20596 }
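// Illustrative sketch (not from the original file) of the i1 add special
// case above, shown roughly as the corresponding IRBuilder calls, assuming
// an 8-element mask and an i32 destination type:
//   Value *Bits = Builder.CreateBitCast(Mask, Builder.getIntNTy(8));
//   Value *Pop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, Bits);
//   Value *Res = Builder.CreateZExtOrTrunc(Pop, Builder.getInt32Ty());
// i.e. the widened vector add reduction becomes a population count of the
// mask bits; the final cast is performed by the caller when the reduction
// type differs.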
20597
20598 /// Emits optimized code for a unique scalar value reused \p Cnt times.
20599 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20600 unsigned Cnt) {
20601 assert(IsSupportedHorRdxIdentityOp &&
20602 "The optimization of matched scalar identity horizontal reductions "
20603 "must be supported.");
20604 if (Cnt == 1)
20605 return VectorizedValue;
20606 switch (RdxKind) {
20607 case RecurKind::Add: {
20608 // res = mul vv, n
20609 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20610 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20611 << VectorizedValue << ". (HorRdx)\n");
20612 return Builder.CreateMul(VectorizedValue, Scale);
20613 }
20614 case RecurKind::Xor: {
20615 // res = n % 2 ? 0 : vv
20616 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20617 << ". (HorRdx)\n");
20618 if (Cnt % 2 == 0)
20619 return Constant::getNullValue(VectorizedValue->getType());
20620 return VectorizedValue;
20621 }
20622 case RecurKind::FAdd: {
20623 // res = fmul v, n
20624 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20625 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20626 << VectorizedValue << ". (HorRdx)\n");
20627 return Builder.CreateFMul(VectorizedValue, Scale);
20628 }
20629 case RecurKind::And:
20630 case RecurKind::Or:
20631 case RecurKind::SMax:
20632 case RecurKind::SMin:
20633 case RecurKind::UMax:
20634 case RecurKind::UMin:
20635 case RecurKind::FMax:
20636 case RecurKind::FMin:
20637 case RecurKind::FMaximum:
20638 case RecurKind::FMinimum:
20639 // res = vv
20640 return VectorizedValue;
20641 case RecurKind::Mul:
20642 case RecurKind::FMul:
20643 case RecurKind::FMulAdd:
20644 case RecurKind::IAnyOf:
20645 case RecurKind::FAnyOf:
20646 case RecurKind::IFindLastIV:
20647 case RecurKind::FFindLastIV:
20648 case RecurKind::None:
20649 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20650 }
20651 return nullptr;
20652 }
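// Scalar sketch (illustrative, not from the original file) of the identities
// used above: Cnt repeats of one value collapse to a scale or a parity check.
//   int scaleReusedScalar(int V, unsigned Cnt, bool IsXor) {
//     if (IsXor)
//       return (Cnt % 2 == 0) ? 0 : V; // x ^ x == 0
//     return V * static_cast<int>(Cnt); // x + x + ... + x == Cnt * x
//   }
// min/max and and/or of a repeated value are the value itself, which is why
// those cases simply return VectorizedValue.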
20653
20654 /// Emits the actual operation for the scalar identity values found during
20655 /// horizontal reduction analysis.
20656 Value *
20657 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20658 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20659 const DenseMap<Value *, Value *> &TrackedToOrig) {
20660 assert(IsSupportedHorRdxIdentityOp &&
20661 "The optimization of matched scalar identity horizontal reductions "
20662 "must be supported.");
20663 ArrayRef<Value *> VL = R.getRootNodeScalars();
20664 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20665 if (VTy->getElementType() != VL.front()->getType()) {
20666 VectorizedValue = Builder.CreateIntCast(
20667 VectorizedValue,
20668 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20669 R.isSignedMinBitwidthRootNode());
20670 }
20671 switch (RdxKind) {
20672 case RecurKind::Add: {
20673 // root = mul prev_root, <1, 1, n, 1>
20674 SmallVector<Constant *> Vals;
20675 for (Value *V : VL) {
20676 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20677 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20678 }
20679 auto *Scale = ConstantVector::get(Vals);
20680 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20681 << VectorizedValue << ". (HorRdx)\n");
20682 return Builder.CreateMul(VectorizedValue, Scale);
20683 }
20684 case RecurKind::And:
20685 case RecurKind::Or:
20686 // No need for multiple or/and(s).
20687 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20688 << ". (HorRdx)\n");
20689 return VectorizedValue;
20690 case RecurKind::SMax:
20691 case RecurKind::SMin:
20692 case RecurKind::UMax:
20693 case RecurKind::UMin:
20694 case RecurKind::FMax:
20695 case RecurKind::FMin:
20696 case RecurKind::FMaximum:
20697 case RecurKind::FMinimum:
20698 // No need for multiple min/max(s) of the same value.
20699 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20700 << ". (HorRdx)\n");
20701 return VectorizedValue;
20702 case RecurKind::Xor: {
20703 // Replace values with an even number of repeats with 0, since
20704 // x xor x = 0.
20705 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20706 // 7>, if the 4th and 6th elements have an even number of repeats.
20707 SmallVector<int> Mask(
20708 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20709 PoisonMaskElem);
20710 std::iota(Mask.begin(), Mask.end(), 0);
20711 bool NeedShuffle = false;
20712 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20713 Value *V = VL[I];
20714 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20715 if (Cnt % 2 == 0) {
20716 Mask[I] = VF;
20717 NeedShuffle = true;
20718 }
20719 }
20720 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20721 : Mask) dbgs()
20722 << I << " ";
20723 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20724 if (NeedShuffle)
20725 VectorizedValue = Builder.CreateShuffleVector(
20726 VectorizedValue,
20727 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20728 return VectorizedValue;
20729 }
20730 case RecurKind::FAdd: {
20731 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20732 SmallVector<Constant *> Vals;
20733 for (Value *V : VL) {
20734 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20735 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20736 }
20737 auto *Scale = ConstantVector::get(Vals);
20738 return Builder.CreateFMul(VectorizedValue, Scale);
20739 }
20740 case RecurKind::Mul:
20741 case RecurKind::FMul:
20742 case RecurKind::FMulAdd:
20743 case RecurKind::IAnyOf:
20744 case RecurKind::FAnyOf:
20745 case RecurKind::IFindLastIV:
20746 case RecurKind::FFindLastIV:
20747 case RecurKind::None:
20748 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20749 }
20750 return nullptr;
20751 }
20752};
20753} // end anonymous namespace
20754
20755 /// Gets recurrence kind from the specified value.
20756 static RecurKind getRdxKind(Value *V) {
20757 return HorizontalReduction::getRdxKind(V);
20758}
20759static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20760 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20761 return cast<FixedVectorType>(IE->getType())->getNumElements();
20762
20763 unsigned AggregateSize = 1;
20764 auto *IV = cast<InsertValueInst>(InsertInst);
20765 Type *CurrentType = IV->getType();
20766 do {
20767 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20768 for (auto *Elt : ST->elements())
20769 if (Elt != ST->getElementType(0)) // check homogeneity
20770 return std::nullopt;
20771 AggregateSize *= ST->getNumElements();
20772 CurrentType = ST->getElementType(0);
20773 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20774 AggregateSize *= AT->getNumElements();
20775 CurrentType = AT->getElementType();
20776 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20777 AggregateSize *= VT->getNumElements();
20778 return AggregateSize;
20779 } else if (CurrentType->isSingleValueType()) {
20780 return AggregateSize;
20781 } else {
20782 return std::nullopt;
20783 }
20784 } while (true);
20785}
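// Worked example (illustrative): for an aggregate of type
// [2 x {float, float}], the loop above multiplies 2 (array) by 2
// (homogeneous struct) and stops at the single-value float element, so the
// result is 4; a mixed struct such as {float, i32} fails the homogeneity
// check and produces std::nullopt.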
20786
20787 static void findBuildAggregate_rec(Instruction *LastInsertInst,
20788 TargetTransformInfo *TTI,
20789 SmallVectorImpl<Value *> &BuildVectorOpds,
20790 SmallVectorImpl<Value *> &InsertElts,
20791 unsigned OperandOffset, const BoUpSLP &R) {
20792 do {
20793 Value *InsertedOperand = LastInsertInst->getOperand(1);
20794 std::optional<unsigned> OperandIndex =
20795 getElementIndex(LastInsertInst, OperandOffset);
20796 if (!OperandIndex || R.isDeleted(LastInsertInst))
20797 return;
20798 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20799 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20800 BuildVectorOpds, InsertElts, *OperandIndex, R);
20801
20802 } else {
20803 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20804 InsertElts[*OperandIndex] = LastInsertInst;
20805 }
20806 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20807 } while (LastInsertInst != nullptr &&
20808 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20809 LastInsertInst->hasOneUse());
20810}
20811
20812/// Recognize construction of vectors like
20813/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20814/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20815/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20816/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20817/// starting from the last insertelement or insertvalue instruction.
20818///
20819/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20820/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20821/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20822///
20823/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20824///
20825/// \return true if it matches.
20826 static bool findBuildAggregate(Instruction *LastInsertInst,
20827 TargetTransformInfo *TTI,
20828 SmallVectorImpl<Value *> &BuildVectorOpds,
20829 SmallVectorImpl<Value *> &InsertElts,
20830 const BoUpSLP &R) {
20831
20832 assert((isa<InsertElementInst>(LastInsertInst) ||
20833 isa<InsertValueInst>(LastInsertInst)) &&
20834 "Expected insertelement or insertvalue instruction!");
20835
20836 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20837 "Expected empty result vectors!");
20838
20839 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20840 if (!AggregateSize)
20841 return false;
20842 BuildVectorOpds.resize(*AggregateSize);
20843 InsertElts.resize(*AggregateSize);
20844
20845 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20846 R);
20847 llvm::erase(BuildVectorOpds, nullptr);
20848 llvm::erase(InsertElts, nullptr);
20849 if (BuildVectorOpds.size() >= 2)
20850 return true;
20851
20852 return false;
20853}
20854
20855/// Try and get a reduction instruction from a phi node.
20856///
20857/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20858/// if they come from either \p ParentBB or a containing loop latch.
20859///
20860/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20861 /// if not possible.
20862 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20863 BasicBlock *ParentBB, LoopInfo *LI) {
20864 // There are situations where the reduction value is not dominated by the
20865 // reduction phi. Vectorizing such cases has been reported to cause
20866 // miscompiles. See PR25787.
20867 auto DominatedReduxValue = [&](Value *R) {
20868 return isa<Instruction>(R) &&
20869 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20870 };
20871
20872 Instruction *Rdx = nullptr;
20873
20874 // Return the incoming value if it comes from the same BB as the phi node.
20875 if (P->getIncomingBlock(0) == ParentBB) {
20876 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20877 } else if (P->getIncomingBlock(1) == ParentBB) {
20878 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20879 }
20880
20881 if (Rdx && DominatedReduxValue(Rdx))
20882 return Rdx;
20883
20884 // Otherwise, check whether we have a loop latch to look at.
20885 Loop *BBL = LI->getLoopFor(ParentBB);
20886 if (!BBL)
20887 return nullptr;
20888 BasicBlock *BBLatch = BBL->getLoopLatch();
20889 if (!BBLatch)
20890 return nullptr;
20891
20892 // There is a loop latch, return the incoming value if it comes from
20893 // that. This reduction pattern occasionally turns up.
20894 if (P->getIncomingBlock(0) == BBLatch) {
20895 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20896 } else if (P->getIncomingBlock(1) == BBLatch) {
20897 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20898 }
20899
20900 if (Rdx && DominatedReduxValue(Rdx))
20901 return Rdx;
20902
20903 return nullptr;
20904}
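// Editorial example (illustrative IR, hedged): a typical reduction phi this
// helper matches, where the loop latch feeds the updated value back:
//   loop:
//     %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
//     %x = load float, ptr %p
//     %sum.next = fadd fast float %sum, %x
//     br i1 %cond, label %loop, label %exit
// Here %sum.next is returned as the candidate reduction value; the dominance
// check above rejects values that are not dominated by the phi's block.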
20905
20906static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20907 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20908 return true;
20909 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20910 return true;
20911 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20912 return true;
20913 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20914 return true;
20915 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20916 return true;
20917 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20918 return true;
20919 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20920 return true;
20921 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20922 return true;
20923 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20924 return true;
20925 return false;
20926}
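// Editorial example (illustrative, hedged): matchRdxBop extracts the two
// operands of either a plain binary operator or one of the min/max reduction
// intrinsics, e.g. for
//   %m = call float @llvm.maxnum.f32(float %a, float %b)
// it returns true with V0 = %a and V1 = %b.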
20927
20928/// We could have an initial reduction that is not an add.
20929/// r *= v1 + v2 + v3 + v4
20930/// In such a case start looking for a tree rooted in the first '+'.
20931/// \Returns the new root if found, which may be nullptr if not an instruction.
20932static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20933 Instruction *Root) {
20934 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20935 isa<IntrinsicInst>(Root)) &&
20936 "Expected binop, select, or intrinsic for reduction matching");
20937 Value *LHS =
20938 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20939 Value *RHS =
20940 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20941 if (LHS == Phi)
20942 return dyn_cast<Instruction>(RHS);
20943 if (RHS == Phi)
20944 return dyn_cast<Instruction>(LHS);
20945 return nullptr;
20946}
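// Editorial example (illustrative, hedged): for the 'r *= v1 + v2 + v3 + v4'
// pattern mentioned above, with %r being the reduction phi:
//   %add1   = fadd fast float %v1, %v2
//   %add2   = fadd fast float %add1, %v3
//   %add3   = fadd fast float %add2, %v4
//   %r.next = fmul fast float %r, %add3
// Called with Phi = %r and Root = %r.next, the helper returns %add3, so that
// reduction matching restarts at the final '+' rather than at the multiply.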
20947
20948/// \Returns the first operand of \p I that does not match \p Phi. If the
20949/// operand is not an instruction, it returns nullptr.
20950static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
20951 Value *Op0 = nullptr;
20952 Value *Op1 = nullptr;
20953 if (!matchRdxBop(I, Op0, Op1))
20954 return nullptr;
20955 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
20956}
20957
20958/// \Returns true if \p I is a candidate instruction for reduction vectorization.
20959static bool isReductionCandidate(Instruction *I) {
20960 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
20961 Value *B0 = nullptr, *B1 = nullptr;
20962 bool IsBinop = matchRdxBop(I, B0, B1);
20963 return IsBinop || IsSelect;
20964}
20965
20966bool SLPVectorizerPass::vectorizeHorReduction(
20967 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
20968 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
20969 if (!ShouldVectorizeHor)
20970 return false;
20971 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
20972
20973 if (Root->getParent() != BB || isa<PHINode>(Root))
20974 return false;
20975
20976 // If we can find a secondary reduction root, use that instead.
20977 auto SelectRoot = [&]() {
20978 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
20979 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
20980 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
20981 return NewRoot;
20982 return Root;
20983 };
20984
20985 // Start the analysis from the Root instruction. If a horizontal reduction is
20986 // found, try to vectorize it. If it is not a horizontal reduction, or
20987 // vectorization is not possible or not effective, and the currently analyzed
20988 // instruction is a binary operation, try to vectorize the operands, using
20989 // pre-order DFS traversal order. If the operands were not vectorized, repeat
20990 // the same procedure considering each operand as a possible root of the
20991 // horizontal reduction.
20992 // Interrupt the process if the Root instruction itself was vectorized or all
20993 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
20994 // If a horizontal reduction was not matched or vectorized, we collect
20995 // instructions for possible later vectorization attempts.
20996 std::queue<std::pair<Instruction *, unsigned>> Stack;
20997 Stack.emplace(SelectRoot(), 0);
20998 SmallPtrSet<Value *, 8> VisitedInstrs;
20999 bool Res = false;
21000 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21001 if (R.isAnalyzedReductionRoot(Inst))
21002 return nullptr;
21003 if (!isReductionCandidate(Inst))
21004 return nullptr;
21005 HorizontalReduction HorRdx;
21006 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21007 return nullptr;
21008 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21009 };
21010 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21011 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21012 FutureSeed = getNonPhiOperand(Root, P);
21013 if (!FutureSeed)
21014 return false;
21015 }
21016 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21017 // analysis is done separately.
21018 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21019 PostponedInsts.push_back(FutureSeed);
21020 return true;
21021 };
21022
21023 while (!Stack.empty()) {
21024 Instruction *Inst;
21025 unsigned Level;
21026 std::tie(Inst, Level) = Stack.front();
21027 Stack.pop();
21028 // Do not try to analyze an instruction that has already been vectorized.
21029 // This may happen when we vectorize instruction operands on a previous
21030 // iteration, while the stack was populated before that happened.
21031 if (R.isDeleted(Inst))
21032 continue;
21033 if (Value *VectorizedV = TryToReduce(Inst)) {
21034 Res = true;
21035 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21036 // Try to find another reduction.
21037 Stack.emplace(I, Level);
21038 continue;
21039 }
21040 if (R.isDeleted(Inst))
21041 continue;
21042 } else {
21043 // We could not vectorize `Inst` so try to use it as a future seed.
21044 if (!TryAppendToPostponedInsts(Inst)) {
21045 assert(Stack.empty() && "Expected empty stack");
21046 break;
21047 }
21048 }
21049
21050 // Try to vectorize operands.
21051 // Continue analysis for the instruction from the same basic block only to
21052 // save compile time.
21053 if (++Level < RecursionMaxDepth)
21054 for (auto *Op : Inst->operand_values())
21055 if (VisitedInstrs.insert(Op).second)
21056 if (auto *I = dyn_cast<Instruction>(Op))
21057 // Do not try to vectorize CmpInst operands, this is done
21058 // separately.
21059 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21060 !R.isDeleted(I) && I->getParent() == BB)
21061 Stack.emplace(I, Level);
21062 }
21063 return Res;
21064}
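// Editorial example (illustrative, hedged): given a scalar chain rooted at %s,
//   %t0 = fadd fast float %x0, %x1
//   %t1 = fadd fast float %t0, %x2
//   %s  = fadd fast float %t1, %x3
// a matched horizontal reduction is handled by TryToReduce in one step. If
// matching or cost modeling fails, %s is recorded in PostponedInsts for a
// later plain tryToVectorize attempt, and its operands are queued (up to
// RecursionMaxDepth levels) as possible new reduction roots.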
21065
21066bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21067 BasicBlock *BB, BoUpSLP &R) {
21068 SmallVector<WeakTrackingVH> PostponedInsts;
21069 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21070 Res |= tryToVectorize(PostponedInsts, R);
21071 return Res;
21072}
21073
21074bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21075 BoUpSLP &R) {
21076 bool Res = false;
21077 for (Value *V : Insts)
21078 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21079 Res |= tryToVectorize(Inst, R);
21080 return Res;
21081}
21082
21083bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21084 BasicBlock *BB, BoUpSLP &R,
21085 bool MaxVFOnly) {
21086 if (!R.canMapToVector(IVI->getType()))
21087 return false;
21088
21089 SmallVector<Value *, 16> BuildVectorOpds;
21090 SmallVector<Value *, 16> BuildVectorInsts;
21091 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21092 return false;
21093
21094 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21095 R.getORE()->emit([&]() {
21096 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21097 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21098 "trying reduction first.";
21099 });
21100 return false;
21101 }
21102 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21103 // Aggregate value is unlikely to be processed in vector register.
21104 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21105}
21106
21107bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21108 BasicBlock *BB, BoUpSLP &R,
21109 bool MaxVFOnly) {
21110 SmallVector<Value *, 16> BuildVectorInsts;
21111 SmallVector<Value *, 16> BuildVectorOpds;
21112 SmallVector<int> Mask;
21113 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21114 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21115 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21116 return false;
21117
21118 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21119 R.getORE()->emit([&]() {
21120 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21121 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21122 "trying reduction first.";
21123 });
21124 return false;
21125 }
21126 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21127 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21128}
21129
21130template <typename T>
21131static bool tryToVectorizeSequence(
21132 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21133 function_ref<bool(T *, T *)> AreCompatible,
21134 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21135 bool MaxVFOnly, BoUpSLP &R) {
21136 bool Changed = false;
21137 // Sort by type, parent, operands.
21138 stable_sort(Incoming, Comparator);
21139
21140 // Try to vectorize elements based on their type.
21141 SmallVector<T *> Candidates;
21142 SmallVector<T *> VL;
21143 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21144 VL.clear()) {
21145 // Look for the next elements with the same type, parent and operand
21146 // kinds.
21147 auto *I = dyn_cast<Instruction>(*IncIt);
21148 if (!I || R.isDeleted(I)) {
21149 ++IncIt;
21150 continue;
21151 }
21152 auto *SameTypeIt = IncIt;
21153 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21154 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21155 AreCompatible(*SameTypeIt, *IncIt))) {
21156 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21157 ++SameTypeIt;
21158 if (I && !R.isDeleted(I))
21159 VL.push_back(cast<T>(I));
21160 }
21161
21162 // Try to vectorize them.
21163 unsigned NumElts = VL.size();
21164 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21165 << NumElts << ")\n");
21166 // The vectorization is a 3-stage attempt:
21167 // 1. Try to vectorize instructions with the same/alternate opcodes, using the
21168 // maximal register size first.
21169 // 2. Try to vectorize the remaining instructions with the same type, if
21170 // possible. This may give better vectorization results than trying to
21171 // vectorize only the instructions with the same/alternate opcodes.
21172 // 3. Make a final attempt to vectorize all instructions with the
21173 // same/alternate opcodes only; this may result in some extra final
21174 // vectorization.
21175 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21176 // Success; start over because instructions might have been changed.
21177 Changed = true;
21178 VL.swap(Candidates);
21179 Candidates.clear();
21180 for (T *V : VL) {
21181 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21182 Candidates.push_back(V);
21183 }
21184 } else {
21185 /// \Returns the minimum number of elements that we will attempt to
21186 /// vectorize.
21187 auto GetMinNumElements = [&R](Value *V) {
21188 unsigned EltSize = R.getVectorElementSize(V);
21189 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21190 };
21191 if (NumElts < GetMinNumElements(*IncIt) &&
21192 (Candidates.empty() ||
21193 Candidates.front()->getType() == (*IncIt)->getType())) {
21194 for (T *V : VL) {
21195 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21196 Candidates.push_back(V);
21197 }
21198 }
21199 }
21200 // Final attempt to vectorize instructions with the same types.
21201 if (Candidates.size() > 1 &&
21202 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21203 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21204 // Success; start over because instructions might have been changed.
21205 Changed = true;
21206 } else if (MaxVFOnly) {
21207 // Try to vectorize using small vectors.
21208 SmallVector<T *> VL;
21209 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21210 VL.clear()) {
21211 auto *I = dyn_cast<Instruction>(*It);
21212 if (!I || R.isDeleted(I)) {
21213 ++It;
21214 continue;
21215 }
21216 auto *SameTypeIt = It;
21217 while (SameTypeIt != End &&
21218 (!isa<Instruction>(*SameTypeIt) ||
21219 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21220 AreCompatible(*SameTypeIt, *It))) {
21221 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21222 ++SameTypeIt;
21223 if (I && !R.isDeleted(I))
21224 VL.push_back(cast<T>(I));
21225 }
21226 unsigned NumElts = VL.size();
21227 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21228 /*MaxVFOnly=*/false))
21229 Changed = true;
21230 It = SameTypeIt;
21231 }
21232 }
21233 Candidates.clear();
21234 }
21235
21236 // Start over at the next instruction of a different type (or the end).
21237 IncIt = SameTypeIt;
21238 }
21239 return Changed;
21240}
21241
21242/// Compare two cmp instructions. If IsCompatibility is true, the function
21243/// returns true if the 2 cmps have the same/swapped predicates and compatible
21244/// corresponding operands. If IsCompatibility is false, the function implements
21245/// a strict weak ordering relation between two cmp instructions, returning true
21246/// if the first instruction is "less" than the second, i.e. its predicate is
21247/// less than the predicate of the second or the operand IDs are less than the
21248/// operand IDs of the second cmp instruction.
21249template <bool IsCompatibility>
21250static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21251 const DominatorTree &DT) {
21252 assert(isValidElementType(V->getType()) &&
21253 isValidElementType(V2->getType()) &&
21254 "Expected valid element types only.");
21255 if (V == V2)
21256 return IsCompatibility;
21257 auto *CI1 = cast<CmpInst>(V);
21258 auto *CI2 = cast<CmpInst>(V2);
21259 if (CI1->getOperand(0)->getType()->getTypeID() <
21260 CI2->getOperand(0)->getType()->getTypeID())
21261 return !IsCompatibility;
21262 if (CI1->getOperand(0)->getType()->getTypeID() >
21263 CI2->getOperand(0)->getType()->getTypeID())
21264 return false;
21265 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21266 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21267 return !IsCompatibility;
21268 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21269 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21270 return false;
21271 CmpInst::Predicate Pred1 = CI1->getPredicate();
21272 CmpInst::Predicate Pred2 = CI2->getPredicate();
21273 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21274 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21275 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21276 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21277 if (BasePred1 < BasePred2)
21278 return !IsCompatibility;
21279 if (BasePred1 > BasePred2)
21280 return false;
21281 // Compare operands.
21282 bool CI1Preds = Pred1 == BasePred1;
21283 bool CI2Preds = Pred2 == BasePred1;
21284 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21285 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21286 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21287 if (Op1 == Op2)
21288 continue;
21289 if (Op1->getValueID() < Op2->getValueID())
21290 return !IsCompatibility;
21291 if (Op1->getValueID() > Op2->getValueID())
21292 return false;
21293 if (auto *I1 = dyn_cast<Instruction>(Op1))
21294 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21295 if (IsCompatibility) {
21296 if (I1->getParent() != I2->getParent())
21297 return false;
21298 } else {
21299 // Try to compare nodes with same parent.
21300 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21301 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21302 if (!NodeI1)
21303 return NodeI2 != nullptr;
21304 if (!NodeI2)
21305 return false;
21306 assert((NodeI1 == NodeI2) ==
21307 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21308 "Different nodes should have different DFS numbers");
21309 if (NodeI1 != NodeI2)
21310 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21311 }
21312 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21313 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
21314 continue;
21315 if (IsCompatibility)
21316 return false;
21317 if (I1->getOpcode() != I2->getOpcode())
21318 return I1->getOpcode() < I2->getOpcode();
21319 }
21320 }
21321 return IsCompatibility;
21322}
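// Editorial example (illustrative, hedged): given
//   %c1 = icmp slt i32 %a, %b
//   %c2 = icmp sgt i32 %b, %a
// both compares reduce to the same base predicate (the smaller of the
// predicate and its swapped form), and their operands match once one side is
// read in swapped order, so compareCmp<true> treats them as compatible, while
// compareCmp<false> keeps them adjacent in the sorted sequence.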
21323
21324template <typename ItT>
21325bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21326 BasicBlock *BB, BoUpSLP &R) {
21327 bool Changed = false;
21328 // Try to find reductions first.
21329 for (CmpInst *I : CmpInsts) {
21330 if (R.isDeleted(I))
21331 continue;
21332 for (Value *Op : I->operands())
21333 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21334 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21335 if (R.isDeleted(I))
21336 break;
21337 }
21338 }
21339 // Try to vectorize operands as vector bundles.
21340 for (CmpInst *I : CmpInsts) {
21341 if (R.isDeleted(I))
21342 continue;
21343 Changed |= tryToVectorize(I, R);
21344 }
21345 // Try to vectorize list of compares.
21346 // Sort by type, compare predicate, etc.
21347 auto CompareSorter = [&](Value *V, Value *V2) {
21348 if (V == V2)
21349 return false;
21350 return compareCmp<false>(V, V2, *TLI, *DT);
21351 };
21352
21353 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21354 if (V1 == V2)
21355 return true;
21356 return compareCmp<true>(V1, V2, *TLI, *DT);
21357 };
21358
21359 SmallVector<Value *> Vals;
21360 for (Instruction *V : CmpInsts)
21361 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21362 Vals.push_back(V);
21363 if (Vals.size() <= 1)
21364 return Changed;
21365 Changed |= tryToVectorizeSequence<Value>(
21366 Vals, CompareSorter, AreCompatibleCompares,
21367 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21368 // Exclude possible reductions from other blocks.
21369 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21370 return any_of(V->users(), [V](User *U) {
21371 auto *Select = dyn_cast<SelectInst>(U);
21372 return Select &&
21373 Select->getParent() != cast<Instruction>(V)->getParent();
21374 });
21375 });
21376 if (ArePossiblyReducedInOtherBlock)
21377 return false;
21378 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21379 },
21380 /*MaxVFOnly=*/true, R);
21381 return Changed;
21382}
21383
21384bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21385 BasicBlock *BB, BoUpSLP &R) {
21386 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21387 "This function only accepts Insert instructions");
21388 bool OpsChanged = false;
21389 SmallVector<WeakTrackingVH> PostponedInsts;
21390 for (auto *I : reverse(Instructions)) {
21391 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21392 if (R.isDeleted(I) || isa<CmpInst>(I))
21393 continue;
21394 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21395 OpsChanged |=
21396 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21397 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21398 OpsChanged |=
21399 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21400 }
21401 // pass2 - try to vectorize reductions only
21402 if (R.isDeleted(I))
21403 continue;
21404 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21405 if (R.isDeleted(I) || isa<CmpInst>(I))
21406 continue;
21407 // pass3 - try to match and vectorize a buildvector sequence.
21408 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21409 OpsChanged |=
21410 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21411 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21412 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21413 /*MaxVFOnly=*/false);
21414 }
21415 }
21416 // Now try to vectorize postponed instructions.
21417 OpsChanged |= tryToVectorize(PostponedInsts, R);
21418
21419 Instructions.clear();
21420 return OpsChanged;
21421}
21422
21423bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21424 bool Changed = false;
21425 SmallVector<Value *, 4> Incoming;
21426 SmallPtrSet<Value *, 16> VisitedInstrs;
21427 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21428 // node. This allows us to better identify the chains that can be
21429 // vectorized.
21430 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21431 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21432 assert(isValidElementType(V1->getType()) &&
21433 isValidElementType(V2->getType()) &&
21434 "Expected vectorizable types only.");
21435 // It is fine to compare type IDs here, since we expect only vectorizable
21436 // types, like ints, floats and pointers; we don't care about other types.
21437 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21438 return true;
21439 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21440 return false;
21441 if (V1->getType()->getScalarSizeInBits() <
21442 V2->getType()->getScalarSizeInBits())
21443 return true;
21444 if (V1->getType()->getScalarSizeInBits() >
21445 V2->getType()->getScalarSizeInBits())
21446 return false;
21447 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21448 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21449 if (Opcodes1.size() < Opcodes2.size())
21450 return true;
21451 if (Opcodes1.size() > Opcodes2.size())
21452 return false;
21453 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21454 {
21455 // Instructions come first.
21456 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21457 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21458 if (I1 && I2) {
21459 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21460 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21461 if (!NodeI1)
21462 return NodeI2 != nullptr;
21463 if (!NodeI2)
21464 return false;
21465 assert((NodeI1 == NodeI2) ==
21466 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21467 "Different nodes should have different DFS numbers");
21468 if (NodeI1 != NodeI2)
21469 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21470 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21471 if (S.getOpcode() && !S.isAltShuffle())
21472 continue;
21473 return I1->getOpcode() < I2->getOpcode();
21474 }
21475 if (I1)
21476 return true;
21477 if (I2)
21478 return false;
21479 }
21480 {
21481 // Non-undef constants come next.
21482 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21483 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21484 if (C1 && C2)
21485 continue;
21486 if (C1)
21487 return true;
21488 if (C2)
21489 return false;
21490 }
21491 bool U1 = isa<UndefValue>(Opcodes1[I]);
21492 bool U2 = isa<UndefValue>(Opcodes2[I]);
21493 {
21494 // Non-constant non-instructions come next.
21495 if (!U1 && !U2) {
21496 auto ValID1 = Opcodes1[I]->getValueID();
21497 auto ValID2 = Opcodes2[I]->getValueID();
21498 if (ValID1 == ValID2)
21499 continue;
21500 if (ValID1 < ValID2)
21501 return true;
21502 if (ValID1 > ValID2)
21503 return false;
21504 }
21505 if (!U1)
21506 return true;
21507 if (!U2)
21508 return false;
21509 }
21510 // Undefs come last.
21511 assert(U1 && U2 && "The only thing left should be undef & undef.");
21512 }
21513 return false;
21514 };
21515 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21516 if (V1 == V2)
21517 return true;
21518 if (V1->getType() != V2->getType())
21519 return false;
21520 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21521 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21522 if (Opcodes1.size() != Opcodes2.size())
21523 return false;
21524 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21525 // Undefs are compatible with any other value.
21526 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21527 continue;
21528 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21529 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21530 if (R.isDeleted(I1) || R.isDeleted(I2))
21531 return false;
21532 if (I1->getParent() != I2->getParent())
21533 return false;
21534 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21535 if (S.getOpcode())
21536 continue;
21537 return false;
21538 }
21539 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21540 continue;
21541 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21542 return false;
21543 }
21544 return true;
21545 };
21546
21547 bool HaveVectorizedPhiNodes = false;
21548 do {
21549 // Collect the incoming values from the PHIs.
21550 Incoming.clear();
21551 for (Instruction &I : *BB) {
21552 auto *P = dyn_cast<PHINode>(&I);
21553 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21554 break;
21555
21556 // No need to analyze deleted, vectorized and non-vectorizable
21557 // instructions.
21558 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21559 isValidElementType(P->getType()))
21560 Incoming.push_back(P);
21561 }
21562
21563 if (Incoming.size() <= 1)
21564 break;
21565
21566 // Find the corresponding non-phi nodes for better matching when trying to
21567 // build the tree.
21568 for (Value *V : Incoming) {
21569 SmallVectorImpl<Value *> &Opcodes =
21570 PHIToOpcodes.try_emplace(V).first->getSecond();
21571 if (!Opcodes.empty())
21572 continue;
21573 SmallVector<Value *, 4> Nodes(1, V);
21574 SmallPtrSet<PHINode *, 4> Visited;
21575 while (!Nodes.empty()) {
21576 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21577 if (!Visited.insert(PHI).second)
21578 continue;
21579 for (Value *V : PHI->incoming_values()) {
21580 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21581 Nodes.push_back(PHI1);
21582 continue;
21583 }
21584 Opcodes.emplace_back(V);
21585 }
21586 }
21587 }
21588
21589 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21590 Incoming, PHICompare, AreCompatiblePHIs,
21591 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21592 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21593 },
21594 /*MaxVFOnly=*/true, R);
21595 Changed |= HaveVectorizedPhiNodes;
21596 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21597 auto *PHI = dyn_cast<PHINode>(P.first);
21598 return !PHI || R.isDeleted(PHI);
21599 }))
21600 PHIToOpcodes.clear();
21601 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21602 } while (HaveVectorizedPhiNodes);
21603
21604 VisitedInstrs.clear();
21605
21606 InstSetVector PostProcessInserts;
21607 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21608 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
21609 // also vectorizes `PostProcessCmps`.
21610 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21611 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21612 if (VectorizeCmps) {
21613 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21614 PostProcessCmps.clear();
21615 }
21616 PostProcessInserts.clear();
21617 return Changed;
21618 };
21619 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21620 auto IsInPostProcessInstrs = [&](Instruction *I) {
21621 if (auto *Cmp = dyn_cast<CmpInst>(I))
21622 return PostProcessCmps.contains(Cmp);
21623 return isa<InsertElementInst, InsertValueInst>(I) &&
21624 PostProcessInserts.contains(I);
21625 };
21626 // Returns true if `I` is an instruction without users, like a terminator, a
21627 // store, or a function call with an ignored return value. Unused results are
21628 // detected based on the instruction type, except for CallInst and InvokeInst.
21629 auto HasNoUsers = [](Instruction *I) {
21630 return I->use_empty() &&
21631 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21632 };
21633 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21634 // Skip instructions with scalable types. The number of elements is unknown
21635 // at compile time for scalable types.
21636 if (isa<ScalableVectorType>(It->getType()))
21637 continue;
21638
21639 // Skip instructions marked for deletion.
21640 if (R.isDeleted(&*It))
21641 continue;
21642 // We may go through BB multiple times, so skip instructions we have already checked.
21643 if (!VisitedInstrs.insert(&*It).second) {
21644 if (HasNoUsers(&*It) &&
21645 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21646 // We would like to start over since some instructions are deleted
21647 // and the iterator may have become invalid.
21648 Changed = true;
21649 It = BB->begin();
21650 E = BB->end();
21651 }
21652 continue;
21653 }
21654
21655 if (isa<DbgInfoIntrinsic>(It))
21656 continue;
21657
21658 // Try to vectorize reductions that use PHINodes.
21659 if (PHINode *P = dyn_cast<PHINode>(It)) {
21660 // Check that the PHI is a reduction PHI.
21661 if (P->getNumIncomingValues() == 2) {
21662 // Try to match and vectorize a horizontal reduction.
21663 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21664 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21665 Changed = true;
21666 It = BB->begin();
21667 E = BB->end();
21668 continue;
21669 }
21670 }
21671 // Try to vectorize the incoming values of the PHI, to catch reductions
21672 // that feed into PHIs.
21673 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21674 // Skip if the incoming block is the current BB for now. Also, bypass
21675 // unreachable IR for efficiency and to avoid crashing.
21676 // TODO: Collect the skipped incoming values and try to vectorize them
21677 // after processing BB.
21678 if (BB == P->getIncomingBlock(I) ||
21679 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21680 continue;
21681
21682 // Postponed instructions should not be vectorized here, delay their
21683 // vectorization.
21684 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21685 PI && !IsInPostProcessInstrs(PI)) {
21686 bool Res =
21687 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21688 Changed |= Res;
21689 if (Res && R.isDeleted(P)) {
21690 It = BB->begin();
21691 E = BB->end();
21692 break;
21693 }
21694 }
21695 }
21696 continue;
21697 }
21698
21699 if (HasNoUsers(&*It)) {
21700 bool OpsChanged = false;
21701 auto *SI = dyn_cast<StoreInst>(It);
21702 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21703 if (SI) {
21704 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21705 // Try to vectorize the chain in the store, if this is the only store to the
21706 // address in the block.
21707 // TODO: This is just a temporary solution to save compile time. Need
21708 // to investigate if we can safely turn on slp-vectorize-hor-store
21709 // instead to allow lookup for reduction chains in all non-vectorized
21710 // stores (need to check side effects and compile time).
21711 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21712 SI->getValueOperand()->hasOneUse();
21713 }
21714 if (TryToVectorizeRoot) {
21715 for (auto *V : It->operand_values()) {
21716 // Postponed instructions should not be vectorized here, delay their
21717 // vectorization.
21718 if (auto *VI = dyn_cast<Instruction>(V);
21719 VI && !IsInPostProcessInstrs(VI))
21720 // Try to match and vectorize a horizontal reduction.
21721 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21722 }
21723 }
21724 // Start vectorization of post-process list of instructions from the
21725 // top-tree instructions to try to vectorize as many instructions as
21726 // possible.
21727 OpsChanged |=
21728 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21729 if (OpsChanged) {
21730 // We would like to start over since some instructions are deleted
21731 // and the iterator may have become invalid.
21732 Changed = true;
21733 It = BB->begin();
21734 E = BB->end();
21735 continue;
21736 }
21737 }
21738
21739 if (isa<InsertElementInst, InsertValueInst>(It))
21740 PostProcessInserts.insert(&*It);
21741 else if (isa<CmpInst>(It))
21742 PostProcessCmps.insert(cast<CmpInst>(&*It));
21743 }
21744
21745 return Changed;
21746}
21747
21748bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21749 auto Changed = false;
21750 for (auto &Entry : GEPs) {
21751 // If the getelementptr list has fewer than two elements, there's nothing
21752 // to do.
21753 if (Entry.second.size() < 2)
21754 continue;
21755
21756 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21757 << Entry.second.size() << ".\n");
21758
21759 // Process the GEP list in chunks suitable for the target's supported
21760 // vector size. If a vector register can't hold 1 element, we are done. We
21761 // are trying to vectorize the index computations, so the maximum number of
21762 // elements is based on the size of the index expression, rather than the
21763 // size of the GEP itself (the target's pointer size).
21764 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21765 return !R.isDeleted(GEP);
21766 });
21767 if (It == Entry.second.end())
21768 continue;
21769 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21770 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21771 if (MaxVecRegSize < EltSize)
21772 continue;
21773
21774 unsigned MaxElts = MaxVecRegSize / EltSize;
21775 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21776 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21777 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21778
21779 // Initialize a set of candidate getelementptrs. Note that we use a
21780 // SetVector here to preserve program order. If the index computations
21781 // are vectorizable and begin with loads, we want to minimize the chance
21782 // of having to reorder them later.
21783 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21784
21785 // Some of the candidates may have already been vectorized after we
21786 // initially collected them, or their index was optimized to a constant value.
21787 // If so, they are marked as deleted, so remove them from the set of
21788 // candidates.
21789 Candidates.remove_if([&R](Value *I) {
21790 return R.isDeleted(cast<Instruction>(I)) ||
21791 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21792 });
21793
21794 // Remove from the set of candidates all pairs of getelementptrs with
21795 // constant differences. Such getelementptrs are likely not good
21796 // candidates for vectorization in a bottom-up phase since one can be
21797 // computed from the other. We also ensure all candidate getelementptr
21798 // indices are unique.
21799 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21800 auto *GEPI = GEPList[I];
21801 if (!Candidates.count(GEPI))
21802 continue;
21803 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21804 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21805 auto *GEPJ = GEPList[J];
21806 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21807 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21808 Candidates.remove(GEPI);
21809 Candidates.remove(GEPJ);
21810 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21811 Candidates.remove(GEPJ);
21812 }
21813 }
21814 }
21815
21816 // We break out of the above computation as soon as we know there are
21817 // fewer than two candidates remaining.
21818 if (Candidates.size() < 2)
21819 continue;
21820
21821 // Add the single, non-constant index of each candidate to the bundle. We
21822 // ensured the indices met these constraints when we originally collected
21823 // the getelementptrs.
21824 SmallVector<Value *, 16> Bundle(Candidates.size());
21825 auto BundleIndex = 0u;
21826 for (auto *V : Candidates) {
21827 auto *GEP = cast<GetElementPtrInst>(V);
21828 auto *GEPIdx = GEP->idx_begin()->get();
21829 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21830 Bundle[BundleIndex++] = GEPIdx;
21831 }
21832
21833 // Try and vectorize the indices. We are currently only interested in
21834 // gather-like cases of the form:
21835 //
21836 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21837 //
21838 // where the loads of "a", the loads of "b", and the subtractions can be
21839 // performed in parallel. It's likely that detecting this pattern in a
21840 // bottom-up phase will be simpler and less costly than building a
21841 // full-blown top-down phase beginning at the consecutive loads.
21842 Changed |= tryToVectorizeList(Bundle, R);
21843 }
21844 }
21845 return Changed;
21846}
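// Editorial example (illustrative C-level source, hedged): the gather-like
// shape targeted above typically comes from manually unrolled code such as
//   s += g[a[0] - b[0]];
//   s += g[a[1] - b[1]];
//   s += g[a[2] - b[2]];
//   s += g[a[3] - b[3]];
// where only the index computations a[i] - b[i] form the bundle handed to
// tryToVectorizeList; the gather loads of g themselves are not vectorized here.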
21847
21848bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21849 bool Changed = false;
21850 // Sort by type, base pointers and value operands. Value operands must be
21851 // compatible (have the same opcode, same parent), otherwise it is
21852 // definitely not profitable to try to vectorize them.
21853 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21854 if (V->getValueOperand()->getType()->getTypeID() <
21855 V2->getValueOperand()->getType()->getTypeID())
21856 return true;
21857 if (V->getValueOperand()->getType()->getTypeID() >
21858 V2->getValueOperand()->getType()->getTypeID())
21859 return false;
21860 if (V->getPointerOperandType()->getTypeID() <
21861 V2->getPointerOperandType()->getTypeID())
21862 return true;
21863 if (V->getPointerOperandType()->getTypeID() >
21864 V2->getPointerOperandType()->getTypeID())
21865 return false;
21866 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21867 V2->getValueOperand()->getType()->getScalarSizeInBits())
21868 return true;
21869 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21870 V2->getValueOperand()->getType()->getScalarSizeInBits())
21871 return false;
21872 // UndefValues are compatible with all other values.
21873 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21874 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21875 DomTreeNodeBase<BasicBlock> *NodeI1 =
21876 DT->getNode(I1->getParent());
21877 DomTreeNodeBase<BasicBlock> *NodeI2 =
21878 DT->getNode(I2->getParent());
21879 assert(NodeI1 && "Should only process reachable instructions");
21880 assert(NodeI2 && "Should only process reachable instructions");
21881 assert((NodeI1 == NodeI2) ==
21882 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21883 "Different nodes should have different DFS numbers");
21884 if (NodeI1 != NodeI2)
21885 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21886 return I1->getOpcode() < I2->getOpcode();
21887 }
21888 return V->getValueOperand()->getValueID() <
21889 V2->getValueOperand()->getValueID();
21890 };
21891
21892 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21893 if (V1 == V2)
21894 return true;
21895 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21896 return false;
21897 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21898 return false;
21899 // Undefs are compatible with any other value.
21900 if (isa<UndefValue>(V1->getValueOperand()) ||
21901 isa<UndefValue>(V2->getValueOperand()))
21902 return true;
21903 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21904 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21905 if (I1->getParent() != I2->getParent())
21906 return false;
21907 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21908 return S.getOpcode() > 0;
21909 }
21910 if (isa<Constant>(V1->getValueOperand()) &&
21911 isa<Constant>(V2->getValueOperand()))
21912 return true;
21913 return V1->getValueOperand()->getValueID() ==
21914 V2->getValueOperand()->getValueID();
21915 };
21916
21917 // Attempt to sort and vectorize each of the store-groups.
21918 DenseSet<std::pair<Value *, Value *>> Attempted;
21919 for (auto &Pair : Stores) {
21920 if (Pair.second.size() < 2)
21921 continue;
21922
21923 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21924 << Pair.second.size() << ".\n");
21925
21926 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21927 continue;
21928
21929 // Reverse stores to do bottom-to-top analysis. This is important if the
21930 // same addresses are stored to several times; in this case we need to
21931 // follow the store order (reversed to meet the memory dependencies).
21932 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21933 Pair.second.rend());
21934 Changed |= tryToVectorizeSequence<StoreInst>(
21935 ReversedStores, StoreSorter, AreCompatibleStores,
21936 [&](ArrayRef<StoreInst *> Candidates, bool) {
21937 return vectorizeStores(Candidates, R, Attempted);
21938 },
21939 /*MaxVFOnly=*/false, R);
21940 }
21941 return Changed;
21942}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits the final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for the shuffling.
Merges shuffle masks and emits the final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
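The APInt entries above are the bit-twiddling workhorse behind demanded-lane style masks. A minimal, self-contained sketch of that usage (not code from this file; the 8-lane width and the helper name are illustrative assumptions):
#include "llvm/ADT/APInt.h"
using namespace llvm;
// Hypothetical helper: build a "demanded lanes" mask for 8 lanes.
static APInt demandedLanesSketch() {
  APInt Demanded = APInt::getAllOnes(8); // start with every lane demanded
  Demanded.clearBit(3);                  // lane 3 turned out to be dead
  Demanded.setBits(4, 6);                // (re)demand lanes 4 and 5
  if (Demanded.isZero())                 // nothing left to vectorize
    return APInt::getZero(8);
  return Demanded;
}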
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
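ArrayRef slicing (take_front/drop_front/slice) is how an operand list is carved into sub-lists without copying. A hedged sketch; splitHalves is a hypothetical helper, not an API of this file:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"
#include <utility>
using namespace llvm;
// Split a value list into two non-overlapping views of the same storage.
static std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
splitHalves(ArrayRef<Value *> VL) {
  if (VL.empty())
    return {VL, VL};
  size_t Half = VL.size() / 2;
  return {VL.take_front(Half), VL.drop_front(Half)};
}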
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
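The BasicBlock iterator entries above support bottom-up walks over a block. A small sketch of that pattern (the helper name and the memory-access filter are illustrative, not code from this pass):
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Walk the block in reverse and return the last instruction that may touch memory.
static Instruction *findLastMemoryAccess(BasicBlock *BB) {
  for (Instruction &I : llvm::reverse(*BB))
    if (I.mayReadOrWriteMemory())
      return &I;
  return nullptr;
}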
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1978
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1873
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2115
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1972
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1207
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1285
unsigned arg_size() const
Definition: InstrTypes.h:1292
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1502
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1969
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:988
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
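getSwappedPredicate and getInversePredicate let a transform commute or negate compares without changing semantics. A minimal sketch, assuming only the static CmpInst helpers listed above:
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
// If the operands of a compare are commuted, the predicate must be swapped:
// (a SGT b) becomes (b SLT a); (a ULE b) becomes (b UGE a).
static CmpInst::Predicate predicateAfterSwap(CmpInst::Predicate P) {
  return CmpInst::getSwappedPredicate(P);
}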
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
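A DenseMap keyed by Value* is the usual way to remember per-scalar facts such as which lane a scalar occupies. Illustrative sketch only; laneOf is a hypothetical helper:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// Lazily populate a scalar -> lane map and query it.
static unsigned laneOf(DenseMap<Value *, unsigned> &Lanes, Value *V,
                       ArrayRef<Value *> VL) {
  if (auto It = Lanes.find(V); It != Lanes.end())
    return It->second;
  for (unsigned I = 0, E = VL.size(); I != E; ++I)
    Lanes.try_emplace(VL[I], I); // keeps the first lane if a value repeats
  return Lanes.lookup(V);        // 0 if V is not in VL at all
}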
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:871
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2289
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1052
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2503
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:508
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1060
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2491
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2297
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1830
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1048
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2566
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1889
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:844
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1776
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2398
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2281
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2525
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2189
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2444
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1689
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2305
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2227
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1849
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1610
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1384
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
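The IRBuilder Create* calls above are what materializes the vector code. A hedged sketch of a broadcast/splat emission (buildSplat is a hypothetical name; the real pass routes its shuffles through ShuffleInstructionBuilder):
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Insert a scalar into lane 0 and broadcast it to VF lanes.
static Value *buildSplat(IRBuilderBase &Builder, Value *Scalar, unsigned VF) {
  Type *VecTy = FixedVectorType::get(Scalar->getType(), VF);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(0));
  SmallVector<int> Mask(VF, 0); // every result lane reads source lane 0
  return Builder.CreateShuffleVector(Vec, Mask, "splat");
}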
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:283
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:763
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
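MemoryLocation::get combined with BatchAAResults::getModRefInfo is the standard question a scheduler asks when ordering memory operations. A minimal sketch under that assumption (mayClobberLoad is a hypothetical helper):
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Conservatively ask whether I may write the memory that LI reads.
static bool mayClobberLoad(BatchAAResults &BatchAA, Instruction *I,
                           LoadInst *LI) {
  MemoryLocation Loc = MemoryLocation::get(LI);
  return isModSet(BatchAA.getModRefInfo(I, Loc));
}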
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
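SetVector gives deterministic, deduplicated iteration, which matters for worklists that feed code generation. Illustrative sketch (collectOperandsOnce is a hypothetical helper):
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"
using namespace llvm;
// Push Root and its instruction operands, each at most once, in a stable order.
static void collectOperandsOnce(Instruction *Root,
                                SetVector<Instruction *> &Worklist) {
  Worklist.insert(Root); // insert() returns false if Root was already present
  for (Value *Op : Root->operands())
    if (auto *OpI = dyn_cast<Instruction>(Op))
      Worklist.insert(OpI);
}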
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
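The static mask predicates above classify shuffle masks so that trivially cheap shuffles can be recognized before any cost query. A small sketch, assuming the two-argument forms listed here (isIdentityOrReverse is a hypothetical helper):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Identity and reverse masks usually map to very cheap (often free) shuffles.
static bool isIdentityOrReverse(ArrayRef<int> Mask, int NumSrcElts) {
  return ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts) ||
         ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
}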
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
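find_first/find_next is the canonical way to walk the set bits of a SmallBitVector. A minimal sketch, equivalent to calling count() and shown only to illustrate the iteration pattern:
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;
// Count the set bits by walking them explicitly.
static unsigned countSetLanes(const SmallBitVector &UsedLanes) {
  unsigned N = 0;
  for (int I = UsedLanes.find_first(); I != -1; I = UsedLanes.find_next(I))
    ++N;
  return N;
}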
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
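The TargetTransformInfo cost queries above are the currency of every vectorization decision in this pass. A deliberately simplified sketch of a scalar-versus-vector comparison (addCheaperAsVector is hypothetical; the real cost model also accounts for gathers, extracts, and much more):
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Compare VF scalar adds against one <VF x ScalarTy> vector add.
static bool addCheaperAsVector(const TargetTransformInfo &TTI, Type *ScalarTy,
                               unsigned VF) {
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind);
  ScalarCost *= VF; // one scalar add per lane
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  return VecCost < ScalarCost;
}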
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
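The Type predicates above are used for quick legality filtering of element types. A hedged sketch; the particular bail-out set is an illustrative assumption, not this pass's exact policy:
#include "llvm/IR/Type.h"
using namespace llvm;
// Reject element types that rarely vectorize well.
static bool isPlausibleElementType(Type *Ty) {
  Type *ScalarTy = Ty->getScalarType();
  if (ScalarTy->isX86_FP80Ty() || ScalarTy->isPPC_FP128Ty())
    return false; // oddly sized FP types
  return ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
         ScalarTy->isPointerTy();
}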
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
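replaceAllUsesWith plus takeName is the standard clean-up once a scalar's value has been re-created elsewhere (for example, extracted from a vector). Minimal sketch; replaceScalar is a hypothetical helper:
#include "llvm/IR/Value.h"
using namespace llvm;
// Redirect every user of Scalar to Replacement and keep the readable name.
static void replaceScalar(Value *Scalar, Value *Replacement) {
  if (Scalar->use_empty())
    return;
  Replacement->takeName(Scalar);
  Scalar->replaceAllUsesWith(Replacement);
}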
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target-specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
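The members listed above (deleteTree, buildTree, buildExternalUses, computeMinimumValueSizes, getTreeCost, vectorizeTree) are typically driven in sequence. A simplified, hedged sketch of such a driver follows; it is not the pass's actual code, it assumes it sits inside SLPVectorizer.cpp where BoUpSLP is visible, and the name tryVectorizeSketch and the CostThreshold parameter are illustrative.
// Simplified sketch only; the real pass also performs node reordering,
// scheduling, gather optimization and emits optimization remarks.
static bool tryVectorizeSketch(slpvectorizer::BoUpSLP &R,
                               ArrayRef<Value *> Roots, int CostThreshold) {
  R.deleteTree();                               // reset any previous state
  SmallDenseSet<Value *, 4> UserIgnoreLst;      // empty ignore list here
  R.buildTree(Roots, UserIgnoreLst);            // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.buildExternalUses();                        // record scalars to extract
  R.computeMinimumValueSizes();                 // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= -CostThreshold)
    return false;                               // not deemed profitable
  R.vectorizeTree();                            // emit the vector code
  return true;
}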
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
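A hedged sketch of the consecutive-access check this helper enables; areAdjacentElements is an illustrative name. Two pointers to ElemTy are treated as adjacent when their element distance is exactly one.
#include "llvm/Analysis/LoopAccessAnalysis.h" // getPointersDiff
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include <optional>
using namespace llvm;

static bool areAdjacentElements(Type *ElemTy, Value *PtrA, Value *PtrB,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}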
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
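A hedged usage sketch for createSimpleReduction as declared above; emitAddReduction is an illustrative name. It emits the reduction of a vector value for the given recurrence kind through the IRBuilder.
#include "llvm/Analysis/IVDescriptors.h"     // RecurKind
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h" // createSimpleReduction
using namespace llvm;

// For RecurKind::Add this lowers to an llvm.vector.reduce.add call.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *Vec) {
  return createSimpleReduction(Builder, Vec, RecurKind::Add);
}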
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
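The STLExtras range helpers listed here (enumerate, zip, drop_begin, all_of, ...) replace manual index loops. A small sketch with illustrative names:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static void walkBundle(ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
  for (auto [Idx, V] : enumerate(Scalars)) {   // (index, element) pairs
    (void)Idx;
    (void)V;
  }
  for (auto [V, M] : zip(Scalars, Mask)) {     // stops at the shorter range
    (void)V;
    (void)M;
  }
}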
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
Definition: Utils.cpp:1668
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
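A hedged sketch of how the ValueTracking query above can support narrowing decisions; fitsInLowBits and the Bits parameter are illustrative. A value can be truncated to Bits bits losslessly for unsigned purposes if all higher bits are known zero.
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static bool fitsInLowBits(const Value *V, unsigned Bits,
                          const SimplifyQuery &SQ) {
  unsigned FullBW = V->getType()->getScalarSizeInBits();
  if (Bits >= FullBW)
    return true;
  // Bits [Bits, FullBW) must be known zero.
  return MaskedValueIsZero(V, APInt::getBitsSetFrom(FullBW, Bits), SQ);
}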
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
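The arithmetic helpers above (bit_ceil, PowerOf2Ceil, Log2_32, has_single_bit) underpin the vectorization-factor arithmetic in this file; a tiny sketch of what they compute, with an illustrative function name:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void vfArithmeticExamples() {
  assert(llvm::bit_ceil(6u) == 8u);          // next power of two (>= input)
  assert(llvm::PowerOf2Ceil(6) == 8);        // same idea, uint64_t flavour
  assert(llvm::Log2_32(8) == 3);             // floor log base 2
  assert(llvm::has_single_bit(8u));          // exact power-of-two check
}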
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
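A small sketch of the shuffle-mask helpers above; the values in the comments follow from the documented semantics, and maskExamples is an illustrative name.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static void maskExamples() {
  // Elements Start, Start+Stride, ...: {1, 3, 5, 7}
  SmallVector<int, 16> Stride = createStrideMask(/*Start=*/1, /*Stride=*/2,
                                                 /*VF=*/4);
  // Each of VF elements repeated ReplicationFactor times: {0, 0, 1, 1, 2, 2}
  SmallVector<int, 16> Rep = createReplicatedMask(/*ReplicationFactor=*/2,
                                                  /*VF=*/3);
  (void)Stride;
  (void)Rep;
}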
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:250
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
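A hedged sketch of the usual pairing of propagateIRFlags with propagateMetadata (listed earlier) once a new vector instruction replaces the scalars in VL; the function name finalizeVectorInst is illustrative.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/VectorUtils.h"       // propagateMetadata
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/LoopUtils.h" // propagateIRFlags
using namespace llvm;

static void finalizeVectorInst(Instruction *VecInst, ArrayRef<Value *> VL) {
  propagateIRFlags(VecInst, VL);  // intersect nuw/nsw/fast-math etc. flags
  propagateMetadata(VecInst, VL); // intersect tbaa/alias.scope/... metadata
}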
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
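A hedged sketch combining getVectorIntrinsicIDForCall (listed earlier) with isTriviallyVectorizable above, the usual gate before widening a call; callIsTriviallyWidenable is an illustrative name.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool callIsTriviallyWidenable(const CallInst *CI,
                                     const TargetLibraryInfo *TLI) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  return ID != Intrinsic::not_intrinsic && isTriviallyVectorizable(ID);
}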
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2136
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector.
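A hedged sketch of querying a vector variant of a call via VFShape::get (above) together with the VFDatabase interface whose getVectorizedFunction member is listed earlier; findVectorVariant is an illustrative name.
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Function *findVectorVariant(CallInst &CI, ElementCount VF) {
  VFShape Shape = VFShape::get(CI.getFunctionType(), VF,
                               /*HasGlobalPred=*/false);
  return VFDatabase(CI).getVectorizedFunction(Shape);
}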
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.