SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
142 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
147MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
150static cl::opt<unsigned>
151MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
154/// Limits the size of scheduling regions in a block.
155/// It avoids long compile times for _very_ large blocks where vector
156/// instructions are spread over a wide range.
157/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
163 "slp-min-reg-size", cl::init(128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
167 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
171 "slp-min-tree-size", cl::init(3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
177 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
180// The maximum depth that the look-ahead score heuristic will explore
181// when it is probing among candidates for vectorization tree roots.
182// The higher this value, the higher the compilation time overhead; but unlike
183// the similar limit for operand ordering, this one is used less frequently, so
184// the impact of a higher value is less noticeable.
186 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
190 "slp-min-strided-loads", cl::init(2), cl::Hidden,
191 cl::desc("The minimum number of loads, which should be considered strided, "
192 "if the stride is > 1 or is runtime value"));
193
195 "slp-max-stride", cl::init(8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
203 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
210// Limit on the number of uses for potentially transformed instructions/values,
211// used in checks to avoid compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
228/// The most important things to filter here are types which are invalid in LLVM
229/// vectors. We also filter target-specific types which have absolutely no
230/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// inevitably be scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
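// A minimal sketch of which element types pass the filter above (`Ctx` is an
// assumed, in-scope LLVMContext):
//   isValidElementType(Type::getFloatTy(Ctx));     // true
//   isValidElementType(Type::getX86_FP80Ty(Ctx));  // false - no vector lowering
//   isValidElementType(Type::getPPC_FP128Ty(Ctx)); // false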
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ScalarTy->getScalarType(),
253 VF * getNumElements(ScalarTy));
254}
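// A minimal sketch of the widening rule (`Int32Ty` and `V2Int32Ty` are assumed,
// illustrative type names):
//   getWidenedType(Int32Ty, 4)   -> <4 x i32>
//   getWidenedType(V2Int32Ty, 4) -> <8 x i32>  (REVEC: 4 * 2 scalar elements)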
255
256static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
257 SmallVectorImpl<int> &Mask) {
258 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
259 // But the element has a different meaning for SLP (scalar) and REVEC
260 // (vector). We need to expand Mask into masks which shufflevector can use
261 // directly.
262 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
263 for (unsigned I : seq<unsigned>(Mask.size()))
264 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
265 I * VecTyNumElements, VecTyNumElements)))
266 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
267 : Mask[I] * VecTyNumElements + J;
268 Mask.swap(NewMask);
269}
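// A worked example of the expansion above: with VecTyNumElements == 2, the
// scalar-level mask {1, 0} becomes the element-level mask {2, 3, 0, 1}, and a
// PoisonMaskElem entry expands to two PoisonMaskElem entries:
//   SmallVector<int> Mask = {1, 0};
//   transformScalarShuffleIndiciesToVector(2, Mask); // Mask == {2, 3, 0, 1}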
270
271/// \returns the number of groups of shufflevectors.
272/// A group has the following features:
273/// 1. All of the values in a group are shufflevectors.
274/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
275/// 3. Together, the masks of the shufflevectors in a group use all of the
276/// elements of the source (and the elements are used in order).
277/// e.g., it is 1 group (%0)
278/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
279/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
280/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
281/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
282/// it is 2 groups (%3 and %4)
283/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
284/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
285/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
286/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
287/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
288/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
289/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
290/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
291/// it is 0 groups
292/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
293/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
294/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
295/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
296static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
297 if (VL.empty())
298 return 0;
299 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
300 return 0;
301 auto *SV = cast<ShuffleVectorInst>(VL.front());
302 unsigned SVNumElements =
303 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
304 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
305 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
306 return 0;
307 unsigned NumGroup = 0;
308 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
309 auto *SV = cast<ShuffleVectorInst>(VL[I]);
310 Value *Src = SV->getOperand(0);
311 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
312 SmallVector<int> ExtractionIndex(SVNumElements);
313 if (!all_of(Group, [&](Value *V) {
314 auto *SV = cast<ShuffleVectorInst>(V);
315 // From the same source.
316 if (SV->getOperand(0) != Src)
317 return false;
318 int Index;
319 if (!SV->isExtractSubvectorMask(Index))
320 return false;
321 for (int I : seq<int>(Index, Index + SV->getShuffleMask().size()))
322 ExtractionIndex.push_back(I);
323 return true;
324 }))
325 return 0;
326 if (!is_sorted(ExtractionIndex))
327 return 0;
328 ++NumGroup;
329 }
330 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
331 return NumGroup;
332}
333
334/// \returns a shufflevector mask which is used to vectorize shufflevectors
335/// e.g.,
336/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
337/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
338/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
339/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
340/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
341/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
342/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
343/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
344/// the result is
345/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
346static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
347 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
348 auto *SV = cast<ShuffleVectorInst>(VL.front());
349 unsigned SVNumElements =
350 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
351 SmallVector<int> Mask;
352 unsigned AccumulateLength = 0;
353 for (Value *V : VL) {
354 auto *SV = cast<ShuffleVectorInst>(V);
355 for (int M : SV->getShuffleMask())
356 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
357 : AccumulateLength + M);
358 AccumulateLength += SVNumElements;
359 }
360 return Mask;
361}
362
363/// \returns True if the value is a constant (but not globals/constant
364/// expressions).
365static bool isConstant(Value *V) {
366 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
367}
368
369/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
370/// insertelement/extractelement with constant indices for a fixed vector type,
371/// or an extractvalue instruction.
372static bool isVectorLikeInstWithConstOps(Value *V) {
373 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
374 !isa<ExtractValueInst, UndefValue>(V))
375 return false;
376 auto *I = dyn_cast<Instruction>(V);
377 if (!I || isa<ExtractValueInst>(I))
378 return true;
379 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
380 return false;
381 if (isa<ExtractElementInst>(I))
382 return isConstant(I->getOperand(1));
383 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
384 return isConstant(I->getOperand(2));
385}
386
387/// Returns power-of-2 number of elements in a single register (part), given the
388/// total number of elements \p Size and number of registers (parts) \p
389/// NumParts.
390static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
391 return PowerOf2Ceil(divideCeil(Size, NumParts));
392}
393
394/// Returns correct remaining number of elements, considering total amount \p
395/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
396/// and current register (part) \p Part.
397static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
398 unsigned Part) {
399 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
400}
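// A worked example of the two helpers above: splitting Size == 6 elements into
// NumParts == 2 registers gives getPartNumElems(6, 2) == PowerOf2Ceil(3) == 4,
// and then getNumElems(6, 4, 0) == 4 for the first part and
// getNumElems(6, 4, 1) == 2 for the remaining tail.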
401
402#if !defined(NDEBUG)
403/// Print a short descriptor of the instruction bundle suitable for debug output.
404static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
405 std::string Result;
406 raw_string_ostream OS(Result);
407 if (Idx >= 0)
408 OS << "Idx: " << Idx << ", ";
409 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
410 OS.flush();
411 return Result;
412}
413#endif
414
415/// \returns true if all of the instructions in \p VL are in the same block or
416/// false otherwise.
417static bool allSameBlock(ArrayRef<Value *> VL) {
418 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
419 if (!I0)
420 return false;
421 if (all_of(VL, isVectorLikeInstWithConstOps))
422 return true;
423
424 BasicBlock *BB = I0->getParent();
425 for (int I = 1, E = VL.size(); I < E; I++) {
426 auto *II = dyn_cast<Instruction>(VL[I]);
427 if (!II)
428 return false;
429
430 if (BB != II->getParent())
431 return false;
432 }
433 return true;
434}
435
436/// \returns True if all of the values in \p VL are constants (but not
437/// globals/constant expressions).
438static bool allConstant(ArrayRef<Value *> VL) {
439 // Constant expressions and globals can't be vectorized like normal integer/FP
440 // constants.
441 return all_of(VL, isConstant);
442}
443
444/// \returns True if all of the values in \p VL are identical or some of them
445/// are UndefValue.
446static bool isSplat(ArrayRef<Value *> VL) {
447 Value *FirstNonUndef = nullptr;
448 for (Value *V : VL) {
449 if (isa<UndefValue>(V))
450 continue;
451 if (!FirstNonUndef) {
452 FirstNonUndef = V;
453 continue;
454 }
455 if (V != FirstNonUndef)
456 return false;
457 }
458 return FirstNonUndef != nullptr;
459}
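// For example, {%x, undef, %x} is treated as a splat of %x, while {%x, %y}
// and an all-undef list are not.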
460
461/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
462static bool isCommutative(Instruction *I) {
463 if (auto *Cmp = dyn_cast<CmpInst>(I))
464 return Cmp->isCommutative();
465 if (auto *BO = dyn_cast<BinaryOperator>(I))
466 return BO->isCommutative() ||
467 (BO->getOpcode() == Instruction::Sub &&
468 !BO->hasNUsesOrMore(UsesLimit) &&
469 all_of(
470 BO->uses(),
471 [](const Use &U) {
472 // Commutative, if icmp eq/ne sub, 0
473 ICmpInst::Predicate Pred;
474 if (match(U.getUser(),
475 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
476 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
477 return true;
478 // Commutative, if abs(sub nsw, true) or abs(sub, false).
479 ConstantInt *Flag;
480 return match(U.getUser(),
481 m_Intrinsic<Intrinsic::abs>(
482 m_Specific(U.get()), m_ConstantInt(Flag))) &&
483 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
484 Flag->isOne());
485 })) ||
486 (BO->getOpcode() == Instruction::FSub &&
487 !BO->hasNUsesOrMore(UsesLimit) &&
488 all_of(BO->uses(), [](const Use &U) {
489 return match(U.getUser(),
490 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
491 }));
492 return I->isCommutative();
493}
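// For example, a sub is treated as commutative here when every user is a
// zero comparison or a matching abs call, since swapping the operands only
// negates the result (a sketch in IR of the pattern matched above):
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0                        ; eq/ne against 0 tolerates %b - %a
//   %m = call i32 @llvm.abs.i32(i32 %d, i1 true)  ; abs also tolerates it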
494
495template <typename T>
496static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
497 unsigned Offset) {
498 static_assert(std::is_same_v<T, InsertElementInst> ||
499 std::is_same_v<T, ExtractElementInst>,
500 "unsupported T");
501 int Index = Offset;
502 if (const auto *IE = dyn_cast<T>(Inst)) {
503 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
504 if (!VT)
505 return std::nullopt;
506 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
507 if (!CI)
508 return std::nullopt;
509 if (CI->getValue().uge(VT->getNumElements()))
510 return std::nullopt;
511 Index *= VT->getNumElements();
512 Index += CI->getZExtValue();
513 return Index;
514 }
515 return std::nullopt;
516}
517
518/// \returns inserting or extracting index of InsertElement, ExtractElement or
519/// InsertValue instruction, using Offset as base offset for index.
520/// \returns std::nullopt if the index is not an immediate.
521static std::optional<unsigned> getElementIndex(const Value *Inst,
522 unsigned Offset = 0) {
523 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
524 return Index;
525 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
526 return Index;
527
528 int Index = Offset;
529
530 const auto *IV = dyn_cast<InsertValueInst>(Inst);
531 if (!IV)
532 return std::nullopt;
533
534 Type *CurrentType = IV->getType();
535 for (unsigned I : IV->indices()) {
536 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
537 Index *= ST->getNumElements();
538 CurrentType = ST->getElementType(I);
539 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
540 Index *= AT->getNumElements();
541 CurrentType = AT->getElementType();
542 } else {
543 return std::nullopt;
544 }
545 Index += I;
546 }
547 return Index;
548}
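// A worked example for the insertvalue path: for an aggregate of type
// {[2 x i32], [2 x i32]}, the instruction
//   %r = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// inserts at linearized index (1 * 2) + 0 == 2, so getElementIndex returns 2.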
549
550namespace {
551/// Specifies the way the mask should be analyzed for undefs/poisonous elements
552/// in the shuffle mask.
553enum class UseMask {
554 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
555 ///< check for the mask elements for the first argument (mask
556 ///< indices are in range [0:VF)).
557 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
558 ///< for the mask elements for the second argument (mask indices
559 ///< are in range [VF:2*VF))
560 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
561 ///< future shuffle elements and mark them as ones as being used
562 ///< in future. Non-undef elements are considered as unused since
563 ///< they're already marked as used in the mask.
564};
565} // namespace
566
567/// Prepares a use bitset for the given mask either for the first argument or
568/// for the second.
569static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
570 UseMask MaskArg) {
571 SmallBitVector UseMask(VF, true);
572 for (auto [Idx, Value] : enumerate(Mask)) {
573 if (Value == PoisonMaskElem) {
574 if (MaskArg == UseMask::UndefsAsMask)
575 UseMask.reset(Idx);
576 continue;
577 }
578 if (MaskArg == UseMask::FirstArg && Value < VF)
579 UseMask.reset(Value);
580 else if (MaskArg == UseMask::SecondArg && Value >= VF)
581 UseMask.reset(Value - VF);
582 }
583 return UseMask;
584}
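// A worked example: for VF == 4 and Mask == {0, 5, PoisonMaskElem, 2},
// UseMask::FirstArg clears bits 0 and 2 (lanes of the first operand consumed
// by the mask), UseMask::SecondArg clears bit 1 (element 5 maps to lane
// 5 - VF == 1), and UseMask::UndefsAsMask clears only bit 2, the position of
// the poison element.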
585
586/// Checks if the given value is actually an undefined constant vector.
587/// Also, if the \p UseMask is not empty, tries to check if the non-masked
588/// elements actually mask the insertelement buildvector, if any.
589template <bool IsPoisonOnly = false>
590static SmallBitVector isUndefVector(const Value *V,
591 const SmallBitVector &UseMask = {}) {
592 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
593 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
594 if (isa<T>(V))
595 return Res;
596 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
597 if (!VecTy)
598 return Res.reset();
599 auto *C = dyn_cast<Constant>(V);
600 if (!C) {
601 if (!UseMask.empty()) {
602 const Value *Base = V;
603 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
604 Base = II->getOperand(0);
605 if (isa<T>(II->getOperand(1)))
606 continue;
607 std::optional<unsigned> Idx = getElementIndex(II);
608 if (!Idx) {
609 Res.reset();
610 return Res;
611 }
612 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
613 Res.reset(*Idx);
614 }
615 // TODO: Add analysis for shuffles here too.
616 if (V == Base) {
617 Res.reset();
618 } else {
619 SmallBitVector SubMask(UseMask.size(), false);
620 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
621 }
622 } else {
623 Res.reset();
624 }
625 return Res;
626 }
627 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
628 if (Constant *Elem = C->getAggregateElement(I))
629 if (!isa<T>(Elem) &&
630 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
631 Res.reset(I);
632 }
633 return Res;
634}
635
636/// Checks if the vector of instructions can be represented as a shuffle, like:
637/// %x0 = extractelement <4 x i8> %x, i32 0
638/// %x3 = extractelement <4 x i8> %x, i32 3
639/// %y1 = extractelement <4 x i8> %y, i32 1
640/// %y2 = extractelement <4 x i8> %y, i32 2
641/// %x0x0 = mul i8 %x0, %x0
642/// %x3x3 = mul i8 %x3, %x3
643/// %y1y1 = mul i8 %y1, %y1
644/// %y2y2 = mul i8 %y2, %y2
645/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
646/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
647/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
648/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
649/// ret <4 x i8> %ins4
650/// can be transformed into:
651/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
652/// i32 6>
653/// %2 = mul <4 x i8> %1, %1
654/// ret <4 x i8> %2
655/// Mask will return the Shuffle Mask equivalent to the extracted elements.
656/// TODO: Can we split off and reuse the shuffle mask detection from
657/// ShuffleVectorInst/getShuffleCost?
658static std::optional<TargetTransformInfo::ShuffleKind>
659isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
660 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
661 if (It == VL.end())
662 return std::nullopt;
663 unsigned Size =
664 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
665 auto *EI = dyn_cast<ExtractElementInst>(V);
666 if (!EI)
667 return S;
668 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
669 if (!VTy)
670 return S;
671 return std::max(S, VTy->getNumElements());
672 });
673
674 Value *Vec1 = nullptr;
675 Value *Vec2 = nullptr;
676 bool HasNonUndefVec = any_of(VL, [](Value *V) {
677 auto *EE = dyn_cast<ExtractElementInst>(V);
678 if (!EE)
679 return false;
680 Value *Vec = EE->getVectorOperand();
681 if (isa<UndefValue>(Vec))
682 return false;
683 return isGuaranteedNotToBePoison(Vec);
684 });
685 enum ShuffleMode { Unknown, Select, Permute };
686 ShuffleMode CommonShuffleMode = Unknown;
687 Mask.assign(VL.size(), PoisonMaskElem);
688 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
689 // Undef can be represented as an undef element in a vector.
690 if (isa<UndefValue>(VL[I]))
691 continue;
692 auto *EI = cast<ExtractElementInst>(VL[I]);
693 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
694 return std::nullopt;
695 auto *Vec = EI->getVectorOperand();
696 // We can extractelement from undef or poison vector.
697 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
698 continue;
699 // All vector operands must have the same number of vector elements.
700 if (isa<UndefValue>(Vec)) {
701 Mask[I] = I;
702 } else {
703 if (isa<UndefValue>(EI->getIndexOperand()))
704 continue;
705 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
706 if (!Idx)
707 return std::nullopt;
708 // Undefined behavior if Idx is negative or >= Size.
709 if (Idx->getValue().uge(Size))
710 continue;
711 unsigned IntIdx = Idx->getValue().getZExtValue();
712 Mask[I] = IntIdx;
713 }
714 if (isUndefVector(Vec).all() && HasNonUndefVec)
715 continue;
716 // For correct shuffling we have to have at most 2 different vector operands
717 // in all extractelement instructions.
718 if (!Vec1 || Vec1 == Vec) {
719 Vec1 = Vec;
720 } else if (!Vec2 || Vec2 == Vec) {
721 Vec2 = Vec;
722 Mask[I] += Size;
723 } else {
724 return std::nullopt;
725 }
726 if (CommonShuffleMode == Permute)
727 continue;
728 // If the extract index is not the same as the operation number, it is a
729 // permutation.
730 if (Mask[I] % Size != I) {
731 CommonShuffleMode = Permute;
732 continue;
733 }
734 CommonShuffleMode = Select;
735 }
736 // If we're not crossing lanes in different vectors, consider it as blending.
737 if (CommonShuffleMode == Select && Vec2)
738 return TargetTransformInfo::SK_Select;
739 // If Vec2 was never used, we have a permutation of a single vector, otherwise
740 // we have permutation of 2 vectors.
741 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
742 : TargetTransformInfo::SK_PermuteSingleSrc;
743}
744
745/// \returns True if Extract{Value,Element} instruction extracts element Idx.
746static std::optional<unsigned> getExtractIndex(Instruction *E) {
747 unsigned Opcode = E->getOpcode();
748 assert((Opcode == Instruction::ExtractElement ||
749 Opcode == Instruction::ExtractValue) &&
750 "Expected extractelement or extractvalue instruction.");
751 if (Opcode == Instruction::ExtractElement) {
752 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
753 if (!CI)
754 return std::nullopt;
755 return CI->getZExtValue();
756 }
757 auto *EI = cast<ExtractValueInst>(E);
758 if (EI->getNumIndices() != 1)
759 return std::nullopt;
760 return *EI->idx_begin();
761}
762
763namespace {
764
765/// Main data required for vectorization of instructions.
766struct InstructionsState {
767 /// The very first instruction in the list with the main opcode.
768 Value *OpValue = nullptr;
769
770 /// The main/alternate instruction.
771 Instruction *MainOp = nullptr;
772 Instruction *AltOp = nullptr;
773
774 /// The main/alternate opcodes for the list of instructions.
775 unsigned getOpcode() const {
776 return MainOp ? MainOp->getOpcode() : 0;
777 }
778
779 unsigned getAltOpcode() const {
780 return AltOp ? AltOp->getOpcode() : 0;
781 }
782
783 /// Some of the instructions in the list have alternate opcodes.
784 bool isAltShuffle() const { return AltOp != MainOp; }
785
786 bool isOpcodeOrAlt(Instruction *I) const {
787 unsigned CheckedOpcode = I->getOpcode();
788 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
789 }
790
791 InstructionsState() = delete;
792 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
793 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
794};
795
796} // end anonymous namespace
797
798/// \returns true if \p Opcode is allowed as part of the main/alternate
799/// instruction for SLP vectorization.
800///
801/// Example of unsupported opcode is SDIV that can potentially cause UB if the
802/// "shuffled out" lane would result in division by zero.
803static bool isValidForAlternation(unsigned Opcode) {
804 if (Instruction::isIntDivRem(Opcode))
805 return false;
806
807 return true;
808}
809
810static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
811 const TargetLibraryInfo &TLI,
812 unsigned BaseIndex = 0);
813
814/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
815/// compatible instructions or constants, or just some other regular values.
816static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
817 Value *Op1, const TargetLibraryInfo &TLI) {
818 return (isConstant(BaseOp0) && isConstant(Op0)) ||
819 (isConstant(BaseOp1) && isConstant(Op1)) ||
820 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
821 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
822 BaseOp0 == Op0 || BaseOp1 == Op1 ||
823 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
824 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
825}
826
827/// \returns true if a compare instruction \p CI has similar "look" and
828/// same predicate as \p BaseCI, "as is" or with its operands and predicate
829/// swapped, false otherwise.
830static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
831 const TargetLibraryInfo &TLI) {
832 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
833 "Assessing comparisons of different types?");
834 CmpInst::Predicate BasePred = BaseCI->getPredicate();
835 CmpInst::Predicate Pred = CI->getPredicate();
836 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
837
838 Value *BaseOp0 = BaseCI->getOperand(0);
839 Value *BaseOp1 = BaseCI->getOperand(1);
840 Value *Op0 = CI->getOperand(0);
841 Value *Op1 = CI->getOperand(1);
842
843 return (BasePred == Pred &&
844 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
845 (BasePred == SwappedPred &&
846 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
847}
848
849/// \returns analysis of the Instructions in \p VL described in
850/// InstructionsState, the Opcode that we suppose the whole list
851/// could be vectorized even if its structure is diverse.
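/// For example, VL = {add, sub, add, sub} yields MainOp == the first add and
/// AltOp == the first sub (an alternate-opcode node, isAltShuffle() == true),
/// VL = {add, add, add} yields MainOp == AltOp (no alternation), and a list
/// mixing incompatible instructions (say, a load and an add) yields a state
/// whose getOpcode() is 0, i.e. the list cannot form a single vector node.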
852static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
853 const TargetLibraryInfo &TLI,
854 unsigned BaseIndex) {
855 // Make sure these are all Instructions.
856 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
857 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
858
859 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
860 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
861 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
862 CmpInst::Predicate BasePred =
863 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
864 : CmpInst::BAD_ICMP_PREDICATE;
865 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
866 unsigned AltOpcode = Opcode;
867 unsigned AltIndex = BaseIndex;
868
869 bool SwappedPredsCompatible = [&]() {
870 if (!IsCmpOp)
871 return false;
872 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
873 UniquePreds.insert(BasePred);
874 UniqueNonSwappedPreds.insert(BasePred);
875 for (Value *V : VL) {
876 auto *I = dyn_cast<CmpInst>(V);
877 if (!I)
878 return false;
879 CmpInst::Predicate CurrentPred = I->getPredicate();
880 CmpInst::Predicate SwappedCurrentPred =
881 CmpInst::getSwappedPredicate(CurrentPred);
882 UniqueNonSwappedPreds.insert(CurrentPred);
883 if (!UniquePreds.contains(CurrentPred) &&
884 !UniquePreds.contains(SwappedCurrentPred))
885 UniquePreds.insert(CurrentPred);
886 }
887 // Total number of predicates > 2, but if consider swapped predicates
888 // compatible only 2, consider swappable predicates as compatible opcodes,
889 // not alternate.
890 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
891 }();
892 // Check for one alternate opcode from another BinaryOperator.
893 // TODO - generalize to support all operators (types, calls etc.).
894 auto *IBase = cast<Instruction>(VL[BaseIndex]);
895 Intrinsic::ID BaseID = 0;
896 SmallVector<VFInfo> BaseMappings;
897 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
898 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
899 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
900 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
901 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902 }
903 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
904 auto *I = cast<Instruction>(VL[Cnt]);
905 unsigned InstOpcode = I->getOpcode();
906 if (IsBinOp && isa<BinaryOperator>(I)) {
907 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
908 continue;
909 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
910 isValidForAlternation(Opcode)) {
911 AltOpcode = InstOpcode;
912 AltIndex = Cnt;
913 continue;
914 }
915 } else if (IsCastOp && isa<CastInst>(I)) {
916 Value *Op0 = IBase->getOperand(0);
917 Type *Ty0 = Op0->getType();
918 Value *Op1 = I->getOperand(0);
919 Type *Ty1 = Op1->getType();
920 if (Ty0 == Ty1) {
921 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
922 continue;
923 if (Opcode == AltOpcode) {
924 assert(isValidForAlternation(Opcode) &&
925 isValidForAlternation(InstOpcode) &&
926 "Cast isn't safe for alternation, logic needs to be updated!");
927 AltOpcode = InstOpcode;
928 AltIndex = Cnt;
929 continue;
930 }
931 }
932 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
933 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
934 Type *Ty0 = BaseInst->getOperand(0)->getType();
935 Type *Ty1 = Inst->getOperand(0)->getType();
936 if (Ty0 == Ty1) {
937 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
938 // Check for compatible operands. If the corresponding operands are not
939 // compatible - need to perform alternate vectorization.
940 CmpInst::Predicate CurrentPred = Inst->getPredicate();
941 CmpInst::Predicate SwappedCurrentPred =
942 CmpInst::getSwappedPredicate(CurrentPred);
943
944 if ((E == 2 || SwappedPredsCompatible) &&
945 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
946 continue;
947
948 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
949 continue;
950 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
951 if (AltIndex != BaseIndex) {
952 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
953 continue;
954 } else if (BasePred != CurrentPred) {
955 assert(
956 isValidForAlternation(InstOpcode) &&
957 "CmpInst isn't safe for alternation, logic needs to be updated!");
958 AltIndex = Cnt;
959 continue;
960 }
961 CmpInst::Predicate AltPred = AltInst->getPredicate();
962 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
963 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
964 continue;
965 }
966 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
967 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
968 if (Gep->getNumOperands() != 2 ||
969 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
970 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
971 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
972 if (!isVectorLikeInstWithConstOps(EI))
973 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
974 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
975 auto *BaseLI = cast<LoadInst>(IBase);
976 if (!LI->isSimple() || !BaseLI->isSimple())
977 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
978 } else if (auto *Call = dyn_cast<CallInst>(I)) {
979 auto *CallBase = cast<CallInst>(IBase);
980 if (Call->getCalledFunction() != CallBase->getCalledFunction())
981 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
982 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
983 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
984 Call->op_begin() + Call->getBundleOperandsEndIndex(),
985 CallBase->op_begin() +
986 CallBase->getBundleOperandsStartIndex())))
987 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
988 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
989 if (ID != BaseID)
990 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
991 if (!ID) {
992 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
993 if (Mappings.size() != BaseMappings.size() ||
994 Mappings.front().ISA != BaseMappings.front().ISA ||
995 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
996 Mappings.front().VectorName != BaseMappings.front().VectorName ||
997 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
998 Mappings.front().Shape.Parameters !=
999 BaseMappings.front().Shape.Parameters)
1000 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1001 }
1002 }
1003 continue;
1004 }
1005 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1006 }
1007
1008 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
1009 cast<Instruction>(VL[AltIndex]));
1010}
1011
1012/// \returns true if all of the values in \p VL have the same type or false
1013/// otherwise.
1014static bool allSameType(ArrayRef<Value *> VL) {
1015 Type *Ty = VL.front()->getType();
1016 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1017}
1018
1019/// \returns True if in-tree use also needs extract. This refers to
1020/// possible scalar operand in vectorized instruction.
1021static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1022 TargetLibraryInfo *TLI) {
1023 unsigned Opcode = UserInst->getOpcode();
1024 switch (Opcode) {
1025 case Instruction::Load: {
1026 LoadInst *LI = cast<LoadInst>(UserInst);
1027 return (LI->getPointerOperand() == Scalar);
1028 }
1029 case Instruction::Store: {
1030 StoreInst *SI = cast<StoreInst>(UserInst);
1031 return (SI->getPointerOperand() == Scalar);
1032 }
1033 case Instruction::Call: {
1034 CallInst *CI = cast<CallInst>(UserInst);
1035 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1036 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1037 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
1038 Arg.value().get() == Scalar;
1039 });
1040 }
1041 default:
1042 return false;
1043 }
1044}
1045
1046/// \returns the AA location that is being accessed by the instruction.
1047static MemoryLocation getLocation(Instruction *I) {
1048 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1049 return MemoryLocation::get(SI);
1050 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1051 return MemoryLocation::get(LI);
1052 return MemoryLocation();
1053}
1054
1055/// \returns True if the instruction is not a volatile or atomic load/store.
1056static bool isSimple(Instruction *I) {
1057 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1058 return LI->isSimple();
1059 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1060 return SI->isSimple();
1061 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1062 return !MI->isVolatile();
1063 return true;
1064}
1065
1066/// Shuffles \p Mask in accordance with the given \p SubMask.
1067/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1068/// one but two input vectors.
1069static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1070 bool ExtendingManyInputs = false) {
1071 if (SubMask.empty())
1072 return;
1073 assert(
1074 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1075 // Check if input scalars were extended to match the size of other node.
1076 (SubMask.size() == Mask.size() &&
1077 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
1078 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
1079 "SubMask with many inputs support must be larger than the mask.");
1080 if (Mask.empty()) {
1081 Mask.append(SubMask.begin(), SubMask.end());
1082 return;
1083 }
1084 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1085 int TermValue = std::min(Mask.size(), SubMask.size());
1086 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1087 if (SubMask[I] == PoisonMaskElem ||
1088 (!ExtendingManyInputs &&
1089 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1090 continue;
1091 NewMask[I] = Mask[SubMask[I]];
1092 }
1093 Mask.swap(NewMask);
1094}
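// A worked example of the common (non-extending) case: with an accumulated
// Mask of {2, 0, 1} and a SubMask of {1, 2, 0}, the result is
// {Mask[1], Mask[2], Mask[0]} == {0, 1, 2}; i.e. SubMask is applied on top of
// the previously accumulated permutation.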
1095
1096/// Order may have elements assigned special value (size) which is out of
1097/// bounds. Such indices only appear on places which correspond to undef values
1098/// (see canReuseExtract for details) and are used in order to avoid letting
1099/// undef values affect operand ordering.
1100/// The first loop below simply finds all unused indices and then the next loop
1101/// nest assigns these indices for undef values positions.
1102/// As an example below Order has two undef positions and they have assigned
1103/// values 3 and 7 respectively:
1104/// before: 6 9 5 4 9 2 1 0
1105/// after: 6 3 5 4 7 2 1 0
1106static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1107 const unsigned Sz = Order.size();
1108 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1109 SmallBitVector MaskedIndices(Sz);
1110 for (unsigned I = 0; I < Sz; ++I) {
1111 if (Order[I] < Sz)
1112 UnusedIndices.reset(Order[I]);
1113 else
1114 MaskedIndices.set(I);
1115 }
1116 if (MaskedIndices.none())
1117 return;
1118 assert(UnusedIndices.count() == MaskedIndices.count() &&
1119 "Non-synced masked/available indices.");
1120 int Idx = UnusedIndices.find_first();
1121 int MIdx = MaskedIndices.find_first();
1122 while (MIdx >= 0) {
1123 assert(Idx >= 0 && "Indices must be synced.");
1124 Order[MIdx] = Idx;
1125 Idx = UnusedIndices.find_next(Idx);
1126 MIdx = MaskedIndices.find_next(MIdx);
1127 }
1128}
1129
1130/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1131/// Opcode1.
1132static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1133 unsigned Opcode1) {
1134 Type *ScalarTy = VL[0]->getType();
1135 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1136 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1137 for (unsigned Lane : seq<unsigned>(VL.size()))
1138 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1139 OpcodeMask.set(Lane * ScalarTyNumElements,
1140 Lane * ScalarTyNumElements + ScalarTyNumElements);
1141 return OpcodeMask;
1142}
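// For example, for scalar-typed VL = {add, sub, add, sub} and Opcode1 == Sub
// the returned bitset is {0, 1, 0, 1}; with a REVEC scalar type such as
// <2 x i32>, each lane contributes two consecutive bits instead of one.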
1143
1144namespace llvm {
1145
1146static void inversePermutation(ArrayRef<unsigned> Indices,
1147 SmallVectorImpl<int> &Mask) {
1148 Mask.clear();
1149 const unsigned E = Indices.size();
1150 Mask.resize(E, PoisonMaskElem);
1151 for (unsigned I = 0; I < E; ++I)
1152 Mask[Indices[I]] = I;
1153}
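// For example, Indices == {2, 0, 1} produces Mask == {1, 2, 0}: the element
// that ended up at position 0 came from index 2, so Mask[2] == 0, and so on.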
1154
1155/// Reorders the list of scalars in accordance with the given \p Mask.
1156static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1157 ArrayRef<int> Mask) {
1158 assert(!Mask.empty() && "Expected non-empty mask.");
1159 SmallVector<Value *> Prev(Scalars.size(),
1160 PoisonValue::get(Scalars.front()->getType()));
1161 Prev.swap(Scalars);
1162 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1163 if (Mask[I] != PoisonMaskElem)
1164 Scalars[Mask[I]] = Prev[I];
1165}
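// For example, Scalars == {a, b, c} with Mask == {2, 0, 1} becomes {b, c, a}:
// the old element 0 moves to position 2, element 1 to position 0, and
// element 2 to position 1.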
1166
1167/// Checks if the provided value does not require scheduling. It does not
1168/// require scheduling if this is not an instruction or it is an instruction
1169/// that does not read/write memory and all operands are either not instructions
1170/// or phi nodes or instructions from different blocks.
1171static bool areAllOperandsNonInsts(Value *V) {
1172 auto *I = dyn_cast<Instruction>(V);
1173 if (!I)
1174 return true;
1175 return !mayHaveNonDefUseDependency(*I) &&
1176 all_of(I->operands(), [I](Value *V) {
1177 auto *IO = dyn_cast<Instruction>(V);
1178 if (!IO)
1179 return true;
1180 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1181 });
1182}
1183
1184/// Checks if the provided value does not require scheduling. It does not
1185/// require scheduling if this is not an instruction or it is an instruction
1186/// that does not read/write memory and all users are phi nodes or instructions
1187/// from the different blocks.
1188static bool isUsedOutsideBlock(Value *V) {
1189 auto *I = dyn_cast<Instruction>(V);
1190 if (!I)
1191 return true;
1192 // Limits the number of uses to save compile time.
1193 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1194 all_of(I->users(), [I](User *U) {
1195 auto *IU = dyn_cast<Instruction>(U);
1196 if (!IU)
1197 return true;
1198 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1199 });
1200}
1201
1202/// Checks if the specified value does not require scheduling. It does not
1203/// require scheduling if all operands and all users do not need to be scheduled
1204/// in the current basic block.
1205static bool doesNotNeedToBeScheduled(Value *V) {
1206 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1207}
1208
1209/// Checks if the specified array of instructions does not require scheduling.
1210/// It is so if all instructions either have operands that do not require
1211/// scheduling or have users that do not require scheduling, since those users
1212/// are phis or in other basic blocks.
1213static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1214 return !VL.empty() &&
1215 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1216}
1217
1218namespace slpvectorizer {
1219
1220/// Bottom Up SLP Vectorizer.
1221class BoUpSLP {
1222 struct TreeEntry;
1223 struct ScheduleData;
1226
1227public:
1228 /// Tracks the state we can represent the loads in the given sequence.
1229 enum class LoadsState {
1230 Gather,
1231 Vectorize,
1232 ScatterVectorize,
1233 StridedVectorize
1234 };
1235
1243
1244 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1245 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1246 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1247 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1248 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1249 AC(AC), DB(DB), DL(DL), ORE(ORE),
1250 Builder(Se->getContext(), TargetFolder(*DL)) {
1251 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1252 // Use the vector register size specified by the target unless overridden
1253 // by a command-line option.
1254 // TODO: It would be better to limit the vectorization factor based on
1255 // data type rather than just register size. For example, x86 AVX has
1256 // 256-bit registers, but it does not support integer operations
1257 // at that width (that requires AVX2).
1258 if (MaxVectorRegSizeOption.getNumOccurrences())
1259 MaxVecRegSize = MaxVectorRegSizeOption;
1260 else
1261 MaxVecRegSize =
1262 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1263 .getFixedValue();
1264
1265 if (MinVectorRegSizeOption.getNumOccurrences())
1266 MinVecRegSize = MinVectorRegSizeOption;
1267 else
1268 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1269 }
1270
1271 /// Vectorize the tree that starts with the elements in \p VL.
1272 /// Returns the vectorized root.
1273 Value *vectorizeTree();
1274
1275 /// Vectorize the tree but with the list of externally used values \p
1276 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1277 /// generated extractvalue instructions.
1278 /// \param ReplacedExternals contains the list of replaced external values
1279 /// {scalar, replace} after emitting extractelement for external uses.
1280 Value *
1281 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1282 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1283 Instruction *ReductionRoot = nullptr);
1284
1285 /// \returns the cost incurred by unwanted spills and fills, caused by
1286 /// holding live values over call sites.
1287 InstructionCost getSpillCost() const;
1288
1289 /// \returns the vectorization cost of the subtree that starts at \p VL.
1290 /// A negative number means that this is profitable.
1291 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1292
1293 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1294 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1295 void buildTree(ArrayRef<Value *> Roots,
1296 const SmallDenseSet<Value *> &UserIgnoreLst);
1297
1298 /// Construct a vectorizable tree that starts at \p Roots.
1299 void buildTree(ArrayRef<Value *> Roots);
1300
1301 /// Returns whether the root node has in-tree uses.
1302 bool doesRootHaveInTreeUses() const {
1303 return !VectorizableTree.empty() &&
1304 !VectorizableTree.front()->UserTreeIndices.empty();
1305 }
1306
1307 /// Return the scalars of the root node.
1308 ArrayRef<Value *> getRootNodeScalars() const {
1309 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1310 return VectorizableTree.front()->Scalars;
1311 }
1312
1313 /// Checks if the root graph node can be emitted with narrower bitwidth at
1314 /// codegen and returns its signedness, if so.
1316 return MinBWs.at(VectorizableTree.front().get()).second;
1317 }
1318
1319 /// Builds external uses of the vectorized scalars, i.e. the list of
1320 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1321 /// ExternallyUsedValues contains additional list of external uses to handle
1322 /// vectorization of reductions.
1323 void
1324 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1325
1326 /// Transforms graph nodes to target specific representations, if profitable.
1327 void transformNodes();
1328
1329 /// Clear the internal data structures that are created by 'buildTree'.
1330 void deleteTree() {
1331 VectorizableTree.clear();
1332 ScalarToTreeEntry.clear();
1333 MultiNodeScalars.clear();
1334 MustGather.clear();
1335 NonScheduledFirst.clear();
1336 EntryToLastInstruction.clear();
1337 ExternalUses.clear();
1338 ExternalUsesAsOriginalScalar.clear();
1339 for (auto &Iter : BlocksSchedules) {
1340 BlockScheduling *BS = Iter.second.get();
1341 BS->clear();
1342 }
1343 MinBWs.clear();
1344 ReductionBitWidth = 0;
1345 CastMaxMinBWSizes.reset();
1346 ExtraBitWidthNodes.clear();
1347 InstrElementSize.clear();
1348 UserIgnoreList = nullptr;
1349 PostponedGathers.clear();
1350 ValueToGatherNodes.clear();
1351 }
1352
1353 unsigned getTreeSize() const { return VectorizableTree.size(); }
1354
1355 /// Perform LICM and CSE on the newly generated gather sequences.
1356 void optimizeGatherSequence();
1357
1358 /// Checks if the specified gather tree entry \p TE can be represented as a
1359 /// shuffled vector entry + (possibly) permutation with other gathers. It
1360 /// implements the checks only for possibly ordered scalars (Loads,
1361 /// ExtractElement, ExtractValue), which can be part of the graph.
1362 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1363
1364 /// Sort loads into increasing pointers offsets to allow greater clustering.
1365 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1366
1367 /// Gets reordering data for the given tree entry. If the entry is vectorized
1368 /// - just return ReorderIndices, otherwise check if the scalars can be
1369 /// reordered and return the most optimal order.
1370 /// \return std::nullopt if ordering is not important, empty order, if
1371 /// identity order is important, or the actual order.
1372 /// \param TopToBottom If true, include the order of vectorized stores and
1373 /// insertelement nodes, otherwise skip them.
1374 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1375 bool TopToBottom);
1376
1377 /// Reorders the current graph to the most profitable order starting from the
1378 /// root node to the leaf nodes. The best order is chosen only from the nodes
1379 /// of the same size (vectorization factor). Smaller nodes are considered
1380 /// parts of subgraph with smaller VF and they are reordered independently. We
1381 /// can make it because we still need to extend smaller nodes to the wider VF
1382 /// and we can merge reordering shuffles with the widening shuffles.
1383 void reorderTopToBottom();
1384
1385 /// Reorders the current graph to the most profitable order starting from
1386 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1387 /// number of reshuffles if the leaf nodes use the same order. In this case we
1388 /// can merge the orders and just shuffle user node instead of shuffling its
1389 /// operands. Plus, even the leaf nodes have different orders, it allows to
1390 /// sink reordering in the graph closer to the root node and merge it later
1391 /// during analysis.
1392 void reorderBottomToTop(bool IgnoreReorder = false);
1393
1394 /// \return The vector element size in bits to use when vectorizing the
1395 /// expression tree ending at \p V. If V is a store, the size is the width of
1396 /// the stored value. Otherwise, the size is the width of the largest loaded
1397 /// value reaching V. This method is used by the vectorizer to calculate
1398 /// vectorization factors.
1399 unsigned getVectorElementSize(Value *V);
1400
1401 /// Compute the minimum type sizes required to represent the entries in a
1402 /// vectorizable tree.
1403 void computeMinimumValueSizes();
1404
1405 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1406 unsigned getMaxVecRegSize() const {
1407 return MaxVecRegSize;
1408 }
1409
1410 // \returns minimum vector register size as set by cl::opt.
1411 unsigned getMinVecRegSize() const {
1412 return MinVecRegSize;
1413 }
1414
1415 unsigned getMinVF(unsigned Sz) const {
1416 return std::max(2U, getMinVecRegSize() / Sz);
1417 }
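 // For example, with the default MinVecRegSize of 128 bits, getMinVF(32)
 // returns 4 and getMinVF(8) returns 16; the result is never below 2.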
1418
1419 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1420 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1421 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1422 return MaxVF ? MaxVF : UINT_MAX;
1423 }
1424
1425 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1426 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1427 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1428 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1429 ///
1430 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1431 unsigned canMapToVector(Type *T) const;
1432
1433 /// \returns True if the VectorizableTree is both tiny and not fully
1434 /// vectorizable. We do not vectorize such trees.
1435 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1436
1437 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1438 /// can be load combined in the backend. Load combining may not be allowed in
1439 /// the IR optimizer, so we do not want to alter the pattern. For example,
1440 /// partially transforming a scalar bswap() pattern into vector code is
1441 /// effectively impossible for the backend to undo.
1442 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1443 /// may not be necessary.
1444 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1445
1446 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1447 /// can be load combined in the backend. Load combining may not be allowed in
1448 /// the IR optimizer, so we do not want to alter the pattern. For example,
1449 /// partially transforming a scalar bswap() pattern into vector code is
1450 /// effectively impossible for the backend to undo.
1451 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1452 /// may not be necessary.
1453 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1454
1455 /// Checks if the given array of loads can be represented as a vectorized,
1456 /// scatter or just simple gather.
1457 /// \param VL list of loads.
1458 /// \param VL0 main load value.
1459 /// \param Order returned order of load instructions.
1460 /// \param PointerOps returned list of pointer operands.
1461 /// \param TryRecursiveCheck used to check if long masked gather can be
1462 /// represented as a series of loads/insert subvector, if profitable.
1465 SmallVectorImpl<Value *> &PointerOps,
1466 bool TryRecursiveCheck = true) const;
1467
1469
1470 /// This structure holds any data we need about the edges being traversed
1471 /// during buildTree_rec(). We keep track of:
1472 /// (i) the user TreeEntry index, and
1473 /// (ii) the index of the edge.
1474 struct EdgeInfo {
1475 EdgeInfo() = default;
1476 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1477 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1478 /// The user TreeEntry.
1479 TreeEntry *UserTE = nullptr;
1480 /// The operand index of the use.
1481 unsigned EdgeIdx = UINT_MAX;
1482#ifndef NDEBUG
1483 friend inline raw_ostream &operator<<(raw_ostream &OS,
1484 const BoUpSLP::EdgeInfo &EI) {
1485 EI.dump(OS);
1486 return OS;
1487 }
1488 /// Debug print.
1489 void dump(raw_ostream &OS) const {
1490 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1491 << " EdgeIdx:" << EdgeIdx << "}";
1492 }
1493 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1494#endif
1495 bool operator == (const EdgeInfo &Other) const {
1496 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1497 }
1498 };
1499
1500 /// A helper class used for scoring candidates for two consecutive lanes.
1501 class LookAheadHeuristics {
1502 const TargetLibraryInfo &TLI;
1503 const DataLayout &DL;
1504 ScalarEvolution &SE;
1505 const BoUpSLP &R;
1506 int NumLanes; // Total number of lanes (aka vectorization factor).
1507 int MaxLevel; // The maximum recursion depth for accumulating score.
1508
1509 public:
1510 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1511 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1512 int MaxLevel)
1513 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1514 MaxLevel(MaxLevel) {}
1515
1516 // The hard-coded scores listed here are not very important, though it shall
1517 // be higher for better matches to improve the resulting cost. When
1518 // computing the scores of matching one sub-tree with another, we are
1519 // basically counting the number of values that are matching. So even if all
1520 // scores are set to 1, we would still get a decent matching result.
1521 // However, sometimes we have to break ties. For example we may have to
1522 // choose between matching loads vs matching opcodes. This is what these
1523 // scores are helping us with: they provide the order of preference. Also,
1524 // this is important if the scalar is externally used or used in another
1525 // tree entry node in the different lane.
1526
1527 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1528 static const int ScoreConsecutiveLoads = 4;
1529 /// The same load multiple times. This should have a better score than
1530 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1531 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1532 /// a vector load and 1.0 for a broadcast.
1533 static const int ScoreSplatLoads = 3;
1534 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1535 static const int ScoreReversedLoads = 3;
1536 /// A load candidate for masked gather.
1537 static const int ScoreMaskedGatherCandidate = 1;
1538 /// ExtractElementInst from same vector and consecutive indexes.
1539 static const int ScoreConsecutiveExtracts = 4;
1540 /// ExtractElementInst from same vector and reversed indices.
1541 static const int ScoreReversedExtracts = 3;
1542 /// Constants.
1543 static const int ScoreConstants = 2;
1544 /// Instructions with the same opcode.
1545 static const int ScoreSameOpcode = 2;
1547 /// Instructions with alt opcodes (e.g., add + sub).
1547 static const int ScoreAltOpcodes = 1;
1548 /// Identical instructions (a.k.a. splat or broadcast).
1549 static const int ScoreSplat = 1;
1550 /// Matching with an undef is preferable to failing.
1551 static const int ScoreUndef = 1;
1552 /// Score for failing to find a decent match.
1553 static const int ScoreFail = 0;
1554 /// Score if all users are vectorized.
1555 static const int ScoreAllUserVectorized = 1;
1556
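// For example, with the constants above, pairing load(A[i]) with load(A[i+1])
// scores ScoreConsecutiveLoads (4), pairing two distinct adds scores
// ScoreSameOpcode (2), and pairing an add with a sub scores ScoreAltOpcodes
// (1), so a candidate that forms consecutive loads wins such a tie.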
1557 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1558 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1559 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1560 /// MainAltOps.
1561 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1562 ArrayRef<Value *> MainAltOps) const {
1563 if (!isValidElementType(V1->getType()) ||
1564 !isValidElementType(V2->getType()))
1565 return LookAheadHeuristics::ScoreFail;
1566
1567 if (V1 == V2) {
1568 if (isa<LoadInst>(V1)) {
1569 // Returns true if the users of V1 and V2 won't need to be extracted.
1570 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1571 // Bail out if we have too many uses to save compilation time.
1572 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1573 return false;
1574
1575 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1576 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1577 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1578 });
1579 };
1580 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1581 };
1582 // A broadcast of a load can be cheaper on some targets.
1583 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1584 ElementCount::getFixed(NumLanes)) &&
1585 ((int)V1->getNumUses() == NumLanes ||
1586 AllUsersAreInternal(V1, V2)))
1587 return LookAheadHeuristics::ScoreSplatLoads;
1588 }
1589 return LookAheadHeuristics::ScoreSplat;
1590 }
1591
1592 auto CheckSameEntryOrFail = [&]() {
1593 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1594 TE1 && TE1 == R.getTreeEntry(V2))
1595 return LookAheadHeuristics::ScoreSplatLoads;
1596 return LookAheadHeuristics::ScoreFail;
1597 };
1598
1599 auto *LI1 = dyn_cast<LoadInst>(V1);
1600 auto *LI2 = dyn_cast<LoadInst>(V2);
1601 if (LI1 && LI2) {
1602 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1603 !LI2->isSimple())
1604 return CheckSameEntryOrFail();
1605
1606 std::optional<int> Dist = getPointersDiff(
1607 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1608 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1609 if (!Dist || *Dist == 0) {
1610 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1611 getUnderlyingObject(LI2->getPointerOperand()) &&
1612 R.TTI->isLegalMaskedGather(
1613 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1614 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1615 return CheckSameEntryOrFail();
1616 }
1617 // The distance is too large - still may be profitable to use masked
1618 // loads/gathers.
1619 if (std::abs(*Dist) > NumLanes / 2)
1620 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1621 // This still will detect consecutive loads, but we might have "holes"
1622 // in some cases. It is ok for non-power-2 vectorization and may produce
1623 // better results. It should not affect current vectorization.
1624 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1625 : LookAheadHeuristics::ScoreReversedLoads;
1626 }
1627
1628 auto *C1 = dyn_cast<Constant>(V1);
1629 auto *C2 = dyn_cast<Constant>(V2);
1630 if (C1 && C2)
1631 return LookAheadHeuristics::ScoreConstants;
1632
1633 // Extracts from consecutive indexes of the same vector get a better score
1634 // because the extracts could be optimized away.
1635 Value *EV1;
1636 ConstantInt *Ex1Idx;
1637 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1638 // Undefs are always profitable for extractelements.
1639 // Compiler can easily combine poison and extractelement <non-poison> or
1640 // undef and extractelement <poison>. But combining undef +
1641 // extractelement <non-poison-but-may-produce-poison> requires some
1642 // extra operations.
1643 if (isa<UndefValue>(V2))
1644 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1645 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1646 : LookAheadHeuristics::ScoreSameOpcode;
1647 Value *EV2 = nullptr;
1648 ConstantInt *Ex2Idx = nullptr;
1649 if (match(V2,
1650 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1651 m_Undef())))) {
1652 // Undefs are always profitable for extractelements.
1653 if (!Ex2Idx)
1654 return LookAheadHeuristics::ScoreSameOpcode;
1655 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1656 return LookAheadHeuristics::ScoreSameOpcode;
1657 if (EV2 == EV1) {
1658 int Idx1 = Ex1Idx->getZExtValue();
1659 int Idx2 = Ex2Idx->getZExtValue();
1660 int Dist = Idx2 - Idx1;
1661 // The distance is too large - still may be profitable to use
1662 // shuffles.
1663 if (std::abs(Dist) == 0)
1664 return LookAheadHeuristics::ScoreSplat;
1665 if (std::abs(Dist) > NumLanes / 2)
1666 return LookAheadHeuristics::ScoreSameOpcode;
1667 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1668 : LookAheadHeuristics::ScoreReversedExtracts;
1669 }
1670 return LookAheadHeuristics::ScoreAltOpcodes;
1671 }
1672 return CheckSameEntryOrFail();
1673 }
1674
1675 auto *I1 = dyn_cast<Instruction>(V1);
1676 auto *I2 = dyn_cast<Instruction>(V2);
1677 if (I1 && I2) {
1678 if (I1->getParent() != I2->getParent())
1679 return CheckSameEntryOrFail();
1680 SmallVector<Value *, 4> Ops(MainAltOps);
1681 Ops.push_back(I1);
1682 Ops.push_back(I2);
1683 InstructionsState S = getSameOpcode(Ops, TLI);
1684 // Note: Only consider instructions with <= 2 operands to avoid
1685 // complexity explosion.
1686 if (S.getOpcode() &&
1687 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1688 !S.isAltShuffle()) &&
1689 all_of(Ops, [&S](Value *V) {
1690 return cast<Instruction>(V)->getNumOperands() ==
1691 S.MainOp->getNumOperands();
1692 }))
1693 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1694 : LookAheadHeuristics::ScoreSameOpcode;
1695 }
1696
1697 if (isa<UndefValue>(V2))
1698 return LookAheadHeuristics::ScoreUndef;
1699
1700 return CheckSameEntryOrFail();
1701 }
1702
1703 /// Go through the operands of \p LHS and \p RHS recursively until
1704 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1705 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1706 /// of \p U1 and \p U2), except at the beginning of the recursion where
1707 /// these are set to nullptr.
1708 ///
1709 /// For example:
1710 /// \verbatim
1711 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1712 /// \ / \ / \ / \ /
1713 /// + + + +
1714 /// G1 G2 G3 G4
1715 /// \endverbatim
1716 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1717 /// each level recursively, accumulating the score. It starts from matching
1718 /// the additions at level 0, then moves on to the loads (level 1). The
1719 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1720 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1721 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1722 /// Please note that the order of the operands does not matter, as we
1723 /// evaluate the score of all profitable combinations of operands. In
1724 /// other words the score of G1 and G4 is the same as G1 and G2. This
1725 /// heuristic is based on ideas described in:
1726 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1727 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1728 /// Luís F. W. Góes
1729 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1730 Instruction *U2, int CurrLevel,
1731 ArrayRef<Value *> MainAltOps) const {
1732
1733 // Get the shallow score of LHS and RHS.
1734 int ShallowScoreAtThisLevel =
1735 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1736
1737 // If reached MaxLevel,
1738 // or if LHS and RHS are not instructions,
1739 // or if they are SPLAT,
1740 // or if they are not consecutive,
1741 // or if profitable to vectorize loads or extractelements, early return
1742 // the current cost.
1743 auto *I1 = dyn_cast<Instruction>(LHS);
1744 auto *I2 = dyn_cast<Instruction>(RHS);
1745 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1746 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1747 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1748 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1749 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1750 ShallowScoreAtThisLevel))
1751 return ShallowScoreAtThisLevel;
1752 assert(I1 && I2 && "Should have early exited.");
1753
1754 // Contains the I2 operand indexes that got matched with I1 operands.
1755 SmallSet<unsigned, 4> Op2Used;
1756
1757 // Recursion towards the operands of I1 and I2. We are trying all possible
1758 // operand pairs, and keeping track of the best score.
1759 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1760 OpIdx1 != NumOperands1; ++OpIdx1) {
1761 // Try to pair operand OpIdx1 of I1 with the best operand of I2.
1762 int MaxTmpScore = 0;
1763 unsigned MaxOpIdx2 = 0;
1764 bool FoundBest = false;
1765 // If I2 is commutative try all combinations.
1766 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1767 unsigned ToIdx = isCommutative(I2)
1768 ? I2->getNumOperands()
1769 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1770 assert(FromIdx <= ToIdx && "Bad index");
1771 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1772 // Skip operands already paired with OpIdx1.
1773 if (Op2Used.count(OpIdx2))
1774 continue;
1775 // Recursively calculate the cost at each level
1776 int TmpScore =
1777 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1778 I1, I2, CurrLevel + 1, std::nullopt);
1779 // Look for the best score.
1780 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1781 TmpScore > MaxTmpScore) {
1782 MaxTmpScore = TmpScore;
1783 MaxOpIdx2 = OpIdx2;
1784 FoundBest = true;
1785 }
1786 }
1787 if (FoundBest) {
1788 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1789 Op2Used.insert(MaxOpIdx2);
1790 ShallowScoreAtThisLevel += MaxTmpScore;
1791 }
1792 }
1793 return ShallowScoreAtThisLevel;
1794 }
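// Rough worked example for the G1..G4 diagram above (ignoring the splat and
// external-use adjustments applied by the callers, and assuming A, B, C, D
// are different arrays): matching G1 with G2 yields ScoreSameOpcode (2) for
// the two additions plus ScoreConsecutiveLoads (4) for {A[0],A[1]} and
// another 4 for {B[0],B[1]}, i.e. 10 in total, whereas matching G1 with G3
// only keeps the 2 for the additions because {A[0],C[0]} and {B[0],D[0]}
// both fail with ScoreFail.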
1795 };
1796 /// A helper data structure to hold the operands of a vector of instructions.
1797 /// This supports a fixed vector length for all operand vectors.
1798 class VLOperands {
1799 /// For each operand we need (i) the value, and (ii) the opcode that it
1800 /// would be attached to if the expression was in a left-linearized form.
1801 /// This is required to avoid illegal operand reordering.
1802 /// For example:
1803 /// \verbatim
1804 /// 0 Op1
1805 /// |/
1806 /// Op1 Op2 Linearized + Op2
1807 /// \ / ----------> |/
1808 /// - -
1809 ///
1810 /// Op1 - Op2 (0 + Op1) - Op2
1811 /// \endverbatim
1812 ///
1813 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1814 ///
1815 /// Another way to think of this is to track all the operations across the
1816 /// path from the operand all the way to the root of the tree and to
1817 /// calculate the operation that corresponds to this path. For example, the
1818 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1819 /// corresponding operation is a '-' (which matches the one in the
1820 /// linearized tree, as shown above).
1821 ///
1822 /// For lack of a better term, we refer to this operation as Accumulated
1823 /// Path Operation (APO).
1824 struct OperandData {
1825 OperandData() = default;
1826 OperandData(Value *V, bool APO, bool IsUsed)
1827 : V(V), APO(APO), IsUsed(IsUsed) {}
1828 /// The operand value.
1829 Value *V = nullptr;
1830 /// TreeEntries only allow a single opcode, or an alternate sequence of
1831 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1832 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1833 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1834 /// (e.g., Add/Mul).
1835 bool APO = false;
1836 /// Helper data for the reordering function.
1837 bool IsUsed = false;
1838 };
1839
1840 /// During operand reordering, we are trying to select the operand at lane
1841 /// that matches best with the operand at the neighboring lane. Our
1842 /// selection is based on the type of value we are looking for. For example,
1843 /// if the neighboring lane has a load, we need to look for a load that is
1844 /// accessing a consecutive address. These strategies are summarized in the
1845 /// 'ReorderingMode' enumerator.
1846 enum class ReorderingMode {
1847 Load, ///< Matching loads to consecutive memory addresses
1848 Opcode, ///< Matching instructions based on opcode (same or alternate)
1849 Constant, ///< Matching constants
1850 Splat, ///< Matching the same instruction multiple times (broadcast)
1851 Failed, ///< We failed to create a vectorizable group
1852 };
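// For example, for the two lanes {load(A[0]) + x, load(A[1]) + y} the first
// operand position is processed in Load mode (look for a load from a
// consecutive address), while the second one typically gets Opcode mode if x
// and y are instructions, Constant mode if they are constants, or Splat mode
// if the same value repeats across the lanes.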
1853
1854 using OperandDataVec = SmallVector<OperandData, 2>;
1855
1856 /// A vector of operand vectors.
1857 SmallVector<OperandDataVec, 4> OpsVec;
1858
1859 const TargetLibraryInfo &TLI;
1860 const DataLayout &DL;
1861 ScalarEvolution &SE;
1862 const BoUpSLP &R;
1863 const Loop *L = nullptr;
1864
1865 /// \returns the operand data at \p OpIdx and \p Lane.
1866 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1867 return OpsVec[OpIdx][Lane];
1868 }
1869
1870 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1871 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1872 return OpsVec[OpIdx][Lane];
1873 }
1874
1875 /// Clears the used flag for all entries.
1876 void clearUsed() {
1877 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1878 OpIdx != NumOperands; ++OpIdx)
1879 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1880 ++Lane)
1881 OpsVec[OpIdx][Lane].IsUsed = false;
1882 }
1883
1884 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1885 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1886 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1887 }
1888
1889 /// \param Lane lane of the operands under analysis.
1890 /// \param OpIdx operand index in \p Lane lane for which we're looking for the
1891 /// best candidate.
1892 /// \param Idx operand index of the current candidate value.
1893 /// \returns The additional score due to possible broadcasting of the
1894 /// elements in the lane. It is more profitable to have a power-of-2 number
1895 /// of unique elements in the lane, since such a lane will be vectorized with
1896 /// higher probability after removing duplicates. Currently the SLP vectorizer
1897 /// supports only vectorization of a power-of-2 number of unique scalars.
1898 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1899 Value *IdxLaneV = getData(Idx, Lane).V;
1900 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1901 return 0;
1902 SmallPtrSet<Value *, 4> Uniques;
1903 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1904 if (Ln == Lane)
1905 continue;
1906 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1907 if (!isa<Instruction>(OpIdxLnV))
1908 return 0;
1909 Uniques.insert(OpIdxLnV);
1910 }
1911 int UniquesCount = Uniques.size();
1912 int UniquesCntWithIdxLaneV =
1913 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1914 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1915 int UniquesCntWithOpIdxLaneV =
1916 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1917 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1918 return 0;
1919 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1920 UniquesCntWithOpIdxLaneV) -
1921 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1922 }
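// Small worked example: if the other lanes already contribute 3 unique
// instructions, a candidate that is new to that set brings the count to 4 (a
// power of 2, no padding needed), while a current operand that duplicates an
// existing value keeps it at 3 (one lane of padding to reach 4), so the
// candidate is favored with a score of (4 - 3) - (4 - 4) = 1.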
1923
1924 /// \param Lane lane of the operands under analysis.
1925 /// \param OpIdx operand index in \p Lane lane for which we're looking for the
1926 /// best candidate.
1927 /// \param Idx operand index of the current candidate value.
1928 /// \returns The additional score for the scalar whose users are all
1929 /// vectorized.
1930 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1931 Value *IdxLaneV = getData(Idx, Lane).V;
1932 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1933 // Do not care about number of uses for vector-like instructions
1934 // (extractelement/extractvalue with constant indices), they are extracts
1935 // themselves and already externally used. Vectorization of such
1936 // instructions does not add extra extractelement instruction, just may
1937 // remove it.
1938 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1939 isVectorLikeInstWithConstOps(OpIdxLaneV))
1940 return 0;
1941 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1942 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1943 return 0;
1944 return R.areAllUsersVectorized(IdxLaneI)
1945 ? LookAheadHeuristics::ScoreAllUserVectorized
1946 : 0;
1947 }
1948
1949 /// Score scaling factor for fully compatible instructions but with
1950 /// different number of external uses. Allows better selection of the
1951 /// instructions with less external uses.
1952 static const int ScoreScaleFactor = 10;
1953
1954 /// \Returns the look-ahead score, which tells us how much the sub-trees
1955 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1956 /// score. This helps break ties in an informed way when we cannot decide on
1957 /// the order of the operands by just considering the immediate
1958 /// predecessors.
1959 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1960 int Lane, unsigned OpIdx, unsigned Idx,
1961 bool &IsUsed) {
1962 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1963 LookAheadMaxDepth);
1964 // Keep track of the instruction stack as we recurse into the operands
1965 // during the look-ahead score exploration.
1966 int Score =
1967 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1968 /*CurrLevel=*/1, MainAltOps);
1969 if (Score) {
1970 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1971 if (Score <= -SplatScore) {
1972 // Set the minimum score for splat-like sequence to avoid setting
1973 // failed state.
1974 Score = 1;
1975 } else {
1976 Score += SplatScore;
1977 // Scale score to see the difference between different operands
1978 // and similar operands but all vectorized/not all vectorized
1979 // uses. It does not affect actual selection of the best
1980 // compatible operand in general, just allows to select the
1981 // operand with all vectorized uses.
1982 Score *= ScoreScaleFactor;
1983 Score += getExternalUseScore(Lane, OpIdx, Idx);
1984 IsUsed = true;
1985 }
1986 }
1987 return Score;
1988 }
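// For instance, if two candidates both reach a raw look-ahead score of 4
// (ignoring the splat adjustment), scaling turns both into 40, and the one
// whose users are all vectorized ends up at 41; the factor of 10 keeps this
// tie-breaker from overriding genuinely different base scores.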
1989
1990 /// Best defined scores per lanes between the passes. Used to choose the
1991 /// best operand (with the highest score) between the passes.
1992 /// The key - {Operand Index, Lane}.
1993 /// The value - the best score between the passes for the lane and the
1994 /// operand.
1995 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1996 BestScoresPerLanes;
1997
1998 // Search all operands in Ops[*][Lane] for the one that matches best
1999 // Ops[OpIdx][LastLane] and return its operand index.
2000 // If no good match can be found, return std::nullopt.
2001 std::optional<unsigned>
2002 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2003 ArrayRef<ReorderingMode> ReorderingModes,
2004 ArrayRef<Value *> MainAltOps) {
2005 unsigned NumOperands = getNumOperands();
2006
2007 // The operand of the previous lane at OpIdx.
2008 Value *OpLastLane = getData(OpIdx, LastLane).V;
2009
2010 // Our strategy mode for OpIdx.
2011 ReorderingMode RMode = ReorderingModes[OpIdx];
2012 if (RMode == ReorderingMode::Failed)
2013 return std::nullopt;
2014
2015 // The linearized opcode of the operand at OpIdx, Lane.
2016 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2017
2018 // The best operand index and its score.
2019 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2020 // are using the score to differentiate between the two.
2021 struct BestOpData {
2022 std::optional<unsigned> Idx;
2023 unsigned Score = 0;
2024 } BestOp;
2025 BestOp.Score =
2026 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2027 .first->second;
2028
2029 // Track if the operand must be marked as used. If the operand is set to
2030 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2031 // we may want to re-estimate the operands again on the following iterations.
2032 bool IsUsed = RMode == ReorderingMode::Splat ||
2033 RMode == ReorderingMode::Constant ||
2034 RMode == ReorderingMode::Load;
2035 // Iterate through all unused operands and look for the best.
2036 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2037 // Get the operand at Idx and Lane.
2038 OperandData &OpData = getData(Idx, Lane);
2039 Value *Op = OpData.V;
2040 bool OpAPO = OpData.APO;
2041
2042 // Skip already selected operands.
2043 if (OpData.IsUsed)
2044 continue;
2045
2046 // Skip if we are trying to move the operand to a position with a
2047 // different opcode in the linearized tree form. This would break the
2048 // semantics.
2049 if (OpAPO != OpIdxAPO)
2050 continue;
2051
2052 // Look for an operand that matches the current mode.
2053 switch (RMode) {
2054 case ReorderingMode::Load:
2055 case ReorderingMode::Opcode: {
2056 bool LeftToRight = Lane > LastLane;
2057 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2058 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2059 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2060 OpIdx, Idx, IsUsed);
2061 if (Score > static_cast<int>(BestOp.Score) ||
2062 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2063 Idx == OpIdx)) {
2064 BestOp.Idx = Idx;
2065 BestOp.Score = Score;
2066 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2067 }
2068 break;
2069 }
2070 case ReorderingMode::Constant:
2071 if (isa<Constant>(Op) ||
2072 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2073 BestOp.Idx = Idx;
2074 if (isa<Constant>(Op)) {
2075 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2076 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2077 LookAheadHeuristics::ScoreConstants;
2078 }
2079 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2080 IsUsed = false;
2081 }
2082 break;
2083 case ReorderingMode::Splat:
2084 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2085 IsUsed = Op == OpLastLane;
2086 if (Op == OpLastLane) {
2087 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2088 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2089 LookAheadHeuristics::ScoreSplat;
2090 }
2091 BestOp.Idx = Idx;
2092 }
2093 break;
2094 case ReorderingMode::Failed:
2095 llvm_unreachable("Not expected Failed reordering mode.");
2096 }
2097 }
2098
2099 if (BestOp.Idx) {
2100 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2101 return BestOp.Idx;
2102 }
2103 // If we could not find a good match return std::nullopt.
2104 return std::nullopt;
2105 }
2106
2107 /// Helper for reorderOperandVecs.
2108 /// \returns the lane that we should start reordering from. This is the one
2109 /// which has the least number of operands that can freely move about, or is
2110 /// the least profitable to reorder because it already has the most optimal set of operands.
2111 unsigned getBestLaneToStartReordering() const {
2112 unsigned Min = UINT_MAX;
2113 unsigned SameOpNumber = 0;
2114 // std::pair<unsigned, unsigned> is used to implement a simple voting
2115 // algorithm and choose the lane with the least number of operands that
2116 // can freely move about, or is the least profitable because it already has the
2117 // most optimal set of operands. The first unsigned is a counter for
2118 // voting, the second unsigned is the counter of lanes with instructions
2119 // with same/alternate opcodes and same parent basic block.
2120 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2121 // Try to be closer to the original results, if we have multiple lanes
2122 // with same cost. If 2 lanes have the same cost, use the one with the
2123 // lowest index.
2124 for (int I = getNumLanes(); I > 0; --I) {
2125 unsigned Lane = I - 1;
2126 OperandsOrderData NumFreeOpsHash =
2127 getMaxNumOperandsThatCanBeReordered(Lane);
2128 // Compare the number of operands that can move and choose the one with
2129 // the least number.
2130 if (NumFreeOpsHash.NumOfAPOs < Min) {
2131 Min = NumFreeOpsHash.NumOfAPOs;
2132 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2133 HashMap.clear();
2134 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2135 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2136 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2137 // Select the most optimal lane in terms of number of operands that
2138 // should be moved around.
2139 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2140 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2141 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2142 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2143 auto *It = HashMap.find(NumFreeOpsHash.Hash);
2144 if (It == HashMap.end())
2145 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2146 else
2147 ++It->second.first;
2148 }
2149 }
2150 // Select the lane with the minimum counter.
2151 unsigned BestLane = 0;
2152 unsigned CntMin = UINT_MAX;
2153 for (const auto &Data : reverse(HashMap)) {
2154 if (Data.second.first < CntMin) {
2155 CntMin = Data.second.first;
2156 BestLane = Data.second.second;
2157 }
2158 }
2159 return BestLane;
2160 }
2161
2162 /// Data structure that helps to reorder operands.
2163 struct OperandsOrderData {
2164 /// The best number of operands with the same APOs, which can be
2165 /// reordered.
2166 unsigned NumOfAPOs = UINT_MAX;
2167 /// Number of operands with the same/alternate instruction opcode and
2168 /// parent.
2169 unsigned NumOpsWithSameOpcodeParent = 0;
2170 /// Hash for the actual operands ordering.
2171 /// Used to count operands (actually, their position ids and opcode
2172 /// values). It is used in the voting mechanism to find the lane with the
2173 /// least number of operands that can freely move about, or is the least
2174 /// profitable because it already has the most optimal set of operands. Can be
2175 /// replaced with SmallVector<unsigned> instead but hash code is faster
2176 /// and requires less memory.
2177 unsigned Hash = 0;
2178 };
2179 /// \returns the maximum number of operands that are allowed to be reordered
2180 /// for \p Lane and the number of compatible instructions (with the same
2181 /// parent/opcode). This is used as a heuristic for selecting the first lane
2182 /// to start operand reordering.
2183 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2184 unsigned CntTrue = 0;
2185 unsigned NumOperands = getNumOperands();
2186 // Operands with the same APO can be reordered. We therefore need to count
2187 // how many of them we have for each APO, like this: Cnt[APO] = x.
2188 // Since we only have two APOs, namely true and false, we can avoid using
2189 // a map. Instead we can simply count the number of operands that
2190 // correspond to one of them (in this case the 'true' APO), and calculate
2191 // the other by subtracting it from the total number of operands.
2192 // Operands with the same instruction opcode and parent are more
2193 // profitable since we don't need to move them in many cases, with a high
2194 // probability such lane already can be vectorized effectively.
2195 bool AllUndefs = true;
2196 unsigned NumOpsWithSameOpcodeParent = 0;
2197 Instruction *OpcodeI = nullptr;
2198 BasicBlock *Parent = nullptr;
2199 unsigned Hash = 0;
2200 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2201 const OperandData &OpData = getData(OpIdx, Lane);
2202 if (OpData.APO)
2203 ++CntTrue;
2204 // Use Boyer-Moore majority voting for finding the majority opcode and
2205 // the number of times it occurs.
2206 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2207 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2208 I->getParent() != Parent) {
2209 if (NumOpsWithSameOpcodeParent == 0) {
2210 NumOpsWithSameOpcodeParent = 1;
2211 OpcodeI = I;
2212 Parent = I->getParent();
2213 } else {
2214 --NumOpsWithSameOpcodeParent;
2215 }
2216 } else {
2217 ++NumOpsWithSameOpcodeParent;
2218 }
2219 }
2220 Hash = hash_combine(
2221 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2222 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2223 }
2224 if (AllUndefs)
2225 return {};
2226 OperandsOrderData Data;
2227 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2228 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2229 Data.Hash = Hash;
2230 return Data;
2231 }
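// As an illustration: a lane coming from a subtraction has operand 0 with
// APO == false and operand 1 with APO == true, so CntTrue == 1 and
// NumOfAPOs == max(1, 2 - 1) == 1, while a lane coming from an addition has
// both APOs false and NumOfAPOs == 2; the lane with the smaller value is the
// more constrained one and is preferred as the starting lane.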
2232
2233 /// Go through the instructions in VL and append their operands.
2234 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2235 assert(!VL.empty() && "Bad VL");
2236 assert((empty() || VL.size() == getNumLanes()) &&
2237 "Expected same number of lanes");
2238 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2239 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2240 constexpr unsigned IntrinsicNumOperands = 2;
2241 if (isa<IntrinsicInst>(VL[0]))
2242 NumOperands = IntrinsicNumOperands;
2243 OpsVec.resize(NumOperands);
2244 unsigned NumLanes = VL.size();
2245 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2246 OpsVec[OpIdx].resize(NumLanes);
2247 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2248 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2249 // Our tree has just 3 nodes: the root and two operands.
2250 // It is therefore trivial to get the APO. We only need to check the
2251 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2252 // RHS operand. The LHS operand of both add and sub is never attached
2253 // to an inverse operation in the linearized form, therefore its APO
2254 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2255
2256 // Since operand reordering is performed on groups of commutative
2257 // operations or alternating sequences (e.g., +, -), we can safely
2258 // tell the inverse operations by checking commutativity.
2259 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2260 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2261 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2262 APO, false};
2263 }
2264 }
2265 }
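// For example, for VL = {a + b, c - d} this produces OpsVec[0] = {a, c} and
// OpsVec[1] = {b, d}, where a, c and b get APO == false and only d gets
// APO == true, because the subtraction is the only non-commutative lane and
// only its RHS is attached to an inverse operation in the linearized form.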
2266
2267 /// \returns the number of operands.
2268 unsigned getNumOperands() const { return OpsVec.size(); }
2269
2270 /// \returns the number of lanes.
2271 unsigned getNumLanes() const { return OpsVec[0].size(); }
2272
2273 /// \returns the operand value at \p OpIdx and \p Lane.
2274 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2275 return getData(OpIdx, Lane).V;
2276 }
2277
2278 /// \returns true if the data structure is empty.
2279 bool empty() const { return OpsVec.empty(); }
2280
2281 /// Clears the data.
2282 void clear() { OpsVec.clear(); }
2283
2284 /// \Returns true if there are enough operands identical to \p Op to fill
2285 /// the whole vector (possibly mixed with constants or loop-invariant values).
2286 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2287 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2288 bool OpAPO = getData(OpIdx, Lane).APO;
2289 bool IsInvariant = L && L->isLoopInvariant(Op);
2290 unsigned Cnt = 0;
2291 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2292 if (Ln == Lane)
2293 continue;
2294 // This is set to true if we found a candidate for broadcast at Lane.
2295 bool FoundCandidate = false;
2296 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2297 OperandData &Data = getData(OpI, Ln);
2298 if (Data.APO != OpAPO || Data.IsUsed)
2299 continue;
2300 Value *OpILane = getValue(OpI, Lane);
2301 bool IsConstantOp = isa<Constant>(OpILane);
2302 // Consider the broadcast candidate if:
2303 // 1. Same value is found in one of the operands.
2304 if (Data.V == Op ||
2305 // 2. The operand in the given lane is not constant but there is a
2306 // constant operand in another lane (which can be moved to the
2307 // given lane). In this case we can represent it as a simple
2308 // permutation of constant and broadcast.
2309 (!IsConstantOp &&
2310 ((Lns > 2 && isa<Constant>(Data.V)) ||
2311 // 2.1. If we have only 2 lanes, need to check that value in the
2312 // next lane does not build same opcode sequence.
2313 (Lns == 2 &&
2314 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2315 .getOpcode() &&
2316 isa<Constant>(Data.V)))) ||
2317 // 3. The operand in the current lane is loop invariant (can be
2318 // hoisted out) and another operand is also a loop invariant
2319 // (though not a constant). In this case the whole vector can be
2320 // hoisted out.
2321 // FIXME: need to teach the cost model about this case for better
2322 // estimation.
2323 (IsInvariant && !isa<Constant>(Data.V) &&
2324 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2325 L->isLoopInvariant(Data.V))) {
2326 FoundCandidate = true;
2327 Data.IsUsed = Data.V == Op;
2328 if (Data.V == Op)
2329 ++Cnt;
2330 break;
2331 }
2332 }
2333 if (!FoundCandidate)
2334 return false;
2335 }
2336 return getNumLanes() == 2 || Cnt > 1;
2337 }
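// Sketch of the intent: for the two lanes {x + 1, 2 + x}, asking whether x
// (operand 0 of the first lane) should be broadcast finds x again at operand
// 1 of the second lane, so the function returns true and the constants 1 and
// 2 can then be grouped into the other operand vector.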
2338
2339 /// Checks if there is at least a single operand in a lane other than
2340 /// \p Lane that is compatible with the operand \p Op.
2341 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2342 bool OpAPO = getData(OpIdx, Lane).APO;
2343 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2344 if (Ln == Lane)
2345 continue;
2346 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2347 const OperandData &Data = getData(OpI, Ln);
2348 if (Data.APO != OpAPO || Data.IsUsed)
2349 return true;
2350 Value *OpILn = getValue(OpI, Ln);
2351 return (L && L->isLoopInvariant(OpILn)) ||
2352 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2353 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2354 }))
2355 return true;
2356 }
2357 return false;
2358 }
2359
2360 public:
2361 /// Initialize with all the operands of the instruction vector \p RootVL.
2362 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2363 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2364 L(R.LI->getLoopFor(
2365 (cast<Instruction>(RootVL.front())->getParent()))) {
2366 // Append all the operands of RootVL.
2367 appendOperandsOfVL(RootVL);
2368 }
2369
2370 /// \Returns a value vector with the operands across all lanes for the
2371 /// operand at \p OpIdx.
2372 ValueList getVL(unsigned OpIdx) const {
2373 ValueList OpVL(OpsVec[OpIdx].size());
2374 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2375 "Expected same num of lanes across all operands");
2376 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2377 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2378 return OpVL;
2379 }
2380
2381 // Performs operand reordering for 2 or more operands.
2382 // The original operands are in OrigOps[OpIdx][Lane].
2383 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2384 void reorder() {
2385 unsigned NumOperands = getNumOperands();
2386 unsigned NumLanes = getNumLanes();
2387 // Each operand has its own mode. We are using this mode to help us select
2388 // the instructions for each lane, so that they match best with the ones
2389 // we have selected so far.
2390 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2391
2392 // This is a greedy single-pass algorithm. We are going over each lane
2393 // once and deciding on the best order right away with no back-tracking.
2394 // However, in order to increase its effectiveness, we start with the lane
2395 // that has operands that can move the least. For example, given the
2396 // following lanes:
2397 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2398 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2399 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2400 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2401 // we will start at Lane 1, since the operands of the subtraction cannot
2402 // be reordered. Then we will visit the rest of the lanes in a circular
2403 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2404
2405 // Find the first lane that we will start our search from.
2406 unsigned FirstLane = getBestLaneToStartReordering();
2407
2408 // Initialize the modes.
2409 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2410 Value *OpLane0 = getValue(OpIdx, FirstLane);
2411 // Keep track if we have instructions with all the same opcode on one
2412 // side.
2413 if (isa<LoadInst>(OpLane0))
2414 ReorderingModes[OpIdx] = ReorderingMode::Load;
2415 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2416 // Check if OpLane0 should be broadcast.
2417 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2418 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2419 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2420 else
2421 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2422 } else if (isa<Constant>(OpLane0))
2423 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2424 else if (isa<Argument>(OpLane0))
2425 // Our best hope is a Splat. It may save some cost in some cases.
2426 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2427 else
2428 // NOTE: This should be unreachable.
2429 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2430 }
2431
2432 // Check that we don't have the same operands. There is no need to reorder
2433 // if the operands are just a perfect or shuffled diamond match. Do not skip
2434 // reordering for possible broadcasts or a non-power-of-2 number of scalars
2435 // (just for now).
2436 auto &&SkipReordering = [this]() {
2437 SmallPtrSet<Value *, 4> UniqueValues;
2438 ArrayRef<OperandData> Op0 = OpsVec.front();
2439 for (const OperandData &Data : Op0)
2440 UniqueValues.insert(Data.V);
2441 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2442 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2443 return !UniqueValues.contains(Data.V);
2444 }))
2445 return false;
2446 }
2447 // TODO: Check if we can remove a check for non-power-2 number of
2448 // scalars after full support of non-power-2 vectorization.
2449 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2450 };
2451
2452 // If the initial strategy fails for any of the operand indexes, then we
2453 // perform reordering again in a second pass. This helps avoid assigning
2454 // high priority to the failed strategy, and should improve reordering for
2455 // the non-failed operand indexes.
2456 for (int Pass = 0; Pass != 2; ++Pass) {
2457 // Check if there is no need to reorder the operands because they are a
2458 // perfect or shuffled diamond match.
2459 // Need to do it to avoid extra external use cost counting for
2460 // shuffled matches, which may cause regressions.
2461 if (SkipReordering())
2462 break;
2463 // Skip the second pass if the first pass did not fail.
2464 bool StrategyFailed = false;
2465 // Mark all operand data as free to use.
2466 clearUsed();
2467 // We keep the original operand order for the FirstLane, so reorder the
2468 // rest of the lanes. We are visiting the nodes in a circular fashion,
2469 // using FirstLane as the center point and increasing the radius
2470 // distance.
2471 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2472 for (unsigned I = 0; I < NumOperands; ++I)
2473 MainAltOps[I].push_back(getData(I, FirstLane).V);
2474
2475 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2476 // Visit the lane on the right and then the lane on the left.
2477 for (int Direction : {+1, -1}) {
2478 int Lane = FirstLane + Direction * Distance;
2479 if (Lane < 0 || Lane >= (int)NumLanes)
2480 continue;
2481 int LastLane = Lane - Direction;
2482 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2483 "Out of bounds");
2484 // Look for a good match for each operand.
2485 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2486 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2487 std::optional<unsigned> BestIdx = getBestOperand(
2488 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2489 // By not selecting a value, we allow the operands that follow to
2490 // select a better matching value. We will get a non-null value in
2491 // the next run of getBestOperand().
2492 if (BestIdx) {
2493 // Swap the current operand with the one returned by
2494 // getBestOperand().
2495 swap(OpIdx, *BestIdx, Lane);
2496 } else {
2497 // Enable the second pass.
2498 StrategyFailed = true;
2499 }
2500 // Try to get the alternate opcode and follow it during analysis.
2501 if (MainAltOps[OpIdx].size() != 2) {
2502 OperandData &AltOp = getData(OpIdx, Lane);
2503 InstructionsState OpS =
2504 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2505 if (OpS.getOpcode() && OpS.isAltShuffle())
2506 MainAltOps[OpIdx].push_back(AltOp.V);
2507 }
2508 }
2509 }
2510 }
2511 // Skip second pass if the strategy did not fail.
2512 if (!StrategyFailed)
2513 break;
2514 }
2515 }
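// Continuing the four-lane example in the comment above, one plausible
// outcome (a sketch, ignoring cost-model details) is that the B[i] values
// end up together in one operand vector and the C[i] values in the other,
// with the add/sub mix handled as an alternate-opcode sequence; the APO
// flags guarantee that the subtraction lanes never get their operands
// swapped illegally.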
2516
2517#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2518 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2519 switch (RMode) {
2520 case ReorderingMode::Load:
2521 return "Load";
2522 case ReorderingMode::Opcode:
2523 return "Opcode";
2524 case ReorderingMode::Constant:
2525 return "Constant";
2526 case ReorderingMode::Splat:
2527 return "Splat";
2528 case ReorderingMode::Failed:
2529 return "Failed";
2530 }
2531 llvm_unreachable("Unimplemented Reordering Type");
2532 }
2533
2534 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2535 raw_ostream &OS) {
2536 return OS << getModeStr(RMode);
2537 }
2538
2539 /// Debug print.
2540 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2541 printMode(RMode, dbgs());
2542 }
2543
2544 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2545 return printMode(RMode, OS);
2546 }
2547
2548 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2549 const unsigned Indent = 2;
2550 unsigned Cnt = 0;
2551 for (const OperandDataVec &OpDataVec : OpsVec) {
2552 OS << "Operand " << Cnt++ << "\n";
2553 for (const OperandData &OpData : OpDataVec) {
2554 OS.indent(Indent) << "{";
2555 if (Value *V = OpData.V)
2556 OS << *V;
2557 else
2558 OS << "null";
2559 OS << ", APO:" << OpData.APO << "}\n";
2560 }
2561 OS << "\n";
2562 }
2563 return OS;
2564 }
2565
2566 /// Debug print.
2567 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2568#endif
2569 };
2570
2571 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2572 /// of the pair with the highest score, deemed to have the best chance to form
2573 /// the root of a profitable tree to vectorize. Return std::nullopt if no
2574 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
2575 /// limit of the cost, considered to be a good enough score.
2576 std::optional<int>
2577 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2578 int Limit = LookAheadHeuristics::ScoreFail) const {
2579 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2580 RootLookAheadMaxDepth);
2581 int BestScore = Limit;
2582 std::optional<int> Index;
2583 for (int I : seq<int>(0, Candidates.size())) {
2584 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2585 Candidates[I].second,
2586 /*U1=*/nullptr, /*U2=*/nullptr,
2587 /*Level=*/1, std::nullopt);
2588 if (Score > BestScore) {
2589 BestScore = Score;
2590 Index = I;
2591 }
2592 }
2593 return Index;
2594 }
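// For instance, given Candidates = {{load(A[0]), load(A[1])}, {x, y}} with x
// and y unrelated values, the first pair scores ScoreConsecutiveLoads (4)
// while the second typically scores ScoreFail (0), so index 0 is returned;
// std::nullopt is returned only if no pair beats \p Limit.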
2595
2596 /// Checks if the instruction is marked for deletion.
2597 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2598
2599 /// Removes an instruction from its block and eventually deletes it.
2600 /// It's like Instruction::eraseFromParent() except that the actual deletion
2601 /// is delayed until BoUpSLP is destructed.
2602 void eraseInstruction(Instruction *I) {
2603 DeletedInstructions.insert(I);
2604 }
2605
2606 /// Remove instructions from the parent function and clear the operands of \p
2607 /// DeadVals instructions, marking for deletion trivially dead operands.
2608 template <typename T>
2609 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2610 SmallVector<WeakTrackingVH> DeadInsts;
2611 for (T *V : DeadVals) {
2612 auto *I = cast<Instruction>(V);
2613 DeletedInstructions.insert(I);
2614 }
2615 DenseSet<Value *> Processed;
2616 for (T *V : DeadVals) {
2617 if (!V || !Processed.insert(V).second)
2618 continue;
2619 auto *I = cast<Instruction>(V);
2620 salvageDebugInfo(*I);
2621 SmallVector<const TreeEntry *> Entries;
2622 if (const TreeEntry *Entry = getTreeEntry(I)) {
2623 Entries.push_back(Entry);
2624 auto It = MultiNodeScalars.find(I);
2625 if (It != MultiNodeScalars.end())
2626 Entries.append(It->second.begin(), It->second.end());
2627 }
2628 for (Use &U : I->operands()) {
2629 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2630 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2631 wouldInstructionBeTriviallyDead(OpI, TLI) &&
2632 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2633 return Entry->VectorizedValue == OpI;
2634 })))
2635 DeadInsts.push_back(OpI);
2636 }
2637 I->dropAllReferences();
2638 }
2639 for (T *V : DeadVals) {
2640 auto *I = cast<Instruction>(V);
2641 if (!I->getParent())
2642 continue;
2643 assert((I->use_empty() || all_of(I->uses(),
2644 [&](Use &U) {
2645 return isDeleted(
2646 cast<Instruction>(U.getUser()));
2647 })) &&
2648 "trying to erase instruction with users.");
2649 I->removeFromParent();
2650 SE->forgetValue(I);
2651 }
2652 // Process the dead instruction list until empty.
2653 while (!DeadInsts.empty()) {
2654 Value *V = DeadInsts.pop_back_val();
2655 Instruction *VI = cast_or_null<Instruction>(V);
2656 if (!VI || !VI->getParent())
2657 continue;
2658 assert(isInstructionTriviallyDead(VI, TLI) &&
2659 "Live instruction found in dead worklist!");
2660 assert(VI->use_empty() && "Instructions with uses are not dead.");
2661
2662 // Don't lose the debug info while deleting the instructions.
2663 salvageDebugInfo(*VI);
2664
2665 // Null out all of the instruction's operands to see if any operand
2666 // becomes dead as we go.
2667 for (Use &OpU : VI->operands()) {
2668 Value *OpV = OpU.get();
2669 if (!OpV)
2670 continue;
2671 OpU.set(nullptr);
2672
2673 if (!OpV->use_empty())
2674 continue;
2675
2676 // If the operand is an instruction that became dead as we nulled out
2677 // the operand, and if it is 'trivially' dead, delete it in a future
2678 // loop iteration.
2679 if (auto *OpI = dyn_cast<Instruction>(OpV))
2680 if (!DeletedInstructions.contains(OpI) &&
2681 isInstructionTriviallyDead(OpI, TLI))
2682 DeadInsts.push_back(OpI);
2683 }
2684
2685 VI->removeFromParent();
2686 DeletedInstructions.insert(VI);
2687 SE->forgetValue(VI);
2688 }
2689 }
2690
2691 /// Checks if the instruction was already analyzed for being possible
2692 /// reduction root.
2693 bool isAnalyzedReductionRoot(Instruction *I) const {
2694 return AnalyzedReductionsRoots.count(I);
2695 }
2696 /// Register given instruction as already analyzed for being possible
2697 /// reduction root.
2698 void analyzedReductionRoot(Instruction *I) {
2699 AnalyzedReductionsRoots.insert(I);
2700 }
2701 /// Checks if the provided list of reduced values was checked already for
2702 /// vectorization.
2703 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2704 return AnalyzedReductionVals.contains(hash_value(VL));
2705 }
2706 /// Adds the list of reduced values to list of already checked values for the
2707 /// vectorization.
2708 void analyzedReductionVals(ArrayRef<Value *> VL) {
2709 AnalyzedReductionVals.insert(hash_value(VL));
2710 }
2711 /// Clear the list of the analyzed reduction root instructions.
2712 void clearReductionData() {
2713 AnalyzedReductionsRoots.clear();
2714 AnalyzedReductionVals.clear();
2715 AnalyzedMinBWVals.clear();
2716 }
2717 /// Checks if the given value is gathered in one of the nodes.
2718 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2719 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2720 }
2721 /// Checks if the given value is gathered in one of the nodes.
2722 bool isGathered(const Value *V) const {
2723 return MustGather.contains(V);
2724 }
2725 /// Checks if the specified value was not scheduled.
2726 bool isNotScheduled(const Value *V) const {
2727 return NonScheduledFirst.contains(V);
2728 }
2729
2730 /// Check if the value is vectorized in the tree.
2731 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2732
2733 ~BoUpSLP();
2734
2735private:
2736 /// Determine if a node \p E can be demoted to a smaller type with a
2737 /// truncation. We collect the entries that will be demoted in ToDemote.
2738 /// \param E Node for analysis
2739 /// \param ToDemote indices of the nodes to be demoted.
2740 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2741 unsigned &BitWidth,
2742 SmallVectorImpl<unsigned> &ToDemote,
2743 DenseSet<const TreeEntry *> &Visited,
2744 unsigned &MaxDepthLevel,
2745 bool &IsProfitableToDemote,
2746 bool IsTruncRoot) const;
2747
2748 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2749 /// reordering (i.e. the operands can be reordered because they have only one
2750 /// user and are reorderable).
2751 /// \param ReorderableGathers List of all gather nodes that require reordering
2752 /// (e.g., gather of extractelements or partially vectorizable loads).
2753 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2754 /// reordering, subset of \p NonVectorized.
2755 bool
2756 canReorderOperands(TreeEntry *UserTE,
2757 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2758 ArrayRef<TreeEntry *> ReorderableGathers,
2759 SmallVectorImpl<TreeEntry *> &GatherOps);
2760
2761 /// Checks if the given \p TE is a gather node with clustered reused scalars
2762 /// and reorders it per given \p Mask.
2763 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2764
2765 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2766 /// if any. If it is not vectorized (gather node), returns nullptr.
2767 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2768 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2769 TreeEntry *TE = nullptr;
2770 const auto *It = find_if(VL, [&](Value *V) {
2771 TE = getTreeEntry(V);
2772 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2773 return true;
2774 auto It = MultiNodeScalars.find(V);
2775 if (It != MultiNodeScalars.end()) {
2776 for (TreeEntry *E : It->second) {
2777 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2778 TE = E;
2779 return true;
2780 }
2781 }
2782 }
2783 return false;
2784 });
2785 if (It != VL.end()) {
2786 assert(TE->isSame(VL) && "Expected same scalars.");
2787 return TE;
2788 }
2789 return nullptr;
2790 }
2791
2792 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2793 /// if any. If it is not vectorized (gather node), returns nullptr.
2794 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2795 unsigned OpIdx) const {
2796 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2797 const_cast<TreeEntry *>(UserTE), OpIdx);
2798 }
2799
2800 /// Checks if all users of \p I are the part of the vectorization tree.
2801 bool areAllUsersVectorized(
2802 Instruction *I,
2803 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2804
2805 /// Return information about the vector formed for the specified index
2806 /// of a vector of (the same) instruction.
2807 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2808
2809 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2810 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2811
2812 /// \returns Cast context for the given graph node.
2813 TargetTransformInfo::CastContextHint
2814 getCastContextHint(const TreeEntry &TE) const;
2815
2816 /// \returns the cost of the vectorizable entry.
2817 InstructionCost getEntryCost(const TreeEntry *E,
2818 ArrayRef<Value *> VectorizedVals,
2819 SmallPtrSetImpl<Value *> &CheckedExtracts);
2820
2821 /// This is the recursive part of buildTree.
2822 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2823 const EdgeInfo &EI);
2824
2825 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2826 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2827 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2828 /// returns false, setting \p CurrentOrder to either an empty vector or a
2829 /// non-identity permutation that allows to reuse extract instructions.
2830 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2831 /// extract order.
2832 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2833 SmallVectorImpl<unsigned> &CurrentOrder,
2834 bool ResizeAllowed = false) const;
2835
2836 /// Vectorize a single entry in the tree.
2837 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2838 /// avoid issues with def-use order.
2839 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2840
2841 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2842 /// \p E.
2843 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2844 /// avoid issues with def-use order.
2845 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2846
2847 /// Create a new vector from a list of scalar values. Produces a sequence
2848 /// which exploits values reused across lanes, and arranges the inserts
2849 /// for ease of later optimization.
2850 template <typename BVTy, typename ResTy, typename... Args>
2851 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2852
2853 /// Create a new vector from a list of scalar values. Produces a sequence
2854 /// which exploits values reused across lanes, and arranges the inserts
2855 /// for ease of later optimization.
2856 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2857
2858 /// Returns the instruction in the bundle, which can be used as a base point
2859 /// for scheduling. Usually it is the last instruction in the bundle, except
2860 /// for the case when all operands are external (in this case, it is the first
2861 /// instruction in the list).
2862 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2863
2864 /// Tries to find extractelement instructions with constant indices from fixed
2865 /// vector type and gather such instructions into a bunch, which highly likely
2866 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2867 /// was successful, the matched scalars are replaced by poison values in \p VL
2868 /// for future analysis.
2869 std::optional<TargetTransformInfo::ShuffleKind>
2870 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2871 SmallVectorImpl<int> &Mask) const;
2872
2873 /// Tries to find extractelement instructions with constant indices from fixed
2874 /// vector type and gather such instructions into a bunch, which highly likely
2875 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2876 /// was successful, the matched scalars are replaced by poison values in \p VL
2877 /// for future analysis.
2878 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2879 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2880 SmallVectorImpl<SmallVector<int>> &Mask,
2881 unsigned NumParts) const;
2882
2883 /// Checks if the gathered \p VL can be represented as a single register
2884 /// shuffle(s) of previous tree entries.
2885 /// \param TE Tree entry checked for permutation.
2886 /// \param VL List of scalars (a subset of the TE scalar), checked for
2887 /// permutations. Must form single-register vector.
2888 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2889 /// commands to build the mask using the original vector value, without
2890 /// relying on the potential reordering.
2891 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2892 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2893 std::optional<TargetTransformInfo::ShuffleKind>
2894 isGatherShuffledSingleRegisterEntry(
2895 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2896 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2897 bool ForOrder);
2898
2899 /// Checks if the gathered \p VL can be represented as multi-register
2900 /// shuffle(s) of previous tree entries.
2901 /// \param TE Tree entry checked for permutation.
2902 /// \param VL List of scalars (a subset of the TE scalar), checked for
2903 /// permutations.
2904 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2905 /// commands to build the mask using the original vector value, without
2906 /// relying on the potential reordering.
2907 /// \returns per-register series of ShuffleKind, if gathered values can be
2908 /// represented as shuffles of previous tree entries. \p Mask is filled with
2909 /// the shuffle mask (also on per-register base).
2910 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2911 isGatherShuffledEntry(
2912 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2913 SmallVectorImpl<SmallVectorImpl<const TreeEntry *>> &Entries,
2914 unsigned NumParts, bool ForOrder = false);
2915
2916 /// \returns the scalarization cost for this list of values. Assuming that
2917 /// this subtree gets vectorized, we may need to extract the values from the
2918 /// roots. This method calculates the cost of extracting the values.
2919 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2920 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2921 Type *ScalarTy) const;
2922
2923 /// Set the Builder insert point to one after the last instruction in
2924 /// the bundle
2925 void setInsertPointAfterBundle(const TreeEntry *E);
2926
2927 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2928 /// specified, the starting vector value is poison.
2929 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2930
2931 /// \returns whether the VectorizableTree is fully vectorizable and will
2932 /// be beneficial even the tree height is tiny.
2933 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2934
2935 /// Reorder commutative or alt operands to get better probability of
2936 /// generating vectorized code.
2937 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2938 SmallVectorImpl<Value *> &Left,
2939 SmallVectorImpl<Value *> &Right,
2940 const BoUpSLP &R);
2941
2942 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2943 /// users of \p TE and collects the stores. It returns the map from the store
2944 /// pointers to the collected stores.
2945 DenseMap<Value *, SmallVector<StoreInst *>>
2946 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2947
2948 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2949 /// stores in \p StoresVec can form a vector instruction. If so it returns
2950 /// true and populates \p ReorderIndices with the shuffle indices of the
2951 /// stores when compared to the sorted vector.
2952 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2953 OrdersType &ReorderIndices) const;
2954
2955 /// Iterates through the users of \p TE, looking for scalar stores that can be
2956 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2957 /// their order and builds an order index vector for each store bundle. It
2958 /// returns all these order vectors found.
2959 /// We run this after the tree has formed, otherwise we may come across user
2960 /// instructions that are not yet in the tree.
2961 SmallVector<OrdersType, 1>
2962 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2963
2964 struct TreeEntry {
2965 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2966 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2967
2968 /// \returns Common mask for reorder indices and reused scalars.
2969 SmallVector<int> getCommonMask() const {
2970 SmallVector<int> Mask;
2971 inversePermutation(ReorderIndices, Mask);
2972 ::addMask(Mask, ReuseShuffleIndices);
2973 return Mask;
2974 }
2975
2976 /// \returns true if the scalars in VL are equal to this entry.
2977 bool isSame(ArrayRef<Value *> VL) const {
2978 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2979 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2980 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2981 return VL.size() == Mask.size() &&
2982 std::equal(VL.begin(), VL.end(), Mask.begin(),
2983 [Scalars](Value *V, int Idx) {
2984 return (isa<UndefValue>(V) &&
2985 Idx == PoisonMaskElem) ||
2986 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2987 });
2988 };
2989 if (!ReorderIndices.empty()) {
2990 // TODO: implement matching if the nodes are just reordered, still can
2991 // treat the vector as the same if the list of scalars matches VL
2992 // directly, without reordering.
2993 SmallVector<int> Mask;
2994 inversePermutation(ReorderIndices, Mask);
2995 if (VL.size() == Scalars.size())
2996 return IsSame(Scalars, Mask);
2997 if (VL.size() == ReuseShuffleIndices.size()) {
2998 ::addMask(Mask, ReuseShuffleIndices);
2999 return IsSame(Scalars, Mask);
3000 }
3001 return false;
3002 }
3003 return IsSame(Scalars, ReuseShuffleIndices);
3004 }
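// Worked example for isSame (illustrative values only, not from the source):
// with Scalars = {a, b} and an effective Mask = {1, 0}, the list VL = {b, a}
// matches, because for every lane I either VL[I] is undef and Mask[I] is
// PoisonMaskElem, or VL[I] == Scalars[Mask[I]].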
3005
3006 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3007 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3008 UserTreeIndices.front().UserTE == UserEI.UserTE;
3009 }
3010
3011 /// \returns true if current entry has same operands as \p TE.
3012 bool hasEqualOperands(const TreeEntry &TE) const {
3013 if (TE.getNumOperands() != getNumOperands())
3014 return false;
3015 SmallBitVector Used(getNumOperands());
3016 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3017 unsigned PrevCount = Used.count();
3018 for (unsigned K = 0; K < E; ++K) {
3019 if (Used.test(K))
3020 continue;
3021 if (getOperand(K) == TE.getOperand(I)) {
3022 Used.set(K);
3023 break;
3024 }
3025 }
3026 // Check if we actually found the matching operand.
3027 if (PrevCount == Used.count())
3028 return false;
3029 }
3030 return true;
3031 }
3032
3033 /// \return Final vectorization factor for the node. Defined by the total
3034 /// number of vectorized scalars, including those used several times in the
3035 /// entry and counted in the \a ReuseShuffleIndices, if any.
3036 unsigned getVectorFactor() const {
3037 if (!ReuseShuffleIndices.empty())
3038 return ReuseShuffleIndices.size();
3039 return Scalars.size();
3040 };
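// Illustrative example (hypothetical node): a node with 4 unique Scalars that
// are reused via ReuseShuffleIndices = {0, 1, 2, 3, 0, 1, 2, 3} has a
// vectorization factor of 8, not 4, since the reused lanes are counted.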
3041
3042 /// Checks if the current node is a gather node.
3043 bool isGather() const {return State == NeedToGather; }
3044
3045 /// A vector of scalars.
3046 ValueList Scalars;
3047
3048 /// The Scalars are vectorized into this value. It is initialized to Null.
3049 WeakTrackingVH VectorizedValue = nullptr;
3050
3051 /// New vector phi instructions emitted for the vectorized phi nodes.
3052 PHINode *PHI = nullptr;
3053
3054 /// Do we need to gather this sequence or vectorize it
3055 /// (either with vector instruction or with scatter/gather
3056 /// intrinsics for store/load)?
3057 enum EntryState {
3058 Vectorize, ///< The node is regularly vectorized.
3059 ScatterVectorize, ///< Masked scatter/gather node.
3060 StridedVectorize, ///< Strided loads (and stores)
3061 NeedToGather, ///< Gather/buildvector node.
3062 CombinedVectorize, ///< Vectorized node, combined with its user into more
3063 ///< complex node like select/cmp to minmax, mul/add to
3064 ///< fma, etc. Must be used for the following nodes in
3065 ///< the pattern, not the very first one.
3066 };
3067 EntryState State;
3068
3069 /// List of combined opcodes supported by the vectorizer.
3070 enum CombinedOpcode {
3071 NotCombinedOp = -1,
3072 MinMax = Instruction::OtherOpsEnd + 1,
3073 };
3074 CombinedOpcode CombinedOp = NotCombinedOp;
3075
3076 /// Does this sequence require some shuffling?
3077 SmallVector<int, 4> ReuseShuffleIndices;
3078
3079 /// Does this entry require reordering?
3080 SmallVector<unsigned, 4> ReorderIndices;
3081
3082 /// Points back to the VectorizableTree.
3083 ///
3084 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3085 /// to be a pointer and needs to be able to initialize the child iterator.
3086 /// Thus we need a reference back to the container to translate the indices
3087 /// to entries.
3088 VecTreeTy &Container;
3089
3090 /// The TreeEntry index containing the user of this entry. We can actually
3091 /// have multiple users so the data structure is not truly a tree.
3092 SmallVector<EdgeInfo, 1> UserTreeIndices;
3093
3094 /// The index of this treeEntry in VectorizableTree.
3095 int Idx = -1;
3096
3097 private:
3098 /// The operands of each instruction in each lane Operands[op_index][lane].
3099 /// Note: This helps avoid the replication of the code that performs the
3100 /// reordering of operands during buildTree_rec() and vectorizeTree().
3101 SmallVector<ValueList, 2> Operands;
3102
3103 /// The main/alternate instruction.
3104 Instruction *MainOp = nullptr;
3105 Instruction *AltOp = nullptr;
3106
3107 public:
3108 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3109 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3110 if (Operands.size() < OpIdx + 1)
3111 Operands.resize(OpIdx + 1);
3112 assert(Operands[OpIdx].empty() && "Already resized?");
3113 assert(OpVL.size() <= Scalars.size() &&
3114 "Number of operands is greater than the number of scalars.");
3115 Operands[OpIdx].resize(OpVL.size());
3116 copy(OpVL, Operands[OpIdx].begin());
3117 }
3118
3119 /// Set the operands of this bundle in their original order.
3120 void setOperandsInOrder() {
3121 assert(Operands.empty() && "Already initialized?");
3122 auto *I0 = cast<Instruction>(Scalars[0]);
3123 Operands.resize(I0->getNumOperands());
3124 unsigned NumLanes = Scalars.size();
3125 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3126 OpIdx != NumOperands; ++OpIdx) {
3127 Operands[OpIdx].resize(NumLanes);
3128 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3129 auto *I = cast<Instruction>(Scalars[Lane]);
3130 assert(I->getNumOperands() == NumOperands &&
3131 "Expected same number of operands");
3132 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3133 }
3134 }
3135 }
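// Illustrative layout (hypothetical bundle): for Scalars = {add a0, b0;
// add a1, b1} this produces Operands[0] = {a0, a1} and Operands[1] = {b0, b1},
// i.e. Operands is indexed as Operands[operand_index][lane].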
3136
3137 /// Reorders operands of the node to the given mask \p Mask.
3138 void reorderOperands(ArrayRef<int> Mask) {
3139 for (ValueList &Operand : Operands)
3140 reorderScalars(Operand, Mask);
3141 }
3142
3143 /// \returns the \p OpIdx operand of this TreeEntry.
3144 ValueList &getOperand(unsigned OpIdx) {
3145 assert(OpIdx < Operands.size() && "Off bounds");
3146 return Operands[OpIdx];
3147 }
3148
3149 /// \returns the \p OpIdx operand of this TreeEntry.
3150 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3151 assert(OpIdx < Operands.size() && "Off bounds");
3152 return Operands[OpIdx];
3153 }
3154
3155 /// \returns the number of operands.
3156 unsigned getNumOperands() const { return Operands.size(); }
3157
3158 /// \return the single \p OpIdx operand.
3159 Value *getSingleOperand(unsigned OpIdx) const {
3160 assert(OpIdx < Operands.size() && "Off bounds");
3161 assert(!Operands[OpIdx].empty() && "No operand available");
3162 return Operands[OpIdx][0];
3163 }
3164
3165 /// Some of the instructions in the list have alternate opcodes.
3166 bool isAltShuffle() const { return MainOp != AltOp; }
3167
3168 bool isOpcodeOrAlt(Instruction *I) const {
3169 unsigned CheckedOpcode = I->getOpcode();
3170 return (getOpcode() == CheckedOpcode ||
3171 getAltOpcode() == CheckedOpcode);
3172 }
3173
3174 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3175 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
3176 /// \p OpValue.
3177 Value *isOneOf(Value *Op) const {
3178 auto *I = dyn_cast<Instruction>(Op);
3179 if (I && isOpcodeOrAlt(I))
3180 return Op;
3181 return MainOp;
3182 }
3183
3184 void setOperations(const InstructionsState &S) {
3185 MainOp = S.MainOp;
3186 AltOp = S.AltOp;
3187 }
3188
3189 Instruction *getMainOp() const {
3190 return MainOp;
3191 }
3192
3193 Instruction *getAltOp() const {
3194 return AltOp;
3195 }
3196
3197 /// The main/alternate opcodes for the list of instructions.
3198 unsigned getOpcode() const {
3199 return MainOp ? MainOp->getOpcode() : 0;
3200 }
3201
3202 unsigned getAltOpcode() const {
3203 return AltOp ? AltOp->getOpcode() : 0;
3204 }
3205
3206 /// When ReuseReorderShuffleIndices is empty it just returns the position of
3207 /// \p V within the vector of Scalars. Otherwise, tries to remap it via its reuse index.
3208 int findLaneForValue(Value *V) const {
3209 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3210 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3211 if (!ReorderIndices.empty())
3212 FoundLane = ReorderIndices[FoundLane];
3213 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3214 if (!ReuseShuffleIndices.empty()) {
3215 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3216 find(ReuseShuffleIndices, FoundLane));
3217 }
3218 return FoundLane;
3219 }
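// Worked example (hypothetical values): with Scalars = {a, b, c, d},
// ReorderIndices = {3, 2, 1, 0} and ReuseShuffleIndices = {0, 1, 2, 3, 0, 1,
// 2, 3}, findLaneForValue(c) first finds index 2 in Scalars, remaps it to
// ReorderIndices[2] == 1, and finally returns the first position of 1 in
// ReuseShuffleIndices, which is 1.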
3220
3221 /// Build a shuffle mask for graph entry which represents a merge of main
3222 /// and alternate operations.
3223 void
3224 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3225 SmallVectorImpl<int> &Mask,
3226 SmallVectorImpl<Value *> *OpScalars = nullptr,
3227 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3228
3229 /// Return true if this is a non-power-of-2 node.
3230 bool isNonPowOf2Vec() const {
3231 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3232 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3233 "Reshuffling not supported with non-power-of-2 vectors yet.");
3234 return IsNonPowerOf2;
3235 }
3236
3237#ifndef NDEBUG
3238 /// Debug printer.
3239 LLVM_DUMP_METHOD void dump() const {
3240 dbgs() << Idx << ".\n";
3241 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3242 dbgs() << "Operand " << OpI << ":\n";
3243 for (const Value *V : Operands[OpI])
3244 dbgs().indent(2) << *V << "\n";
3245 }
3246 dbgs() << "Scalars: \n";
3247 for (Value *V : Scalars)
3248 dbgs().indent(2) << *V << "\n";
3249 dbgs() << "State: ";
3250 switch (State) {
3251 case Vectorize:
3252 dbgs() << "Vectorize\n";
3253 break;
3254 case ScatterVectorize:
3255 dbgs() << "ScatterVectorize\n";
3256 break;
3257 case StridedVectorize:
3258 dbgs() << "StridedVectorize\n";
3259 break;
3260 case NeedToGather:
3261 dbgs() << "NeedToGather\n";
3262 break;
3263 case CombinedVectorize:
3264 dbgs() << "CombinedVectorize\n";
3265 break;
3266 }
3267 dbgs() << "MainOp: ";
3268 if (MainOp)
3269 dbgs() << *MainOp << "\n";
3270 else
3271 dbgs() << "NULL\n";
3272 dbgs() << "AltOp: ";
3273 if (AltOp)
3274 dbgs() << *AltOp << "\n";
3275 else
3276 dbgs() << "NULL\n";
3277 dbgs() << "VectorizedValue: ";
3278 if (VectorizedValue)
3279 dbgs() << *VectorizedValue << "\n";
3280 else
3281 dbgs() << "NULL\n";
3282 dbgs() << "ReuseShuffleIndices: ";
3283 if (ReuseShuffleIndices.empty())
3284 dbgs() << "Empty";
3285 else
3286 for (int ReuseIdx : ReuseShuffleIndices)
3287 dbgs() << ReuseIdx << ", ";
3288 dbgs() << "\n";
3289 dbgs() << "ReorderIndices: ";
3290 for (unsigned ReorderIdx : ReorderIndices)
3291 dbgs() << ReorderIdx << ", ";
3292 dbgs() << "\n";
3293 dbgs() << "UserTreeIndices: ";
3294 for (const auto &EInfo : UserTreeIndices)
3295 dbgs() << EInfo << ", ";
3296 dbgs() << "\n";
3297 }
3298#endif
3299 };
3300
3301#ifndef NDEBUG
3302 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3303 InstructionCost VecCost, InstructionCost ScalarCost,
3304 StringRef Banner) const {
3305 dbgs() << "SLP: " << Banner << ":\n";
3306 E->dump();
3307 dbgs() << "SLP: Costs:\n";
3308 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3309 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3310 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3311 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3312 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3313 }
3314#endif
3315
3316 /// Create a new VectorizableTree entry.
3317 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3318 std::optional<ScheduleData *> Bundle,
3319 const InstructionsState &S,
3320 const EdgeInfo &UserTreeIdx,
3321 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3322 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3323 TreeEntry::EntryState EntryState =
3324 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3325 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3326 ReuseShuffleIndices, ReorderIndices);
3327 }
3328
3329 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3330 TreeEntry::EntryState EntryState,
3331 std::optional<ScheduleData *> Bundle,
3332 const InstructionsState &S,
3333 const EdgeInfo &UserTreeIdx,
3334 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3335 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3336 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3337 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3338 "Need to vectorize gather entry?");
3339 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3340 TreeEntry *Last = VectorizableTree.back().get();
3341 Last->Idx = VectorizableTree.size() - 1;
3342 Last->State = EntryState;
3343 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3344 ReuseShuffleIndices.end());
3345 if (ReorderIndices.empty()) {
3346 Last->Scalars.assign(VL.begin(), VL.end());
3347 Last->setOperations(S);
3348 } else {
3349 // Reorder scalars and build final mask.
3350 Last->Scalars.assign(VL.size(), nullptr);
3351 transform(ReorderIndices, Last->Scalars.begin(),
3352 [VL](unsigned Idx) -> Value * {
3353 if (Idx >= VL.size())
3354 return UndefValue::get(VL.front()->getType());
3355 return VL[Idx];
3356 });
3357 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3358 Last->setOperations(S);
3359 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3360 }
3361 if (!Last->isGather()) {
3362 for (Value *V : VL) {
3363 const TreeEntry *TE = getTreeEntry(V);
3364 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3365 "Scalar already in tree!");
3366 if (TE) {
3367 if (TE != Last)
3368 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3369 continue;
3370 }
3371 ScalarToTreeEntry[V] = Last;
3372 }
3373 // Update the scheduler bundle to point to this TreeEntry.
3374 ScheduleData *BundleMember = *Bundle;
3375 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3376 isVectorLikeInstWithConstOps(S.MainOp) ||
3377 doesNotNeedToSchedule(VL)) &&
3378 "Bundle and VL out of sync");
3379 if (BundleMember) {
3380 for (Value *V : VL) {
3381 if (doesNotNeedToBeScheduled(V))
3382 continue;
3383 if (!BundleMember)
3384 continue;
3385 BundleMember->TE = Last;
3386 BundleMember = BundleMember->NextInBundle;
3387 }
3388 }
3389 assert(!BundleMember && "Bundle and VL out of sync");
3390 } else {
3391 // Build a map for gathered scalars to the nodes where they are used.
3392 bool AllConstsOrCasts = true;
3393 for (Value *V : VL)
3394 if (!isConstant(V)) {
3395 auto *I = dyn_cast<CastInst>(V);
3396 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3397 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3398 }
3399 if (AllConstsOrCasts)
3400 CastMaxMinBWSizes =
3401 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3402 MustGather.insert(VL.begin(), VL.end());
3403 }
3404
3405 if (UserTreeIdx.UserTE) {
3406 Last->UserTreeIndices.push_back(UserTreeIdx);
3407 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3408 "Reordering isn't implemented for non-power-of-2 nodes yet");
3409 }
3410 return Last;
3411 }
3412
3413 /// -- Vectorization State --
3414 /// Holds all of the tree entries.
3415 TreeEntry::VecTreeTy VectorizableTree;
3416
3417#ifndef NDEBUG
3418 /// Debug printer.
3419 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3420 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3421 VectorizableTree[Id]->dump();
3422 dbgs() << "\n";
3423 }
3424 }
3425#endif
3426
3427 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3428
3429 const TreeEntry *getTreeEntry(Value *V) const {
3430 return ScalarToTreeEntry.lookup(V);
3431 }
3432
3433 /// Check that the operand node of an alternate node does not generate a
3434 /// buildvector sequence. If it does, building an alternate shuffle is probably
3435 /// not worth it when the number of buildvector operands plus the alternate
3436 /// instruction exceeds the number of buildvector instructions.
3437 /// \param S the instructions state of the analyzed values.
3438 /// \param VL list of the instructions with alternate opcodes.
3439 bool areAltOperandsProfitable(const InstructionsState &S,
3440 ArrayRef<Value *> VL) const;
3441
3442 /// Checks if the specified list of the instructions/values can be vectorized
3443 /// and fills required data before actual scheduling of the instructions.
3444 TreeEntry::EntryState getScalarsVectorizationState(
3445 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3446 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3447
3448 /// Maps a specific scalar to its tree entry.
3449 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3450
3451 /// List of scalars used in several vectorized nodes, and the list of those
3452 /// nodes.
3453 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3454
3455 /// Maps a value to the proposed vectorizable size.
3456 SmallDenseMap<Value *, unsigned> InstrElementSize;
3457
3458 /// A list of scalars that we found that we need to keep as scalars.
3459 ValueSet MustGather;
3460
3461 /// A set of first non-schedulable values.
3462 ValueSet NonScheduledFirst;
3463
3464 /// A map between the vectorized entries and the last instructions in the
3465 /// bundles. The bundles are built in use order, not in the def order of the
3466 /// instructions, so we cannot rely on the last instruction in the bundle
3467 /// being the last instruction in program order during the vectorization
3468 /// process, since the basic blocks are affected; such instructions need to
3469 /// be pre-gathered beforehand.
3470 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3471
3472 /// List of gather nodes that depend on other gather/vector nodes and should
3473 /// be emitted after the vector instruction emission process, to correctly
3474 /// handle the order of the vector instructions and shuffles.
3475 SetVector<const TreeEntry *> PostponedGathers;
3476
3477 using ValueToGatherNodesMap =
3478 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3479 ValueToGatherNodesMap ValueToGatherNodes;
3480
3481 /// This POD struct describes one external user in the vectorized tree.
3482 struct ExternalUser {
3483 ExternalUser(Value *S, llvm::User *U, int L)
3484 : Scalar(S), User(U), Lane(L) {}
3485
3486 // Which scalar in our function.
3487 Value *Scalar;
3488
3489 // Which user that uses the scalar.
3490 llvm::User *User;
3491
3492 // Which lane does the scalar belong to.
3493 int Lane;
3494 };
3495 using UserList = SmallVector<ExternalUser, 16>;
3496
3497 /// Checks if two instructions may access the same memory.
3498 ///
3499 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3500 /// is invariant in the calling loop.
3501 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3502 Instruction *Inst2) {
3503 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3504 return true;
3505 // First check if the result is already in the cache.
3506 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3507 auto It = AliasCache.find(Key);
3508 if (It != AliasCache.end())
3509 return It->second;
3510 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3511 // Store the result in the cache.
3512 AliasCache.try_emplace(Key, Aliased);
3513 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3514 return Aliased;
3515 }
3516
3517 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3518
3519 /// Cache for alias results.
3520 /// TODO: consider moving this to the AliasAnalysis itself.
3521 DenseMap<AliasCacheKey, bool> AliasCache;
3522
3523 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3524 // globally through SLP because we don't perform any action which
3525 // invalidates capture results.
3526 BatchAAResults BatchAA;
3527
3528 /// Temporary store for deleted instructions. Instructions will be deleted
3529 /// eventually when the BoUpSLP is destructed. The deferral is required to
3530 /// ensure that there are no incorrect collisions in the AliasCache, which
3531 /// can happen if a new instruction is allocated at the same address as a
3532 /// previously deleted instruction.
3533 DenseSet<Instruction *> DeletedInstructions;
3534
3535 /// Set of instructions already analyzed for reductions.
3536 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3537
3538 /// Set of hashes for the list of reduction values already being analyzed.
3539 DenseSet<size_t> AnalyzedReductionVals;
3540
3541 /// Values already analyzed for minimal bitwidth and found to be
3542 /// non-profitable.
3543 DenseSet<Value *> AnalyzedMinBWVals;
3544
3545 /// A list of values that need to be extracted out of the tree.
3546 /// This list holds pairs of (Internal Scalar : External User). External User
3547 /// can be nullptr, it means that this Internal Scalar will be used later,
3548 /// after vectorization.
3549 UserList ExternalUses;
3550
3551 /// A list of GEPs which can be replaced by scalar GEPs instead of
3552 /// extractelement instructions.
3553 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3554
3555 /// Values used only by @llvm.assume calls.
3556 SmallPtrSet<const Value *, 32> EphValues;
3557
3558 /// Holds all of the instructions that we gathered, shuffle instructions and
3559 /// extractelements.
3560 SetVector<Instruction *> GatherShuffleExtractSeq;
3561
3562 /// A list of blocks that we are going to CSE.
3563 DenseSet<BasicBlock *> CSEBlocks;
3564
3565 /// Contains all scheduling relevant data for an instruction.
3566 /// A ScheduleData either represents a single instruction or a member of an
3567 /// instruction bundle (= a group of instructions which is combined into a
3568 /// vector instruction).
3569 struct ScheduleData {
3570 // The initial value for the dependency counters. It means that the
3571 // dependencies are not calculated yet.
3572 enum { InvalidDeps = -1 };
3573
3574 ScheduleData() = default;
3575
3576 void init(int BlockSchedulingRegionID, Instruction *I) {
3577 FirstInBundle = this;
3578 NextInBundle = nullptr;
3579 NextLoadStore = nullptr;
3580 IsScheduled = false;
3581 SchedulingRegionID = BlockSchedulingRegionID;
3582 clearDependencies();
3583 Inst = I;
3584 TE = nullptr;
3585 }
3586
3587 /// Verify basic self consistency properties
3588 void verify() {
3589 if (hasValidDependencies()) {
3590 assert(UnscheduledDeps <= Dependencies && "invariant");
3591 } else {
3592 assert(UnscheduledDeps == Dependencies && "invariant");
3593 }
3594
3595 if (IsScheduled) {
3596 assert(isSchedulingEntity() &&
3597 "unexpected scheduled state");
3598 for (const ScheduleData *BundleMember = this; BundleMember;
3599 BundleMember = BundleMember->NextInBundle) {
3600 assert(BundleMember->hasValidDependencies() &&
3601 BundleMember->UnscheduledDeps == 0 &&
3602 "unexpected scheduled state");
3603 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3604 "only bundle is marked scheduled");
3605 }
3606 }
3607
3608 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3609 "all bundle members must be in same basic block");
3610 }
3611
3612 /// Returns true if the dependency information has been calculated.
3613 /// Note that dependency validity can vary between instructions within
3614 /// a single bundle.
3615 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3616
3617 /// Returns true for single instructions and for bundle representatives
3618 /// (= the head of a bundle).
3619 bool isSchedulingEntity() const { return FirstInBundle == this; }
3620
3621 /// Returns true if it represents an instruction bundle and not only a
3622 /// single instruction.
3623 bool isPartOfBundle() const {
3624 return NextInBundle != nullptr || FirstInBundle != this || TE;
3625 }
3626
3627 /// Returns true if it is ready for scheduling, i.e. it has no more
3628 /// unscheduled depending instructions/bundles.
3629 bool isReady() const {
3630 assert(isSchedulingEntity() &&
3631 "can't consider non-scheduling entity for ready list");
3632 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3633 }
3634
3635 /// Modifies the number of unscheduled dependencies for this instruction,
3636 /// and returns the number of remaining dependencies for the containing
3637 /// bundle.
3638 int incrementUnscheduledDeps(int Incr) {
3639 assert(hasValidDependencies() &&
3640 "increment of unscheduled deps would be meaningless");
3641 UnscheduledDeps += Incr;
3642 return FirstInBundle->unscheduledDepsInBundle();
3643 }
3644
3645 /// Sets the number of unscheduled dependencies to the number of
3646 /// dependencies.
3647 void resetUnscheduledDeps() {
3648 UnscheduledDeps = Dependencies;
3649 }
3650
3651 /// Clears all dependency information.
3652 void clearDependencies() {
3653 Dependencies = InvalidDeps;
3654 resetUnscheduledDeps();
3655 MemoryDependencies.clear();
3656 ControlDependencies.clear();
3657 }
3658
3659 int unscheduledDepsInBundle() const {
3660 assert(isSchedulingEntity() && "only meaningful on the bundle");
3661 int Sum = 0;
3662 for (const ScheduleData *BundleMember = this; BundleMember;
3663 BundleMember = BundleMember->NextInBundle) {
3664 if (BundleMember->UnscheduledDeps == InvalidDeps)
3665 return InvalidDeps;
3666 Sum += BundleMember->UnscheduledDeps;
3667 }
3668 return Sum;
3669 }
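// Illustrative counting scheme (hypothetical bundle): a bundle member with
// Dependencies == 3 starts with UnscheduledDeps == 3 after
// resetUnscheduledDeps(); every time one of its dependencies is scheduled,
// incrementUnscheduledDeps(-1) is invoked, and once the sum over all bundle
// members reaches 0 the bundle reports isReady().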
3670
3671 void dump(raw_ostream &os) const {
3672 if (!isSchedulingEntity()) {
3673 os << "/ " << *Inst;
3674 } else if (NextInBundle) {
3675 os << '[' << *Inst;
3676 ScheduleData *SD = NextInBundle;
3677 while (SD) {
3678 os << ';' << *SD->Inst;
3679 SD = SD->NextInBundle;
3680 }
3681 os << ']';
3682 } else {
3683 os << *Inst;
3684 }
3685 }
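// Example output of dump() (hypothetical instructions): a non-head bundle
// member prints as "/ %x = ...", a bundle head prints all members as
// "[%x = ...;%y = ...]", and a stand-alone instruction prints as "%x = ...".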
3686
3687 Instruction *Inst = nullptr;
3688
3689 /// The TreeEntry that this instruction corresponds to.
3690 TreeEntry *TE = nullptr;
3691
3692 /// Points to the head in an instruction bundle (and always to this for
3693 /// single instructions).
3694 ScheduleData *FirstInBundle = nullptr;
3695
3696 /// Single linked list of all instructions in a bundle. Null if it is a
3697 /// single instruction.
3698 ScheduleData *NextInBundle = nullptr;
3699
3700 /// Single linked list of all memory instructions (e.g. load, store, call)
3701 /// in the block - until the end of the scheduling region.
3702 ScheduleData *NextLoadStore = nullptr;
3703
3704 /// The dependent memory instructions.
3705 /// This list is derived on demand in calculateDependencies().
3706 SmallVector<ScheduleData *, 4> MemoryDependencies;
3707
3708 /// List of instructions which this instruction could be control dependent
3709 /// on. Allowing such nodes to be scheduled below this one could introduce
3710 /// a runtime fault which didn't exist in the original program.
3711 /// ex: this is a load or udiv following a readonly call which inf loops
3712 SmallVector<ScheduleData *, 4> ControlDependencies;
3713
3714 /// This ScheduleData is in the current scheduling region if this matches
3715 /// the current SchedulingRegionID of BlockScheduling.
3716 int SchedulingRegionID = 0;
3717
3718 /// Used for getting a "good" final ordering of instructions.
3719 int SchedulingPriority = 0;
3720
3721 /// The number of dependencies. Consists of the number of users of the
3722 /// instruction plus the number of dependent memory instructions (if any).
3723 /// This value is calculated on demand.
3724 /// If InvalidDeps, the number of dependencies is not calculated yet.
3725 int Dependencies = InvalidDeps;
3726
3727 /// The number of dependencies minus the number of dependencies of scheduled
3728 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3729 /// for scheduling.
3730 /// Note that this is negative as long as Dependencies is not calculated.
3731 int UnscheduledDeps = InvalidDeps;
3732
3733 /// True if this instruction is scheduled (or considered as scheduled in the
3734 /// dry-run).
3735 bool IsScheduled = false;
3736 };
3737
3738#ifndef NDEBUG
3739 friend inline raw_ostream &operator<<(raw_ostream &os,
3740 const BoUpSLP::ScheduleData &SD) {
3741 SD.dump(os);
3742 return os;
3743 }
3744#endif
3745
3746 friend struct GraphTraits<BoUpSLP *>;
3747 friend struct DOTGraphTraits<BoUpSLP *>;
3748
3749 /// Contains all scheduling data for a basic block.
3750 /// It does not schedule instructions that are not memory read/write
3751 /// instructions and whose operands are either constants, arguments, phis, or
3752 /// instructions from other blocks, or whose users are phis or live in other
3753 /// blocks. The resulting vector instructions can be placed at the
3754 /// beginning of the basic block without scheduling (if the operands do not
3755 /// need to be scheduled) or at the end of the block (if the users are outside
3756 /// of the block). This saves some compile time and memory used by the
3757 /// compiler.
3758 /// ScheduleData is assigned to each instruction in between the boundaries of
3759 /// the tree entry, even to those which are not part of the graph. It is
3760 /// required to correctly follow the dependencies between the instructions and
3761 /// to schedule them correctly. ScheduleData is not allocated for
3762 /// instructions which do not require scheduling, like phis, nodes with only
3763 /// extractelements/insertelements, or nodes whose instructions have
3764 /// uses/operands outside of the block.
3765 struct BlockScheduling {
3766 BlockScheduling(BasicBlock *BB)
3767 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3768
3769 void clear() {
3770 ReadyInsts.clear();
3771 ScheduleStart = nullptr;
3772 ScheduleEnd = nullptr;
3773 FirstLoadStoreInRegion = nullptr;
3774 LastLoadStoreInRegion = nullptr;
3775 RegionHasStackSave = false;
3776
3777 // Reduce the maximum schedule region size by the size of the
3778 // previous scheduling run.
3779 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3780 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3781 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3782 ScheduleRegionSize = 0;
3783
3784 // Make a new scheduling region, i.e. all existing ScheduleData is not
3785 // in the new region yet.
3786 ++SchedulingRegionID;
3787 }
3788
3789 ScheduleData *getScheduleData(Instruction *I) {
3790 if (BB != I->getParent())
3791 // Avoid lookup if can't possibly be in map.
3792 return nullptr;
3793 ScheduleData *SD = ScheduleDataMap.lookup(I);
3794 if (SD && isInSchedulingRegion(SD))
3795 return SD;
3796 return nullptr;
3797 }
3798
3799 ScheduleData *getScheduleData(Value *V) {
3800 if (auto *I = dyn_cast<Instruction>(V))
3801 return getScheduleData(I);
3802 return nullptr;
3803 }
3804
3805 bool isInSchedulingRegion(ScheduleData *SD) const {
3806 return SD->SchedulingRegionID == SchedulingRegionID;
3807 }
3808
3809 /// Marks an instruction as scheduled and puts all dependent ready
3810 /// instructions into the ready-list.
3811 template <typename ReadyListType>
3812 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3813 SD->IsScheduled = true;
3814 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3815
3816 for (ScheduleData *BundleMember = SD; BundleMember;
3817 BundleMember = BundleMember->NextInBundle) {
3818
3819 // Handle the def-use chain dependencies.
3820
3821 // Decrement the unscheduled counter and insert to ready list if ready.
3822 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3823 ScheduleData *OpDef = getScheduleData(I);
3824 if (OpDef && OpDef->hasValidDependencies() &&
3825 OpDef->incrementUnscheduledDeps(-1) == 0) {
3826 // There are no more unscheduled dependencies after
3827 // decrementing, so we can put the dependent instruction
3828 // into the ready list.
3829 ScheduleData *DepBundle = OpDef->FirstInBundle;
3830 assert(!DepBundle->IsScheduled &&
3831 "already scheduled bundle gets ready");
3832 ReadyList.insert(DepBundle);
3834 << "SLP: gets ready (def): " << *DepBundle << "\n");
3835 }
3836 };
3837
3838 // If BundleMember is a vector bundle, its operands may have been
3839 // reordered during buildTree(). We therefore need to get its operands
3840 // through the TreeEntry.
3841 if (TreeEntry *TE = BundleMember->TE) {
3842 // Need to search for the lane since the tree entry can be reordered.
3843 int Lane = std::distance(TE->Scalars.begin(),
3844 find(TE->Scalars, BundleMember->Inst));
3845 assert(Lane >= 0 && "Lane not set");
3846
3847 // Since vectorization tree is being built recursively this assertion
3848 // ensures that the tree entry has all operands set before reaching
3849 // this code. Couple of exceptions known at the moment are extracts
3850 // where their second (immediate) operand is not added. Since
3851 // immediates do not affect scheduler behavior this is considered
3852 // okay.
3853 auto *In = BundleMember->Inst;
3854 assert(
3855 In &&
3856 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3857 In->getNumOperands() == TE->getNumOperands()) &&
3858 "Missed TreeEntry operands?");
3859 (void)In; // fake use to avoid build failure when assertions disabled
3860
3861 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3862 OpIdx != NumOperands; ++OpIdx)
3863 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3864 DecrUnsched(I);
3865 } else {
3866 // If BundleMember is a stand-alone instruction, no operand reordering
3867 // has taken place, so we directly access its operands.
3868 for (Use &U : BundleMember->Inst->operands())
3869 if (auto *I = dyn_cast<Instruction>(U.get()))
3870 DecrUnsched(I);
3871 }
3872 // Handle the memory dependencies.
3873 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3874 if (MemoryDepSD->hasValidDependencies() &&
3875 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3876 // There are no more unscheduled dependencies after decrementing,
3877 // so we can put the dependent instruction into the ready list.
3878 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3879 assert(!DepBundle->IsScheduled &&
3880 "already scheduled bundle gets ready");
3881 ReadyList.insert(DepBundle);
3883 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3884 }
3885 }
3886 // Handle the control dependencies.
3887 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3888 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3889 // There are no more unscheduled dependencies after decrementing,
3890 // so we can put the dependent instruction into the ready list.
3891 ScheduleData *DepBundle = DepSD->FirstInBundle;
3892 assert(!DepBundle->IsScheduled &&
3893 "already scheduled bundle gets ready");
3894 ReadyList.insert(DepBundle);
3896 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3897 }
3898 }
3899 }
3900 }
3901
3902 /// Verify basic self consistency properties of the data structure.
3903 void verify() {
3904 if (!ScheduleStart)
3905 return;
3906
3907 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3908 ScheduleStart->comesBefore(ScheduleEnd) &&
3909 "Not a valid scheduling region?");
3910
3911 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3912 auto *SD = getScheduleData(I);
3913 if (!SD)
3914 continue;
3915 assert(isInSchedulingRegion(SD) &&
3916 "primary schedule data not in window?");
3917 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3918 "entire bundle in window!");
3919 SD->verify();
3920 }
3921
3922 for (auto *SD : ReadyInsts) {
3923 assert(SD->isSchedulingEntity() && SD->isReady() &&
3924 "item in ready list not ready?");
3925 (void)SD;
3926 }
3927 }
3928
3929 /// Put all instructions into the ReadyList which are ready for scheduling.
3930 template <typename ReadyListType>
3931 void initialFillReadyList(ReadyListType &ReadyList) {
3932 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3933 ScheduleData *SD = getScheduleData(I);
3934 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3935 SD->isReady()) {
3936 ReadyList.insert(SD);
3938 << "SLP: initially in ready list: " << *SD << "\n");
3939 }
3940 }
3941 }
3942
3943 /// Build a bundle from the ScheduleData nodes corresponding to the
3944 /// scalar instruction for each lane.
3945 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3946
3947 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3948 /// cyclic dependencies. This is only a dry-run, no instructions are
3949 /// actually moved at this stage.
3950 /// \returns the scheduling bundle. The returned Optional value is not
3951 /// std::nullopt if \p VL is allowed to be scheduled.
3952 std::optional<ScheduleData *>
3953 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3954 const InstructionsState &S);
3955
3956 /// Un-bundles a group of instructions.
3957 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3958
3959 /// Allocates schedule data chunk.
3960 ScheduleData *allocateScheduleDataChunks();
3961
3962 /// Extends the scheduling region so that V is inside the region.
3963 /// \returns true if the region size is within the limit.
3964 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3965
3966 /// Initialize the ScheduleData structures for new instructions in the
3967 /// scheduling region.
3968 void initScheduleData(Instruction *FromI, Instruction *ToI,
3969 ScheduleData *PrevLoadStore,
3970 ScheduleData *NextLoadStore);
3971
3972 /// Updates the dependency information of a bundle and of all instructions/
3973 /// bundles which depend on the original bundle.
3974 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3975 BoUpSLP *SLP);
3976
3977 /// Sets all instructions in the scheduling region to un-scheduled.
3978 void resetSchedule();
3979
3980 BasicBlock *BB;
3981
3982 /// Simple memory allocation for ScheduleData.
3983 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3984
3985 /// The size of a ScheduleData array in ScheduleDataChunks.
3986 int ChunkSize;
3987
3988 /// The allocator position in the current chunk, which is the last entry
3989 /// of ScheduleDataChunks.
3990 int ChunkPos;
3991
3992 /// Attaches ScheduleData to Instruction.
3993 /// Note that the mapping survives during all vectorization iterations, i.e.
3994 /// ScheduleData structures are recycled.
3995 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3996
3997 /// The ready-list for scheduling (only used for the dry-run).
3998 SetVector<ScheduleData *> ReadyInsts;
3999
4000 /// The first instruction of the scheduling region.
4001 Instruction *ScheduleStart = nullptr;
4002
4003 /// The first instruction _after_ the scheduling region.
4004 Instruction *ScheduleEnd = nullptr;
4005
4006 /// The first memory accessing instruction in the scheduling region
4007 /// (can be null).
4008 ScheduleData *FirstLoadStoreInRegion = nullptr;
4009
4010 /// The last memory accessing instruction in the scheduling region
4011 /// (can be null).
4012 ScheduleData *LastLoadStoreInRegion = nullptr;
4013
4014 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4015 /// region? Used to optimize the dependence calculation for the
4016 /// common case where there isn't.
4017 bool RegionHasStackSave = false;
4018
4019 /// The current size of the scheduling region.
4020 int ScheduleRegionSize = 0;
4021
4022 /// The maximum size allowed for the scheduling region.
4023 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4024
4025 /// The ID of the scheduling region. For a new vectorization iteration this
4026 /// is incremented which "removes" all ScheduleData from the region.
4027 /// Make sure that the initial SchedulingRegionID is greater than the
4028 /// initial SchedulingRegionID in ScheduleData (which is 0).
4029 int SchedulingRegionID = 1;
4030 };
4031
4032 /// Attaches the BlockScheduling structures to basic blocks.
4033 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4034
4035 /// Performs the "real" scheduling. Done before vectorization is actually
4036 /// performed in a basic block.
4037 void scheduleBlock(BlockScheduling *BS);
4038
4039 /// List of users to ignore during scheduling and that don't need extracting.
4040 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4041
4042 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4043 /// sorted SmallVectors of unsigned.
4044 struct OrdersTypeDenseMapInfo {
4045 static OrdersType getEmptyKey() {
4046 OrdersType V;
4047 V.push_back(~1U);
4048 return V;
4049 }
4050
4051 static OrdersType getTombstoneKey() {
4052 OrdersType V;
4053 V.push_back(~2U);
4054 return V;
4055 }
4056
4057 static unsigned getHashValue(const OrdersType &V) {
4058 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4059 }
4060
4061 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4062 return LHS == RHS;
4063 }
4064 };
4065
4066 // Analysis and block reference.
4067 Function *F;
4068 ScalarEvolution *SE;
4069 TargetTransformInfo *TTI;
4070 TargetLibraryInfo *TLI;
4071 LoopInfo *LI;
4072 DominatorTree *DT;
4073 AssumptionCache *AC;
4074 DemandedBits *DB;
4075 const DataLayout *DL;
4076 OptimizationRemarkEmitter *ORE;
4077
4078 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4079 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4080
4081 /// Instruction builder to construct the vectorized tree.
4083
4084 /// A map of scalar integer values to the smallest bit width with which they
4085 /// can legally be represented. The values map to (width, signed) pairs,
4086 /// where "width" indicates the minimum bit width and "signed" is True if the
4087 /// value must be signed-extended, rather than zero-extended, back to its
4088 /// original width.
4090
4091 /// Final size of the reduced vector, if the current graph represents the
4092 /// input for the reduction and it was possible to narrow the size of the
4093 /// reduction.
4094 unsigned ReductionBitWidth = 0;
4095
4096 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4097 /// type sizes, used in the tree.
4098 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4099
4100 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
4101 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4102 DenseSet<unsigned> ExtraBitWidthNodes;
4103};
4104
4105} // end namespace slpvectorizer
4106
4107template <> struct GraphTraits<BoUpSLP *> {
4108 using TreeEntry = BoUpSLP::TreeEntry;
4109
4110 /// NodeRef has to be a pointer per the GraphWriter.
4111 using NodeRef = TreeEntry *;
4112
4113 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4114
4115 /// Add the VectorizableTree to the index iterator to be able to return
4116 /// TreeEntry pointers.
4117 struct ChildIteratorType
4118 : public iterator_adaptor_base<
4119 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4120 ContainerTy &VectorizableTree;
4121
4122 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4123 ContainerTy &VT)
4124 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4125
4126 NodeRef operator*() { return I->UserTE; }
4127 };
4128
4129 static NodeRef getEntryNode(BoUpSLP &R) {
4130 return R.VectorizableTree[0].get();
4131 }
4132
4133 static ChildIteratorType child_begin(NodeRef N) {
4134 return {N->UserTreeIndices.begin(), N->Container};
4135 }
4136
4137 static ChildIteratorType child_end(NodeRef N) {
4138 return {N->UserTreeIndices.end(), N->Container};
4139 }
4140
4141 /// For the node iterator we just need to turn the TreeEntry iterator into a
4142 /// TreeEntry* iterator so that it dereferences to NodeRef.
4143 class nodes_iterator {
4144 using ItTy = ContainerTy::iterator;
4145 ItTy It;
4146
4147 public:
4148 nodes_iterator(const ItTy &It2) : It(It2) {}
4149 NodeRef operator*() { return It->get(); }
4150 nodes_iterator operator++() {
4151 ++It;
4152 return *this;
4153 }
4154 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4155 };
4156
4157 static nodes_iterator nodes_begin(BoUpSLP *R) {
4158 return nodes_iterator(R->VectorizableTree.begin());
4159 }
4160
4161 static nodes_iterator nodes_end(BoUpSLP *R) {
4162 return nodes_iterator(R->VectorizableTree.end());
4163 }
4164
4165 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4166};
4167
4168template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4169 using TreeEntry = BoUpSLP::TreeEntry;
4170
4171 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4172
4173 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4174 std::string Str;
4175 raw_string_ostream OS(Str);
4176 OS << Entry->Idx << ".\n";
4177 if (isSplat(Entry->Scalars))
4178 OS << "<splat> ";
4179 for (auto *V : Entry->Scalars) {
4180 OS << *V;
4181 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4182 return EU.Scalar == V;
4183 }))
4184 OS << " <extract>";
4185 OS << "\n";
4186 }
4187 return Str;
4188 }
4189
4190 static std::string getNodeAttributes(const TreeEntry *Entry,
4191 const BoUpSLP *) {
4192 if (Entry->isGather())
4193 return "color=red";
4194 if (Entry->State == TreeEntry::ScatterVectorize ||
4195 Entry->State == TreeEntry::StridedVectorize)
4196 return "color=blue";
4197 return "";
4198 }
4199};
4200
4201} // end namespace llvm
4202
4203 BoUpSLP::~BoUpSLP() {
4204 SmallVector<WeakTrackingVH> DeadInsts;
4205 for (auto *I : DeletedInstructions) {
4206 if (!I->getParent()) {
4207 // Temporarily insert instructions back to erase them from the parent
4208 // and from memory later.
4209 if (isa<PHINode>(I))
4210 // Phi nodes must be the very first instructions in the block.
4211 I->insertBefore(F->getEntryBlock(),
4212 F->getEntryBlock().getFirstNonPHIIt());
4213 else
4214 I->insertBefore(F->getEntryBlock().getTerminator());
4215 continue;
4216 }
4217 for (Use &U : I->operands()) {
4218 auto *Op = dyn_cast<Instruction>(U.get());
4219 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4220 wouldInstructionBeTriviallyDead(Op, TLI))
4221 DeadInsts.emplace_back(Op);
4222 }
4223 I->dropAllReferences();
4224 }
4225 for (auto *I : DeletedInstructions) {
4226 assert(I->use_empty() &&
4227 "trying to erase instruction with users.");
4228 I->eraseFromParent();
4229 }
4230
4231 // Cleanup any dead scalar code feeding the vectorized instructions
4233
4234#ifdef EXPENSIVE_CHECKS
4235 // If we could guarantee that this call is not extremely slow, we could
4236 // remove the ifdef limitation (see PR47712).
4237 assert(!verifyFunction(*F, &dbgs()));
4238#endif
4239}
4240
4241/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4242 /// contains the original mask for the scalars reused in the node. The
4243 /// procedure transforms this mask in accordance with the given \p Mask.
4244 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4245 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4246 "Expected non-empty mask.");
4247 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4248 Prev.swap(Reuses);
4249 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4250 if (Mask[I] != PoisonMaskElem)
4251 Reuses[Mask[I]] = Prev[I];
4252}
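// Worked example (illustrative values): reorderReuses(Reuses, Mask) with
// Reuses = {1, 0, 3, 2} and Mask = {2, 3, 0, 1} stores Prev[I] into
// Reuses[Mask[I]], producing Reuses = {3, 2, 1, 0}.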
4253
4254 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4255 /// the original order of the scalars. The procedure transforms the provided order
4256/// in accordance with the given \p Mask. If the resulting \p Order is just an
4257/// identity order, \p Order is cleared.
4258 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4259 bool BottomOrder = false) {
4260 assert(!Mask.empty() && "Expected non-empty mask.");
4261 unsigned Sz = Mask.size();
4262 if (BottomOrder) {
4263 SmallVector<unsigned> PrevOrder;
4264 if (Order.empty()) {
4265 PrevOrder.resize(Sz);
4266 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4267 } else {
4268 PrevOrder.swap(Order);
4269 }
4270 Order.assign(Sz, Sz);
4271 for (unsigned I = 0; I < Sz; ++I)
4272 if (Mask[I] != PoisonMaskElem)
4273 Order[I] = PrevOrder[Mask[I]];
4274 if (all_of(enumerate(Order), [&](const auto &Data) {
4275 return Data.value() == Sz || Data.index() == Data.value();
4276 })) {
4277 Order.clear();
4278 return;
4279 }
4280 fixupOrderingIndices(Order);
4281 return;
4282 }
4283 SmallVector<int> MaskOrder;
4284 if (Order.empty()) {
4285 MaskOrder.resize(Sz);
4286 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4287 } else {
4288 inversePermutation(Order, MaskOrder);
4289 }
4290 reorderReuses(MaskOrder, Mask);
4291 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4292 Order.clear();
4293 return;
4294 }
4295 Order.assign(Sz, Sz);
4296 for (unsigned I = 0; I < Sz; ++I)
4297 if (MaskOrder[I] != PoisonMaskElem)
4298 Order[MaskOrder[I]] = I;
4299 fixupOrderingIndices(Order);
4300}
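// Worked example (illustrative values): reorderOrder(Order, Mask) with an
// initially empty Order and Mask = {3, 2, 1, 0} builds the identity MaskOrder,
// applies reorderReuses to it, and ends up with Order = {3, 2, 1, 0}; an
// identity result would have been cleared instead.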
4301
4302std::optional<BoUpSLP::OrdersType>
4303BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4304 assert(TE.isGather() && "Expected gather node only.");
4305 // Try to find subvector extract/insert patterns and reorder only such
4306 // patterns.
4307 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4308 Type *ScalarTy = GatheredScalars.front()->getType();
4309 int NumScalars = GatheredScalars.size();
4310 if (!isValidElementType(ScalarTy))
4311 return std::nullopt;
4312 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4313 int NumParts = TTI->getNumberOfParts(VecTy);
4314 if (NumParts == 0 || NumParts >= NumScalars)
4315 NumParts = 1;
4316 SmallVector<int> ExtractMask;
4317 SmallVector<int> Mask;
4318 SmallVector<SmallVector<const TreeEntry *>> Entries;
4319 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4320 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4321 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4322 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4323 /*ForOrder=*/true);
4324 // No shuffled operands - ignore.
4325 if (GatherShuffles.empty() && ExtractShuffles.empty())
4326 return std::nullopt;
4327 OrdersType CurrentOrder(NumScalars, NumScalars);
4328 if (GatherShuffles.size() == 1 &&
4329 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4330 Entries.front().front()->isSame(TE.Scalars)) {
4331 // Perfect match in the graph, will reuse the previously vectorized
4332 // node. Cost is 0.
4333 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4334 return CurrentOrder;
4335 }
4336 auto IsSplatMask = [](ArrayRef<int> Mask) {
4337 int SingleElt = PoisonMaskElem;
4338 return all_of(Mask, [&](int I) {
4339 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4340 SingleElt = I;
4341 return I == PoisonMaskElem || I == SingleElt;
4342 });
4343 };
4344 // Exclusive broadcast mask - ignore.
4345 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4346 (Entries.size() != 1 ||
4347 Entries.front().front()->ReorderIndices.empty())) ||
4348 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4349 return std::nullopt;
4350 SmallBitVector ShuffledSubMasks(NumParts);
4351 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4352 ArrayRef<int> Mask, int PartSz, int NumParts,
4353 function_ref<unsigned(unsigned)> GetVF) {
4354 for (int I : seq<int>(0, NumParts)) {
4355 if (ShuffledSubMasks.test(I))
4356 continue;
4357 const int VF = GetVF(I);
4358 if (VF == 0)
4359 continue;
4360 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4361 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4362 // Shuffle of at least 2 vectors - ignore.
4363 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4364 std::fill(Slice.begin(), Slice.end(), NumScalars);
4365 ShuffledSubMasks.set(I);
4366 continue;
4367 }
4368 // Try to include as many elements from the mask as possible.
4369 int FirstMin = INT_MAX;
4370 int SecondVecFound = false;
4371 for (int K : seq<int>(Limit)) {
4372 int Idx = Mask[I * PartSz + K];
4373 if (Idx == PoisonMaskElem) {
4374 Value *V = GatheredScalars[I * PartSz + K];
4375 if (isConstant(V) && !isa<PoisonValue>(V)) {
4376 SecondVecFound = true;
4377 break;
4378 }
4379 continue;
4380 }
4381 if (Idx < VF) {
4382 if (FirstMin > Idx)
4383 FirstMin = Idx;
4384 } else {
4385 SecondVecFound = true;
4386 break;
4387 }
4388 }
4389 FirstMin = (FirstMin / PartSz) * PartSz;
4390 // Shuffle of at least 2 vectors - ignore.
4391 if (SecondVecFound) {
4392 std::fill(Slice.begin(), Slice.end(), NumScalars);
4393 ShuffledSubMasks.set(I);
4394 continue;
4395 }
4396 for (int K : seq<int>(Limit)) {
4397 int Idx = Mask[I * PartSz + K];
4398 if (Idx == PoisonMaskElem)
4399 continue;
4400 Idx -= FirstMin;
4401 if (Idx >= PartSz) {
4402 SecondVecFound = true;
4403 break;
4404 }
4405 if (CurrentOrder[I * PartSz + Idx] >
4406 static_cast<unsigned>(I * PartSz + K) &&
4407 CurrentOrder[I * PartSz + Idx] !=
4408 static_cast<unsigned>(I * PartSz + Idx))
4409 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4410 }
4411 // Shuffle of at least 2 vectors - ignore.
4412 if (SecondVecFound) {
4413 std::fill(Slice.begin(), Slice.end(), NumScalars);
4414 ShuffledSubMasks.set(I);
4415 continue;
4416 }
4417 }
4418 };
4419 int PartSz = getPartNumElems(NumScalars, NumParts);
4420 if (!ExtractShuffles.empty())
4421 TransformMaskToOrder(
4422 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4423 if (!ExtractShuffles[I])
4424 return 0U;
4425 unsigned VF = 0;
4426 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4427 for (unsigned Idx : seq<unsigned>(Sz)) {
4428 int K = I * PartSz + Idx;
4429 if (ExtractMask[K] == PoisonMaskElem)
4430 continue;
4431 if (!TE.ReuseShuffleIndices.empty())
4432 K = TE.ReuseShuffleIndices[K];
4433 if (!TE.ReorderIndices.empty())
4434 K = std::distance(TE.ReorderIndices.begin(),
4435 find(TE.ReorderIndices, K));
4436 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4437 if (!EI)
4438 continue;
4439 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4440 ->getElementCount()
4441 .getKnownMinValue());
4442 }
4443 return VF;
4444 });
4445 // Check special corner case - single shuffle of the same entry.
4446 if (GatherShuffles.size() == 1 && NumParts != 1) {
4447 if (ShuffledSubMasks.any())
4448 return std::nullopt;
4449 PartSz = NumScalars;
4450 NumParts = 1;
4451 }
4452 if (!Entries.empty())
4453 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4454 if (!GatherShuffles[I])
4455 return 0U;
4456 return std::max(Entries[I].front()->getVectorFactor(),
4457 Entries[I].back()->getVectorFactor());
4458 });
4459 int NumUndefs =
4460 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4461 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4462 return std::nullopt;
4463 return std::move(CurrentOrder);
4464}
4465
4466static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4467 const TargetLibraryInfo &TLI,
4468 bool CompareOpcodes = true) {
4469 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4470 return false;
4471 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4472 if (!GEP1)
4473 return false;
4474 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4475 if (!GEP2)
4476 return false;
4477 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4478 ((isConstant(GEP1->getOperand(1)) &&
4479 isConstant(GEP2->getOperand(1))) ||
4480 !CompareOpcodes ||
4481 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4482 .getOpcode());
4483}
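// Illustrative case (hypothetical IR): two single-index GEPs over the same
// base, e.g. "getelementptr i32, ptr %base, i64 1" and
// "getelementptr i32, ptr %base, i64 3", are considered compatible because
// both indices are constants; pointers with different underlying objects are
// rejected immediately.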
4484
4485/// Calculates minimal alignment as a common alignment.
4486template <typename T>
4487 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4488 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4489 for (Value *V : VL.drop_front())
4490 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4491 return CommonAlignment;
4492}
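// Illustrative example (hypothetical loads): for three loads aligned to 16, 8
// and 4 bytes, the common alignment computed here is Align(4), i.e. the
// minimum over the whole list.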
4493
4494/// Check if \p Order represents reverse order.
4495 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4496 unsigned Sz = Order.size();
4497 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4498 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4499 });
4500}
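// Illustrative example (hypothetical orders): {3, 2, 1, 0} is a reverse order
// for Sz == 4; unused entries equal to Sz are tolerated, so {3, 4, 1, 0} is
// also treated as reversed.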
4501
4502/// Checks if the provided list of pointers \p Pointers represents the strided
4503/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4504/// Otherwise, if \p Inst is not specified, just initialized optional value is
4505/// returned to show that the pointers represent strided pointers. If \p Inst
4506/// specified, the runtime stride is materialized before the given \p Inst.
4507/// \returns std::nullopt if the pointers are not pointers with the runtime
4508/// stride, nullptr or actual stride value, otherwise.
4509static std::optional<Value *>
4511 const DataLayout &DL, ScalarEvolution &SE,
4512 SmallVectorImpl<unsigned> &SortedIndices,
4513 Instruction *Inst = nullptr) {
4514 SmallVector<const SCEV *> SCEVs;
4515 const SCEV *PtrSCEVLowest = nullptr;
4516 const SCEV *PtrSCEVHighest = nullptr;
4517 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4518 // addresses).
4519 for (Value *Ptr : PointerOps) {
4520 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4521 if (!PtrSCEV)
4522 return std::nullopt;
4523 SCEVs.push_back(PtrSCEV);
4524 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4525 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4526 continue;
4527 }
4528 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4529 if (isa<SCEVCouldNotCompute>(Diff))
4530 return std::nullopt;
4531 if (Diff->isNonConstantNegative()) {
4532 PtrSCEVLowest = PtrSCEV;
4533 continue;
4534 }
4535 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4536 if (isa<SCEVCouldNotCompute>(Diff1))
4537 return std::nullopt;
4538 if (Diff1->isNonConstantNegative()) {
4539 PtrSCEVHighest = PtrSCEV;
4540 continue;
4541 }
4542 }
4543 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4544 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4545 if (isa<SCEVCouldNotCompute>(Dist))
4546 return std::nullopt;
4547 int Size = DL.getTypeStoreSize(ElemTy);
4548 auto TryGetStride = [&](const SCEV *Dist,
4549 const SCEV *Multiplier) -> const SCEV * {
4550 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4551 if (M->getOperand(0) == Multiplier)
4552 return M->getOperand(1);
4553 if (M->getOperand(1) == Multiplier)
4554 return M->getOperand(0);
4555 return nullptr;
4556 }
4557 if (Multiplier == Dist)
4558 return SE.getConstant(Dist->getType(), 1);
4559 return SE.getUDivExactExpr(Dist, Multiplier);
4560 };
4561 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4562 const SCEV *Stride = nullptr;
4563 if (Size != 1 || SCEVs.size() > 2) {
4564 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4565 Stride = TryGetStride(Dist, Sz);
4566 if (!Stride)
4567 return std::nullopt;
4568 }
4569 if (!Stride || isa<SCEVConstant>(Stride))
4570 return std::nullopt;
4571 // Iterate through all pointers and check if all distances are
4572 // unique multiples of Stride.
4573 using DistOrdPair = std::pair<int64_t, int>;
4574 auto Compare = llvm::less_first();
4575 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4576 int Cnt = 0;
4577 bool IsConsecutive = true;
4578 for (const SCEV *PtrSCEV : SCEVs) {
4579 unsigned Dist = 0;
4580 if (PtrSCEV != PtrSCEVLowest) {
4581 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4582 const SCEV *Coeff = TryGetStride(Diff, Stride);
4583 if (!Coeff)
4584 return std::nullopt;
4585 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4586 if (!SC || isa<SCEVCouldNotCompute>(SC))
4587 return std::nullopt;
4588 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4589 SE.getMulExpr(Stride, SC)))
4590 ->isZero())
4591 return std::nullopt;
4592 Dist = SC->getAPInt().getZExtValue();
4593 }
4594 // If the strides are not the same or repeated, we can't vectorize.
4595 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4596 return std::nullopt;
4597 auto Res = Offsets.emplace(Dist, Cnt);
4598 if (!Res.second)
4599 return std::nullopt;
4600 // Consecutive order if the inserted element is the last one.
4601 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4602 ++Cnt;
4603 }
4604 if (Offsets.size() != SCEVs.size())
4605 return std::nullopt;
4606 SortedIndices.clear();
4607 if (!IsConsecutive) {
4608 // Fill SortedIndices array only if it is non-consecutive.
4609 SortedIndices.resize(PointerOps.size());
4610 Cnt = 0;
4611 for (const std::pair<int64_t, int> &Pair : Offsets) {
4612 SortedIndices[Cnt] = Pair.second;
4613 ++Cnt;
4614 }
4615 }
4616 if (!Inst)
4617 return nullptr;
4618 SCEVExpander Expander(SE, DL, "strided-load-vec");
4619 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4620}
4621
4622static std::pair<InstructionCost, InstructionCost>
4623getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4624 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4625 Type *ScalarTy, VectorType *VecTy);
4626
4627/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4628/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4629/// insert-subvector pattern.
4630static InstructionCost
4631getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4632 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
4633 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4634 int Index = 0, VectorType *SubTp = nullptr,
4635 ArrayRef<const Value *> Args = std::nullopt) {
4636 if (Kind != TTI::SK_PermuteTwoSrc)
4637 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4638 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4639 int NumSubElts;
4640 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
4641 Mask, NumSrcElts, NumSubElts, Index)) {
4642 if (Index + NumSubElts > NumSrcElts &&
4643 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4644 return TTI.getShuffleCost(
4645 TTI::SK_InsertSubvector,
4646 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4647 TTI::TCK_RecipThroughput, Index, Tp);
4648 }
4649 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4650}
4651
4652BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4653 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4654 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4655 // Check that a vectorized load would load the same memory as a scalar
4656 // load. For example, we don't want to vectorize loads that are smaller
4657 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4658 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4659 // from such a struct, we read/write packed bits disagreeing with the
4660 // unvectorized version.
4661 Type *ScalarTy = VL0->getType();
4662
4663 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4664 return LoadsState::Gather;
4665
4666 // Make sure all loads in the bundle are simple - we can't vectorize
4667 // atomic or volatile loads.
4668 PointerOps.clear();
4669 const unsigned Sz = VL.size();
4670 PointerOps.resize(Sz);
4671 auto *POIter = PointerOps.begin();
4672 for (Value *V : VL) {
4673 auto *L = cast<LoadInst>(V);
4674 if (!L->isSimple())
4675 return LoadsState::Gather;
4676 *POIter = L->getPointerOperand();
4677 ++POIter;
4678 }
4679
4680 Order.clear();
4681 auto *VecTy = getWidenedType(ScalarTy, Sz);
4682 // Check the order of pointer operands or that all pointers are the same.
4683 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4684 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4685 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4686 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4687 "supported with VectorizeNonPowerOf2");
4688 return LoadsState::Gather;
4689 }
4690
4691 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4692 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4693 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4694 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4695 return LoadsState::StridedVectorize;
4696 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4697 return arePointersCompatible(P, PointerOps.front(), *TLI);
4698 })) {
4699 if (IsSorted) {
4700 Value *Ptr0;
4701 Value *PtrN;
4702 if (Order.empty()) {
4703 Ptr0 = PointerOps.front();
4704 PtrN = PointerOps.back();
4705 } else {
4706 Ptr0 = PointerOps[Order.front()];
4707 PtrN = PointerOps[Order.back()];
4708 }
4709 std::optional<int> Diff =
4710 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4711 // Check that the sorted loads are consecutive.
4712 if (static_cast<unsigned>(*Diff) == Sz - 1)
4713 return LoadsState::Vectorize;
4714 // Simple check if not a strided access - clear order.
4715 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4716 // Try to generate strided load node if:
4717 // 1. Target with strided load support is detected.
4718 // 2. The number of loads is greater than MinProfitableStridedLoads,
4719 // or the potential stride <= MaxProfitableLoadStride and the
4720 // potential stride is power-of-2 (to avoid perf regressions for the very
4721 // small number of loads) and max distance > number of loads, or potential
4722 // stride is -1.
4723 // 3. The loads are ordered, or number of unordered loads <=
4724 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4725 // (this check is to avoid extra costs for very expensive shuffles).
4726 // 4. Any pointer operand is an instruction with the users outside of the
4727 // current graph (for masked gathers extra extractelement instructions
4728 // might be required).
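 // For example (illustrative): 4 sorted loads whose maximum pointer
 // distance is 6 elements give Stride == 2, and the per-pointer distances
 // must then be the unique multiples {0, 2, 4, 6} for the strided-load node
 // to be formed.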
4729 auto IsAnyPointerUsedOutGraph =
4730 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
4731 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
4732 return !getTreeEntry(U) && !MustGather.contains(U);
4733 });
4734 });
4735 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4736 ((Sz > MinProfitableStridedLoads ||
4737 (static_cast<unsigned>(std::abs(*Diff)) <=
4738 MaxProfitableLoadStride * Sz &&
4739 isPowerOf2_32(std::abs(*Diff)))) &&
4740 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4741 *Diff == -(static_cast<int>(Sz) - 1))) {
4742 int Stride = *Diff / static_cast<int>(Sz - 1);
4743 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4744 Align Alignment =
4745 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4746 ->getAlign();
4747 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4748 // Iterate through all pointers and check if all distances are
4749 // unique multiples of Stride.
4750 SmallSet<int, 4> Dists;
4751 for (Value *Ptr : PointerOps) {
4752 int Dist = 0;
4753 if (Ptr == PtrN)
4754 Dist = *Diff;
4755 else if (Ptr != Ptr0)
4756 Dist =
4757 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4758 // If the distances are not unique multiples of the stride, we can't
4759 // vectorize.
4760 if (((Dist / Stride) * Stride) != Dist ||
4761 !Dists.insert(Dist).second)
4762 break;
4763 }
4764 if (Dists.size() == Sz)
4765 return LoadsState::StridedVectorize;
4766 }
4767 }
4768 }
4769 }
4770 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4771 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4772 unsigned MinVF = getMinVF(Sz);
4773 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4774 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4775 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4776 unsigned VectorizedCnt = 0;
4777 SmallVector<LoadsState> States;
4778 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4779 Cnt += VF, ++VectorizedCnt) {
4780 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4781 SmallVector<unsigned> Order;
4782 SmallVector<Value *> PointerOps;
4783 LoadsState LS =
4784 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4785 /*TryRecursiveCheck=*/false);
4786 // Check that the sorted loads are consecutive.
4787 if (LS == LoadsState::Gather)
4788 break;
4789 // If reordering is needed, consider it as a high-cost masked gather for now.
4790 if ((LS == LoadsState::Vectorize ||
4791 LS == LoadsState::StridedVectorize) &&
4792 !Order.empty() && !isReverseOrder(Order))
4793 LS = LoadsState::ScatterVectorize;
4794 States.push_back(LS);
4795 }
4796 // Can be vectorized later as a series of loads/insertelements.
4797 if (VectorizedCnt == VL.size() / VF) {
4798 // Compare masked gather cost and loads + insertsubvector costs.
4799 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4800 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4801 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4802 CostKind, ScalarTy, VecTy);
4803 InstructionCost MaskedGatherCost =
4804 TTI.getGatherScatterOpCost(
4805 Instruction::Load, VecTy,
4806 cast<LoadInst>(VL0)->getPointerOperand(),
4807 /*VariableMask=*/false, CommonAlignment, CostKind) +
4808 VectorGEPCost - ScalarGEPCost;
4809 InstructionCost VecLdCost = 0;
4810 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4811 for (auto [I, LS] : enumerate(States)) {
4812 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4813 switch (LS) {
4814 case LoadsState::Vectorize: {
4815 auto [ScalarGEPCost, VectorGEPCost] =
4816 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4817 LI0->getPointerOperand(), Instruction::Load,
4818 CostKind, ScalarTy, SubVecTy);
4819 VecLdCost += TTI.getMemoryOpCost(
4820 Instruction::Load, SubVecTy, LI0->getAlign(),
4821 LI0->getPointerAddressSpace(), CostKind,
4822 TTI::OperandValueInfo()) +
4823 VectorGEPCost - ScalarGEPCost;
4824 break;
4825 }
4826 case LoadsState::StridedVectorize: {
4827 auto [ScalarGEPCost, VectorGEPCost] =
4828 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4829 LI0->getPointerOperand(), Instruction::Load,
4830 CostKind, ScalarTy, SubVecTy);
4831 VecLdCost +=
4832 TTI.getStridedMemoryOpCost(
4833 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4834 /*VariableMask=*/false, CommonAlignment, CostKind) +
4835 VectorGEPCost - ScalarGEPCost;
4836 break;
4837 }
4838 case LoadsState::ScatterVectorize: {
4839 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4840 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4841 LI0->getPointerOperand(), Instruction::GetElementPtr,
4842 CostKind, ScalarTy, SubVecTy);
4843 VecLdCost +=
4844 TTI.getGatherScatterOpCost(
4845 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4846 /*VariableMask=*/false, CommonAlignment, CostKind) +
4847 VectorGEPCost - ScalarGEPCost;
4848 break;
4849 }
4850 case LoadsState::Gather:
4851 llvm_unreachable(
4852 "Expected only consecutive, strided or masked gather loads.");
4853 }
4854 SmallVector<int> ShuffleMask(VL.size());
4855 for (int Idx : seq<int>(0, VL.size()))
4856 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4857 VecLdCost +=
4858 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
4859 ShuffleMask, CostKind, I * VF, SubVecTy);
4860 }
4861 // If masked gather cost is higher - better to vectorize, so
4862 // consider it as a gather node. It will be better estimated
4863 // later.
4864 if (MaskedGatherCost >= VecLdCost)
4865 return true;
4866 }
4867 }
4868 return false;
4869 };
4870 // TODO: need to improve analysis of the pointers, if not all of them are
4871 // GEPs or have > 2 operands, we end up with a gather node, which just
4872 // increases the cost.
4873 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4874 bool ProfitableGatherPointers =
4875 L && Sz > 2 &&
4876 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4877 return L->isLoopInvariant(V);
4878 })) <= Sz / 2;
4879 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4880 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4881 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4882 (GEP && GEP->getNumOperands() == 2 &&
4883 isa<Constant, Instruction>(GEP->getOperand(1)));
4884 })) {
4885 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4886 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4887 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4888 // Check if potential masked gather can be represented as series
4889 // of loads + insertsubvectors.
4890 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4891 // If masked gather cost is higher - better to vectorize, so
4892 // consider it as a gather node. It will be better estimated
4893 // later.
4894 return LoadsState::Gather;
4895 }
4896 return LoadsState::ScatterVectorize;
4897 }
4898 }
4899 }
4900
4901 return LoadsState::Gather;
4902}
4903
4904static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4905 const DataLayout &DL, ScalarEvolution &SE,
4906 SmallVectorImpl<unsigned> &SortedIndices) {
4907 assert(llvm::all_of(
4908 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4909 "Expected list of pointer operands.");
4910 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4911 // Ptr into, sort and return the sorted indices with values next to one
4912 // another.
4913 DenseMap<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4914 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4915
4916 unsigned Cnt = 1;
4917 for (Value *Ptr : VL.drop_front()) {
4918 bool Found = any_of(Bases, [&](auto &Base) {
4919 std::optional<int> Diff =
4920 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4921 /*StrictCheck=*/true);
4922 if (!Diff)
4923 return false;
4924
4925 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4926 return true;
4927 });
4928
4929 if (!Found) {
4930 // If we haven't found enough to usefully cluster, return early.
4931 if (Bases.size() > VL.size() / 2 - 1)
4932 return false;
4933
4934 // Not found already - add a new Base
4935 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4936 }
4937 }
4938
4939 // For each of the bases sort the pointers by Offset and check if any of the
4940 // bases become consecutively allocated.
4941 bool AnyConsecutive = false;
4942 for (auto &Base : Bases) {
4943 auto &Vec = Base.second;
4944 if (Vec.size() > 1) {
4945 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4946 const std::tuple<Value *, int, unsigned> &Y) {
4947 return std::get<1>(X) < std::get<1>(Y);
4948 });
4949 int InitialOffset = std::get<1>(Vec[0]);
4950 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4951 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4952 });
4953 }
4954 }
4955
4956 // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
4957 SortedIndices.clear();
4958 if (!AnyConsecutive)
4959 return false;
4960
4961 // If we have a better order, also sort the base pointers by increasing
4962 // (variable) values if possible, to try and keep the order more regular. In
4963 // order to create a valid strict-weak order we cluster by the Root of gep
4964 // chains and sort within each.
4965 SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
4966 for (auto &Base : Bases) {
4967 Value *Strip = Base.first->stripInBoundsConstantOffsets();
4968 Value *Root = Strip;
4969 while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
4970 Root = Gep->getOperand(0);
4971 SortedBases.emplace_back(Base.first, Strip, Root);
4972 }
4973 auto *Begin = SortedBases.begin();
4974 auto *End = SortedBases.end();
4975 while (Begin != End) {
4976 Value *Root = std::get<2>(*Begin);
4977 auto *Mid = std::stable_partition(
4978 Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
4980 for (auto I = Begin; I < Mid; ++I)
4981 LessThan.try_emplace(std::get<1>(*I));
4982 for (auto I = Begin; I < Mid; ++I) {
4983 Value *V = std::get<1>(*I);
4984 while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
4985 V = Gep->getOperand(0);
4986 if (LessThan.contains(V))
4987 LessThan[V][std::get<1>(*I)] = true;
4988 }
4989 }
4990 std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
4991 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
4992 });
4993 Begin = Mid;
4994 }
4995
4996 // Collect the final order of sorted indices
4997 for (auto Base : SortedBases)
4998 for (auto &T : Bases[std::get<0>(Base)])
4999 SortedIndices.push_back(std::get<2>(T));
5000
5001 assert(SortedIndices.size() == VL.size() &&
5002 "Expected SortedIndices to be the size of VL");
5003 return true;
5004}
5005
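/// Attempts to build a usable ordering for a gather node made entirely of
/// simple loads by clustering and sorting their pointer operands. Returns
/// std::nullopt if any scalar is not a simple load or no better order is
/// found.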
5006std::optional<BoUpSLP::OrdersType>
5007BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5008 assert(TE.isGather() && "Expected gather node only.");
5009 Type *ScalarTy = TE.Scalars[0]->getType();
5010
5011 SmallVector<Value *> Ptrs;
5012 Ptrs.reserve(TE.Scalars.size());
5013 for (Value *V : TE.Scalars) {
5014 auto *L = dyn_cast<LoadInst>(V);
5015 if (!L || !L->isSimple())
5016 return std::nullopt;
5017 Ptrs.push_back(L->getPointerOperand());
5018 }
5019
5020 BoUpSLP::OrdersType Order;
5021 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
5022 return std::move(Order);
5023 return std::nullopt;
5024}
5025
5026/// Check if two insertelement instructions are from the same buildvector.
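/// Two inserts are considered part of the same buildvector sequence if they
/// are in the same basic block, have the same vector type, and one of them is
/// reachable from the other through the chain of base vector operands
/// returned by \p GetBaseOperand, with single-use inserts along the chain and
/// no insertion index reused.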
5027static bool areTwoInsertFromSameBuildVector(
5028 InsertElementInst *VU, InsertElementInst *V,
5029 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5030 // Instructions must be from the same basic blocks.
5031 if (VU->getParent() != V->getParent())
5032 return false;
5033 // Checks if 2 insertelements are from the same buildvector.
5034 if (VU->getType() != V->getType())
5035 return false;
5036 // Multiple used inserts are separate nodes.
5037 if (!VU->hasOneUse() && !V->hasOneUse())
5038 return false;
5039 auto *IE1 = VU;
5040 auto *IE2 = V;
5041 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5042 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5043 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5044 return false;
5045 // Go through the vector operand of insertelement instructions trying to find
5046 // either VU as the original vector for IE2 or V as the original vector for
5047 // IE1.
5048 SmallBitVector ReusedIdx(
5049 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5050 bool IsReusedIdx = false;
5051 do {
5052 if (IE2 == VU && !IE1)
5053 return VU->hasOneUse();
5054 if (IE1 == V && !IE2)
5055 return V->hasOneUse();
5056 if (IE1 && IE1 != V) {
5057 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5058 IsReusedIdx |= ReusedIdx.test(Idx1);
5059 ReusedIdx.set(Idx1);
5060 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5061 IE1 = nullptr;
5062 else
5063 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5064 }
5065 if (IE2 && IE2 != VU) {
5066 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5067 IsReusedIdx |= ReusedIdx.test(Idx2);
5068 ReusedIdx.set(Idx2);
5069 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5070 IE2 = nullptr;
5071 else
5072 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5073 }
5074 } while (!IsReusedIdx && (IE1 || IE2));
5075 return false;
5076}
5077
5078std::optional<BoUpSLP::OrdersType>
5079BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5080 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
5081 if (TE.isNonPowOf2Vec())
5082 return std::nullopt;
5083
5084 // No need to reorder if need to shuffle reuses, still need to shuffle the
5085 // node.
5086 if (!TE.ReuseShuffleIndices.empty()) {
5087 if (isSplat(TE.Scalars))
5088 return std::nullopt;
5089 // Check if reuse shuffle indices can be improved by reordering.
5090 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5091 // is used once in each submask of size <number_of_scalars>.
5092 // Example: 4 scalar values.
5093 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5094 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5095 // element 3 is used twice in the second submask.
5096 unsigned Sz = TE.Scalars.size();
5097 if (TE.isGather()) {
5098 if (std::optional<OrdersType> CurrentOrder =
5099 findReusedOrderedScalars(TE)) {
5100 SmallVector<int> Mask;
5101 fixupOrderingIndices(*CurrentOrder);
5102 inversePermutation(*CurrentOrder, Mask);
5103 ::addMask(Mask, TE.ReuseShuffleIndices);
5104 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5105 unsigned Sz = TE.Scalars.size();
5106 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5107 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5108 if (Idx != PoisonMaskElem)
5109 Res[Idx + K * Sz] = I + K * Sz;
5110 }
5111 return std::move(Res);
5112 }
5113 }
5114 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5115 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5116 2 * TE.getVectorFactor())) == 1)
5117 return std::nullopt;
5118 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5119 Sz)) {
5120 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5121 if (TE.ReorderIndices.empty())
5122 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5123 else
5124 inversePermutation(TE.ReorderIndices, ReorderMask);
5125 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5126 unsigned VF = ReorderMask.size();
5127 OrdersType ResOrder(VF, VF);
5128 unsigned NumParts = divideCeil(VF, Sz);
5129 SmallBitVector UsedVals(NumParts);
5130 for (unsigned I = 0; I < VF; I += Sz) {
5131 int Val = PoisonMaskElem;
5132 unsigned UndefCnt = 0;
5133 unsigned Limit = std::min(Sz, VF - I);
5134 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5135 [&](int Idx) {
5136 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5137 Val = Idx;
5138 if (Idx == PoisonMaskElem)
5139 ++UndefCnt;
5140 return Idx != PoisonMaskElem && Idx != Val;
5141 }) ||
5142 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5143 UndefCnt > Sz / 2)
5144 return std::nullopt;
5145 UsedVals.set(Val);
5146 for (unsigned K = 0; K < NumParts; ++K)
5147 ResOrder[Val + Sz * K] = I + K;
5148 }
5149 return std::move(ResOrder);
5150 }
5151 unsigned VF = TE.getVectorFactor();
5152 // Try build correct order for extractelement instructions.
5153 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5154 TE.ReuseShuffleIndices.end());
5155 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5156 all_of(TE.Scalars, [Sz](Value *V) {
5157 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5158 return Idx && *Idx < Sz;
5159 })) {
5160 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5161 if (TE.ReorderIndices.empty())
5162 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5163 else
5164 inversePermutation(TE.ReorderIndices, ReorderMask);
5165 for (unsigned I = 0; I < VF; ++I) {
5166 int &Idx = ReusedMask[I];
5167 if (Idx == PoisonMaskElem)
5168 continue;
5169 Value *V = TE.Scalars[ReorderMask[Idx]];
5170 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5171 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5172 }
5173 }
5174 // Build the order of the VF size; the reuse shuffles need to be reordered,
5175 // as they are always of VF size.
5176 OrdersType ResOrder(VF);
5177 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5178 auto *It = ResOrder.begin();
5179 for (unsigned K = 0; K < VF; K += Sz) {
5180 OrdersType CurrentOrder(TE.ReorderIndices);
5181 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5182 if (SubMask.front() == PoisonMaskElem)
5183 std::iota(SubMask.begin(), SubMask.end(), 0);
5184 reorderOrder(CurrentOrder, SubMask);
5185 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5186 std::advance(It, Sz);
5187 }
5188 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5189 return Data.index() == Data.value();
5190 }))
5191 return std::nullopt; // No need to reorder.
5192 return std::move(ResOrder);
5193 }
5194 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5195 any_of(TE.UserTreeIndices,
5196 [](const EdgeInfo &EI) {
5197 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5198 }) &&
5199 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5200 return std::nullopt;
5201 if ((TE.State == TreeEntry::Vectorize ||
5202 TE.State == TreeEntry::StridedVectorize) &&
5203 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5204 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5205 !TE.isAltShuffle())
5206 return TE.ReorderIndices;
5207 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5208 auto PHICompare = [&](unsigned I1, unsigned I2) {
5209 Value *V1 = TE.Scalars[I1];
5210 Value *V2 = TE.Scalars[I2];
5211 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5212 return false;
5213 if (V1->getNumUses() < V2->getNumUses())
5214 return true;
5215 if (V1->getNumUses() > V2->getNumUses())
5216 return false;
5217 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5218 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5219 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5220 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5221 if (areTwoInsertFromSameBuildVector(
5222 IE1, IE2,
5223 [](InsertElementInst *II) { return II->getOperand(0); }))
5224 return I1 < I2;
5225 return getElementIndex(IE1) < getElementIndex(IE2);
5226 }
5227 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5228 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5229 if (EE1->getOperand(0) != EE2->getOperand(0))
5230 return I1 < I2;
5231 return getElementIndex(EE1) < getElementIndex(EE2);
5232 }
5233 return I1 < I2;
5234 };
5235 auto IsIdentityOrder = [](const OrdersType &Order) {
5236 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5237 if (Idx != Order[Idx])
5238 return false;
5239 return true;
5240 };
5241 if (!TE.ReorderIndices.empty())
5242 return TE.ReorderIndices;
5243 DenseMap<unsigned, unsigned> PhiToId;
5244 SmallVector<unsigned> Phis(TE.Scalars.size());
5245 std::iota(Phis.begin(), Phis.end(), 0);
5246 OrdersType ResOrder(TE.Scalars.size());
5247 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5248 PhiToId[Id] = Id;
5249 stable_sort(Phis, PHICompare);
5250 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5251 ResOrder[Id] = PhiToId[Phis[Id]];
5252 if (IsIdentityOrder(ResOrder))
5253 return std::nullopt; // No need to reorder.
5254 return std::move(ResOrder);
5255 }
5256 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5257 // TODO: add analysis of other gather nodes with extractelement
5258 // instructions and other values/instructions, not only undefs.
5259 if ((TE.getOpcode() == Instruction::ExtractElement ||
5260 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5261 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5262 all_of(TE.Scalars, [](Value *V) {
5263 auto *EE = dyn_cast<ExtractElementInst>(V);
5264 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5265 })) {
5266 // Check that gather of extractelements can be represented as
5267 // just a shuffle of a single vector.
5268 OrdersType CurrentOrder;
5269 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5270 /*ResizeAllowed=*/true);
5271 if (Reuse || !CurrentOrder.empty())
5272 return std::move(CurrentOrder);
5273 }
5274 // If the gather node is <undef, v, .., poison> and
5275 // insertelement poison, v, 0 [+ permute]
5276 // is cheaper than
5277 // insertelement poison, v, n - try to reorder.
5278 // If rotating the whole graph, exclude the permute cost, the whole graph
5279 // might be transformed.
5280 int Sz = TE.Scalars.size();
5281 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5282 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5283 const auto *It =
5284 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5285 if (It == TE.Scalars.begin())
5286 return OrdersType();
5287 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5288 if (It != TE.Scalars.end()) {
5289 OrdersType Order(Sz, Sz);
5290 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5291 Order[Idx] = 0;
5292 fixupOrderingIndices(Order);
5293 SmallVector<int> Mask;
5294 inversePermutation(Order, Mask);
5295 InstructionCost PermuteCost =
5296 TopToBottom
5297 ? 0
5298 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5299 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5300 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5301 PoisonValue::get(Ty), *It);
5302 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5303 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5304 PoisonValue::get(Ty), *It);
5305 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5306 OrdersType Order(Sz, Sz);
5307 Order[Idx] = 0;
5308 return std::move(Order);
5309 }
5310 }
5311 }
5312 if (isSplat(TE.Scalars))
5313 return std::nullopt;
5314 if (TE.Scalars.size() >= 4)
5315 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5316 return Order;
5317 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5318 return CurrentOrder;
5319 }
5320 return std::nullopt;
5321}
5322
5323/// Checks if the given mask is a "clustered" mask with the same clusters of
5324/// size \p Sz, which are not identity submasks.
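/// For example (illustrative): with \p Sz == 4, the mask
/// {1, 0, 3, 2, 1, 0, 3, 2} is such a mask (the non-identity cluster
/// {1, 0, 3, 2} repeats), while {0, 1, 2, 3, 0, 1, 2, 3} is not, because its
/// first cluster is the identity.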
5325static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5326 unsigned Sz) {
5327 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5328 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5329 return false;
5330 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5331 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5332 if (Cluster != FirstCluster)
5333 return false;
5334 }
5335 return true;
5336}
5337
5338void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5339 // Reorder reuses mask.
5340 reorderReuses(TE.ReuseShuffleIndices, Mask);
5341 const unsigned Sz = TE.Scalars.size();
5342 // For vectorized nodes and non-clustered reuses, no need to do anything else.
5343 if (!TE.isGather() ||
5344 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5345 Sz) ||
5346 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5347 return;
5348 SmallVector<int> NewMask;
5349 inversePermutation(TE.ReorderIndices, NewMask);
5350 addMask(NewMask, TE.ReuseShuffleIndices);
5351 // Clear reorder since it is going to be applied to the new mask.
5352 TE.ReorderIndices.clear();
5353 // Try to improve gathered nodes with clustered reuses, if possible.
5354 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5355 SmallVector<unsigned> NewOrder(Slice);
5356 inversePermutation(NewOrder, NewMask);
5357 reorderScalars(TE.Scalars, NewMask);
5358 // Fill the reuses mask with the identity submasks.
5359 for (auto *It = TE.ReuseShuffleIndices.begin(),
5360 *End = TE.ReuseShuffleIndices.end();
5361 It != End; std::advance(It, Sz))
5362 std::iota(It, std::next(It, Sz), 0);
5363}
5364
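/// Fills the unset slots of \p Order (marked with the value Order.size())
/// either with their own index, when \p SecondaryOrder is empty, or with the
/// corresponding \p SecondaryOrder entry, as long as the chosen value is not
/// already used in \p Order. For example (illustrative): {1, 4, 4, 0} combined
/// with SecondaryOrder {1, 2, 3, 0} yields {1, 2, 3, 0}; combined with an
/// empty SecondaryOrder it yields {1, 4, 2, 0}.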
5365static void combineOrders(MutableArrayRef<unsigned> Order,
5366 ArrayRef<unsigned> SecondaryOrder) {
5367 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5368 "Expected same size of orders");
5369 unsigned Sz = Order.size();
5370 SmallBitVector UsedIndices(Sz);
5371 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5372 if (Order[Idx] != Sz)
5373 UsedIndices.set(Order[Idx]);
5374 }
5375 if (SecondaryOrder.empty()) {
5376 for (unsigned Idx : seq<unsigned>(0, Sz))
5377 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5378 Order[Idx] = Idx;
5379 } else {
5380 for (unsigned Idx : seq<unsigned>(0, Sz))
5381 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5382 !UsedIndices.test(SecondaryOrder[Idx]))
5383 Order[Idx] = SecondaryOrder[Idx];
5384 }
5385}
5386
5387void BoUpSLP::reorderTopToBottom() {
5388 // Maps VF to the graph nodes.
5389 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5390 // ExtractElement gather nodes which can be vectorized and need to handle
5391 // their ordering.
5392 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5393
5394 // Phi nodes can have preferred ordering based on their result users
5395 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5396
5397 // AltShuffles can also have a preferred ordering that leads to fewer
5398 // instructions, e.g., the addsub instruction in x86.
5399 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5400
5401 // Maps a TreeEntry to the reorder indices of external users.
5402 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5403 ExternalUserReorderMap;
5404 // Find all reorderable nodes with the given VF.
5405 // Currently these are vectorized stores, loads, extracts + some gathering of
5406 // extracts.
5407 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5408 const std::unique_ptr<TreeEntry> &TE) {
5409 // Look for external users that will probably be vectorized.
5410 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5411 findExternalStoreUsersReorderIndices(TE.get());
5412 if (!ExternalUserReorderIndices.empty()) {
5413 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5414 ExternalUserReorderMap.try_emplace(TE.get(),
5415 std::move(ExternalUserReorderIndices));
5416 }
5417
5418 // Patterns like [fadd,fsub] can be combined into a single instruction in
5419 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5420 // to take into account their order when looking for the most used order.
5421 if (TE->isAltShuffle()) {
5422 VectorType *VecTy =
5423 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5424 unsigned Opcode0 = TE->getOpcode();
5425 unsigned Opcode1 = TE->getAltOpcode();
5426 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5427 // If this pattern is supported by the target then we consider the order.
5428 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5429 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5430 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5431 }
5432 // TODO: Check the reverse order too.
5433 }
5434
5435 if (std::optional<OrdersType> CurrentOrder =
5436 getReorderingData(*TE, /*TopToBottom=*/true)) {
5437 // Do not include ordering for nodes used in the alt opcode vectorization,
5438 // better to reorder them during the bottom-to-top stage. If we follow the
5439 // order here, it causes reordering of the whole graph, though actually it is
5440 // profitable just to reorder the subgraph that starts from the alternate
5441 // opcode vectorization node. Such nodes already end up with the shuffle
5442 // instruction and it is just enough to change this shuffle rather than
5443 // rotate the scalars for the whole graph.
5444 unsigned Cnt = 0;
5445 const TreeEntry *UserTE = TE.get();
5446 while (UserTE && Cnt < RecursionMaxDepth) {
5447 if (UserTE->UserTreeIndices.size() != 1)
5448 break;
5449 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5450 return EI.UserTE->State == TreeEntry::Vectorize &&
5451 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5452 }))
5453 return;
5454 UserTE = UserTE->UserTreeIndices.back().UserTE;
5455 ++Cnt;
5456 }
5457 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5458 if (!(TE->State == TreeEntry::Vectorize ||
5459 TE->State == TreeEntry::StridedVectorize) ||
5460 !TE->ReuseShuffleIndices.empty())
5461 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5462 if (TE->State == TreeEntry::Vectorize &&
5463 TE->getOpcode() == Instruction::PHI)
5464 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5465 }
5466 });
5467
5468 // Reorder the graph nodes according to their vectorization factor.
5469 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5470 VF /= 2) {
5471 auto It = VFToOrderedEntries.find(VF);
5472 if (It == VFToOrderedEntries.end())
5473 continue;
5474 // Try to find the most profitable order. We are just looking for the most
5475 // used order and reorder scalar elements in the nodes according to this
5476 // most used order.
5477 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5478 // All operands are reordered and used only in this node - propagate the
5479 // most used order to the user node.
5482 OrdersUses;
5484 for (const TreeEntry *OpTE : OrderedEntries) {
5485 // No need to reorder these nodes; we still need to extend and to use a
5486 // shuffle, just need to merge the reordering shuffle and the reuse shuffle.
5487 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5488 continue;
5489 // Count number of orders uses.
5490 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5491 &PhisToOrders]() -> const OrdersType & {
5492 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5493 auto It = GathersToOrders.find(OpTE);
5494 if (It != GathersToOrders.end())
5495 return It->second;
5496 }
5497 if (OpTE->isAltShuffle()) {
5498 auto It = AltShufflesToOrders.find(OpTE);
5499 if (It != AltShufflesToOrders.end())
5500 return It->second;
5501 }
5502 if (OpTE->State == TreeEntry::Vectorize &&
5503 OpTE->getOpcode() == Instruction::PHI) {
5504 auto It = PhisToOrders.find(OpTE);
5505 if (It != PhisToOrders.end())
5506 return It->second;
5507 }
5508 return OpTE->ReorderIndices;
5509 }();
5510 // First consider the order of the external scalar users.
5511 auto It = ExternalUserReorderMap.find(OpTE);
5512 if (It != ExternalUserReorderMap.end()) {
5513 const auto &ExternalUserReorderIndices = It->second;
5514 // If the OpTE vector factor != number of scalars - use natural order,
5515 // it is an attempt to reorder node with reused scalars but with
5516 // external uses.
5517 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5518 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5519 ExternalUserReorderIndices.size();
5520 } else {
5521 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5522 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5523 }
5524 // No other useful reorder data in this entry.
5525 if (Order.empty())
5526 continue;
5527 }
5528 // Stores actually store the mask, not the order, need to invert.
5529 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5530 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5531 SmallVector<int> Mask;
5532 inversePermutation(Order, Mask);
5533 unsigned E = Order.size();
5534 OrdersType CurrentOrder(E, E);
5535 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5536 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5537 });
5538 fixupOrderingIndices(CurrentOrder);
5539 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5540 } else {
5541 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5542 }
5543 }
5544 if (OrdersUses.empty())
5545 continue;
5546 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5547 const unsigned Sz = Order.size();
5548 for (unsigned Idx : seq<unsigned>(0, Sz))
5549 if (Idx != Order[Idx] && Order[Idx] != Sz)
5550 return false;
5551 return true;
5552 };
5553 // Choose the most used order.
5554 unsigned IdentityCnt = 0;
5555 unsigned FilledIdentityCnt = 0;
5556 OrdersType IdentityOrder(VF, VF);
5557 for (auto &Pair : OrdersUses) {
5558 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5559 if (!Pair.first.empty())
5560 FilledIdentityCnt += Pair.second;
5561 IdentityCnt += Pair.second;
5562 combineOrders(IdentityOrder, Pair.first);
5563 }
5564 }
5565 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5566 unsigned Cnt = IdentityCnt;
5567 for (auto &Pair : OrdersUses) {
5568 // Prefer the identity order. But if a filled identity (non-empty order) was
5569 // found with the same number of uses as the new candidate order, we can
5570 // choose this candidate order instead.
5571 if (Cnt < Pair.second ||
5572 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5573 Cnt == Pair.second && !BestOrder.empty() &&
5574 IsIdentityOrder(BestOrder))) {
5575 combineOrders(Pair.first, BestOrder);
5576 BestOrder = Pair.first;
5577 Cnt = Pair.second;
5578 } else {
5579 combineOrders(BestOrder, Pair.first);
5580 }
5581 }
5582 // Set order of the user node.
5583 if (IsIdentityOrder(BestOrder))
5584 continue;
5585 fixupOrderingIndices(BestOrder);
5586 SmallVector<int> Mask;
5587 inversePermutation(BestOrder, Mask);
5588 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5589 unsigned E = BestOrder.size();
5590 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5591 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5592 });
5593 // Do an actual reordering, if profitable.
5594 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5595 // Just do the reordering for the nodes with the given VF.
5596 if (TE->Scalars.size() != VF) {
5597 if (TE->ReuseShuffleIndices.size() == VF) {
5598 // Need to reorder the reuses masks of the operands with smaller VF to
5599 // be able to find the match between the graph nodes and scalar
5600 // operands of the given node during vectorization/cost estimation.
5601 assert(all_of(TE->UserTreeIndices,
5602 [VF, &TE](const EdgeInfo &EI) {
5603 return EI.UserTE->Scalars.size() == VF ||
5604 EI.UserTE->Scalars.size() ==
5605 TE->Scalars.size();
5606 }) &&
5607 "All users must be of VF size.");
5608 // Update ordering of the operands with the smaller VF than the given
5609 // one.
5610 reorderNodeWithReuses(*TE, Mask);
5611 }
5612 continue;
5613 }
5614 if ((TE->State == TreeEntry::Vectorize ||
5615 TE->State == TreeEntry::StridedVectorize) &&
5616 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5617 InsertElementInst>(TE->getMainOp()) &&
5618 !TE->isAltShuffle()) {
5619 // Build correct orders for extract{element,value}, loads and
5620 // stores.
5621 reorderOrder(TE->ReorderIndices, Mask);
5622 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5623 TE->reorderOperands(Mask);
5624 } else {
5625 // Reorder the node and its operands.
5626 TE->reorderOperands(Mask);
5627 assert(TE->ReorderIndices.empty() &&
5628 "Expected empty reorder sequence.");
5629 reorderScalars(TE->Scalars, Mask);
5630 }
5631 if (!TE->ReuseShuffleIndices.empty()) {
5632 // Apply reversed order to keep the original ordering of the reused
5633 // elements to avoid extra reorder indices shuffling.
5634 OrdersType CurrentOrder;
5635 reorderOrder(CurrentOrder, MaskOrder);
5636 SmallVector<int> NewReuses;
5637 inversePermutation(CurrentOrder, NewReuses);
5638 addMask(NewReuses, TE->ReuseShuffleIndices);
5639 TE->ReuseShuffleIndices.swap(NewReuses);
5640 }
5641 }
5642 }
5643}
5644
5645bool BoUpSLP::canReorderOperands(
5646 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5647 ArrayRef<TreeEntry *> ReorderableGathers,
5648 SmallVectorImpl<TreeEntry *> &GatherOps) {
5649 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5650 if (UserTE->isNonPowOf2Vec())
5651 return false;
5652
5653 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5654 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5655 return OpData.first == I &&
5656 (OpData.second->State == TreeEntry::Vectorize ||
5657 OpData.second->State == TreeEntry::StridedVectorize);
5658 }))
5659 continue;
5660 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5661 // Do not reorder if operand node is used by many user nodes.
5662 if (any_of(TE->UserTreeIndices,
5663 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5664 return false;
5665 // Add the node to the list of the ordered nodes with the identity
5666 // order.
5667 Edges.emplace_back(I, TE);
5668 // Add ScatterVectorize nodes to the list of operands, where just
5669 // reordering of the scalars is required. Similar to the gathers, so
5670 // simply add to the list of gathered ops.
5671 // If there are reused scalars, process this node as a regular vectorize
5672 // node, just reorder reuses mask.
5673 if (TE->State != TreeEntry::Vectorize &&
5674 TE->State != TreeEntry::StridedVectorize &&
5675 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5676 GatherOps.push_back(TE);
5677 continue;
5678 }
5679 TreeEntry *Gather = nullptr;
5680 if (count_if(ReorderableGathers,
5681 [&Gather, UserTE, I](TreeEntry *TE) {
5682 assert(TE->State != TreeEntry::Vectorize &&
5683 TE->State != TreeEntry::StridedVectorize &&
5684 "Only non-vectorized nodes are expected.");
5685 if (any_of(TE->UserTreeIndices,
5686 [UserTE, I](const EdgeInfo &EI) {
5687 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5688 })) {
5689 assert(TE->isSame(UserTE->getOperand(I)) &&
5690 "Operand entry does not match operands.");
5691 Gather = TE;
5692 return true;
5693 }
5694 return false;
5695 }) > 1 &&
5696 !allConstant(UserTE->getOperand(I)))
5697 return false;
5698 if (Gather)
5699 GatherOps.push_back(Gather);
5700 }
5701 return true;
5702}
5703
5704void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5705 SetVector<TreeEntry *> OrderedEntries;
5706 DenseSet<const TreeEntry *> GathersToOrders;
5707 // Find all reorderable leaf nodes with the given VF.
5708 // Currently these are vectorized loads, extracts without alternate operands +
5709 // some gathering of extracts.
5710 SmallVector<TreeEntry *> NonVectorized;
5711 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5712 if (TE->State != TreeEntry::Vectorize &&
5713 TE->State != TreeEntry::StridedVectorize)
5714 NonVectorized.push_back(TE.get());
5715 if (std::optional<OrdersType> CurrentOrder =
5716 getReorderingData(*TE, /*TopToBottom=*/false)) {
5717 OrderedEntries.insert(TE.get());
5718 if (!(TE->State == TreeEntry::Vectorize ||
5719 TE->State == TreeEntry::StridedVectorize) ||
5720 !TE->ReuseShuffleIndices.empty())
5721 GathersToOrders.insert(TE.get());
5722 }
5723 }
5724
5725 // 1. Propagate order to the graph nodes that use only reordered nodes.
5726 // I.e., if the node has operands that are reordered, try to keep at least
5727 // one operand in the natural order and reorder the others + reorder the
5728 // user node itself.
5730 while (!OrderedEntries.empty()) {
5731 // 1. Filter out only reordered nodes.
5732 // 2. If the entry has multiple uses - skip it and jump to the next node.
5733 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5734 SmallVector<TreeEntry *> Filtered;
5735 for (TreeEntry *TE : OrderedEntries) {
5736 if (!(TE->State == TreeEntry::Vectorize ||
5737 TE->State == TreeEntry::StridedVectorize ||
5738 (TE->isGather() && GathersToOrders.contains(TE))) ||
5739 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5740 !all_of(drop_begin(TE->UserTreeIndices),
5741 [TE](const EdgeInfo &EI) {
5742 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5743 }) ||
5744 !Visited.insert(TE).second) {
5745 Filtered.push_back(TE);
5746 continue;
5747 }
5748 // Build a map between user nodes and their operands' order to speed up the
5749 // search. The graph currently does not provide this dependency directly.
5750 for (EdgeInfo &EI : TE->UserTreeIndices) {
5751 TreeEntry *UserTE = EI.UserTE;
5752 auto It = Users.find(UserTE);
5753 if (It == Users.end())
5754 It = Users.insert({UserTE, {}}).first;
5755 It->second.emplace_back(EI.EdgeIdx, TE);
5756 }
5757 }
5758 // Erase filtered entries.
5759 for (TreeEntry *TE : Filtered)
5760 OrderedEntries.remove(TE);
5761 SmallVector<
5762 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5763 UsersVec(Users.begin(), Users.end());
5764 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5765 return Data1.first->Idx > Data2.first->Idx;
5766 });
5767 for (auto &Data : UsersVec) {
5768 // Check that operands are used only in the User node.
5769 SmallVector<TreeEntry *> GatherOps;
5770 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5771 GatherOps)) {
5772 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5773 OrderedEntries.remove(Op.second);
5774 continue;
5775 }
5776 // All operands are reordered and used only in this node - propagate the
5777 // most used order to the user node.
5780 OrdersUses;
5781 // Do the analysis for each tree entry only once, otherwise the order of
5782 // the same node may be considered several times, though it might not be
5783 // profitable.
5786 for (const auto &Op : Data.second) {
5787 TreeEntry *OpTE = Op.second;
5788 if (!VisitedOps.insert(OpTE).second)
5789 continue;
5790 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5791 continue;
5792 const auto Order = [&]() -> const OrdersType {
5793 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5794 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5795 .value_or(OrdersType(1));
5796 return OpTE->ReorderIndices;
5797 }();
5798 // The order is partially ordered, skip it in favor of fully non-ordered
5799 // orders.
5800 if (Order.size() == 1)
5801 continue;
5802 unsigned NumOps = count_if(
5803 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5804 return P.second == OpTE;
5805 });
5806 // Stores actually store the mask, not the order, need to invert.
5807 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5808 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5809 SmallVector<int> Mask;
5810 inversePermutation(Order, Mask);
5811 unsigned E = Order.size();
5812 OrdersType CurrentOrder(E, E);
5813 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5814 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5815 });
5816 fixupOrderingIndices(CurrentOrder);
5817 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5818 NumOps;
5819 } else {
5820 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5821 }
5822 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5823 const auto AllowsReordering = [&](const TreeEntry *TE) {
5824 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5825 if (TE->isNonPowOf2Vec())
5826 return false;
5827 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5828 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5829 (IgnoreReorder && TE->Idx == 0))
5830 return true;
5831 if (TE->isGather()) {
5832 if (GathersToOrders.contains(TE))
5833 return !getReorderingData(*TE, /*TopToBottom=*/false)
5834 .value_or(OrdersType(1))
5835 .empty();
5836 return true;
5837 }
5838 return false;
5839 };
5840 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5841 TreeEntry *UserTE = EI.UserTE;
5842 if (!VisitedUsers.insert(UserTE).second)
5843 continue;
5844 // May reorder user node if it requires reordering, has reused
5845 // scalars, is an alternate op vectorize node or its op nodes require
5846 // reordering.
5847 if (AllowsReordering(UserTE))
5848 continue;
5849 // Check if users allow reordering.
5850 // Currently look up just 1 level of operands to avoid an increase in
5851 // compile time.
5852 // It is profitable to reorder if definitely more operands allow
5853 // reordering than those that keep the natural order.
5855 if (static_cast<unsigned>(count_if(
5856 Ops, [UserTE, &AllowsReordering](
5857 const std::pair<unsigned, TreeEntry *> &Op) {
5858 return AllowsReordering(Op.second) &&
5859 all_of(Op.second->UserTreeIndices,
5860 [UserTE](const EdgeInfo &EI) {
5861 return EI.UserTE == UserTE;
5862 });
5863 })) <= Ops.size() / 2)
5864 ++Res.first->second;
5865 }
5866 }
5867 if (OrdersUses.empty()) {
5868 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5869 OrderedEntries.remove(Op.second);
5870 continue;
5871 }
5872 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5873 const unsigned Sz = Order.size();
5874 for (unsigned Idx : seq<unsigned>(0, Sz))
5875 if (Idx != Order[Idx] && Order[Idx] != Sz)
5876 return false;
5877 return true;
5878 };
5879 // Choose the most used order.
5880 unsigned IdentityCnt = 0;
5881 unsigned VF = Data.second.front().second->getVectorFactor();
5882 OrdersType IdentityOrder(VF, VF);
5883 for (auto &Pair : OrdersUses) {
5884 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5885 IdentityCnt += Pair.second;
5886 combineOrders(IdentityOrder, Pair.first);
5887 }
5888 }
5889 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5890 unsigned Cnt = IdentityCnt;
5891 for (auto &Pair : OrdersUses) {
5892 // Prefer the identity order. But if a filled identity (non-empty
5893 // order) was found with the same number of uses as the new candidate
5894 // order, we can choose this candidate order instead.
5895 if (Cnt < Pair.second) {
5896 combineOrders(Pair.first, BestOrder);
5897 BestOrder = Pair.first;
5898 Cnt = Pair.second;
5899 } else {
5900 combineOrders(BestOrder, Pair.first);
5901 }
5902 }
5903 // Set order of the user node.
5904 if (IsIdentityOrder(BestOrder)) {
5905 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5906 OrderedEntries.remove(Op.second);
5907 continue;
5908 }
5909 fixupOrderingIndices(BestOrder);
5910 // Erase operands from OrderedEntries list and adjust their orders.
5911 VisitedOps.clear();
5912 SmallVector<int> Mask;
5913 inversePermutation(BestOrder, Mask);
5914 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5915 unsigned E = BestOrder.size();
5916 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5917 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5918 });
5919 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5920 TreeEntry *TE = Op.second;
5921 OrderedEntries.remove(TE);
5922 if (!VisitedOps.insert(TE).second)
5923 continue;
5924 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5925 reorderNodeWithReuses(*TE, Mask);
5926 continue;
5927 }
5928 // Gathers are processed separately.
5929 if (TE->State != TreeEntry::Vectorize &&
5930 TE->State != TreeEntry::StridedVectorize &&
5931 (TE->State != TreeEntry::ScatterVectorize ||
5932 TE->ReorderIndices.empty()))
5933 continue;
5934 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5935 TE->ReorderIndices.empty()) &&
5936 "Non-matching sizes of user/operand entries.");
5937 reorderOrder(TE->ReorderIndices, Mask);
5938 if (IgnoreReorder && TE == VectorizableTree.front().get())
5939 IgnoreReorder = false;
5940 }
5941 // For gathers we just need to reorder their scalars.
5942 for (TreeEntry *Gather : GatherOps) {
5943 assert(Gather->ReorderIndices.empty() &&
5944 "Unexpected reordering of gathers.");
5945 if (!Gather->ReuseShuffleIndices.empty()) {
5946 // Just reorder reuses indices.
5947 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5948 continue;
5949 }
5950 reorderScalars(Gather->Scalars, Mask);
5951 OrderedEntries.remove(Gather);
5952 }
5953 // Reorder operands of the user node and set the ordering for the user
5954 // node itself.
5955 if (Data.first->State != TreeEntry::Vectorize ||
5956 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5957 Data.first->getMainOp()) ||
5958 Data.first->isAltShuffle())
5959 Data.first->reorderOperands(Mask);
5960 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5961 Data.first->isAltShuffle() ||
5962 Data.first->State == TreeEntry::StridedVectorize) {
5963 reorderScalars(Data.first->Scalars, Mask);
5964 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5965 /*BottomOrder=*/true);
5966 if (Data.first->ReuseShuffleIndices.empty() &&
5967 !Data.first->ReorderIndices.empty() &&
5968 !Data.first->isAltShuffle()) {
5969 // Insert user node to the list to try to sink reordering deeper in
5970 // the graph.
5971 OrderedEntries.insert(Data.first);
5972 }
5973 } else {
5974 reorderOrder(Data.first->ReorderIndices, Mask);
5975 }
5976 }
5977 }
5978 // If the reordering is unnecessary, just remove the reorder.
5979 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5980 VectorizableTree.front()->ReuseShuffleIndices.empty())
5981 VectorizableTree.front()->ReorderIndices.clear();
5982}
5983
5984void BoUpSLP::buildExternalUses(
5985 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5986 DenseMap<Value *, unsigned> ScalarToExtUses;
5987 // Collect the values that we need to extract from the tree.
5988 for (auto &TEPtr : VectorizableTree) {
5989 TreeEntry *Entry = TEPtr.get();
5990
5991 // No need to handle users of gathered values.
5992 if (Entry->isGather())
5993 continue;
5994
5995 // For each lane:
5996 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5997 Value *Scalar = Entry->Scalars[Lane];
5998 if (!isa<Instruction>(Scalar))
5999 continue;
6000 // If all uses were already replaced, there is no need to do it again.
6001 auto It = ScalarToExtUses.find(Scalar);
6002 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6003 continue;
6004
6005 // Check if the scalar is externally used as an extra arg.
6006 const auto *ExtI = ExternallyUsedValues.find(Scalar);
6007 if (ExtI != ExternallyUsedValues.end()) {
6008 int FoundLane = Entry->findLaneForValue(Scalar);
6009 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6010 << FoundLane << " from " << *Scalar << ".\n");
6011 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6012 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6013 continue;
6014 }
6015 for (User *U : Scalar->users()) {
6016 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6017
6018 Instruction *UserInst = dyn_cast<Instruction>(U);
6019 if (!UserInst || isDeleted(UserInst))
6020 continue;
6021
6022 // Ignore users in the user ignore list.
6023 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6024 continue;
6025
6026 // Skip in-tree scalars that become vectors
6027 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6028 // Some in-tree scalars will remain as scalar in vectorized
6029 // instructions. If that is the case, the one in FoundLane will
6030 // be used.
6031 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6032 !doesInTreeUserNeedToExtract(
6033 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
6034 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6035 << ".\n");
6036 assert(!UseEntry->isGather() && "Bad state");
6037 continue;
6038 }
6039 U = nullptr;
6040 if (It != ScalarToExtUses.end()) {
6041 ExternalUses[It->second].User = nullptr;
6042 break;
6043 }
6044 }
6045
6046 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6047 U = nullptr;
6048 int FoundLane = Entry->findLaneForValue(Scalar);
6049 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6050 << " from lane " << FoundLane << " from " << *Scalar
6051 << ".\n");
6052 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6053 ExternalUses.emplace_back(Scalar, U, FoundLane);
6054 if (!U)
6055 break;
6056 }
6057 }
6058 }
6059}
6060
6061DenseMap<Value *, SmallVector<StoreInst *>>
6062BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6063 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
6064 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6065 Value *V = TE->Scalars[Lane];
6066 // Don't iterate over the users of constant data.
6067 if (isa<ConstantData>(V))
6068 continue;
6069 // To save compilation time we don't visit if we have too many users.
6070 if (V->hasNUsesOrMore(UsesLimit))
6071 break;
6072
6073 // Collect stores per pointer object.
6074 for (User *U : V->users()) {
6075 auto *SI = dyn_cast<StoreInst>(U);
6076 // Test whether we can handle the store. V might be a global, which could
6077 // be used in a different function.
6078 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6079 !isValidElementType(SI->getValueOperand()->getType()))
6080 continue;
6081 // Skip the store if it already has a tree entry.
6082 if (getTreeEntry(U))
6083 continue;
6084
6085 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
6086 auto &StoresVec = PtrToStoresMap[Ptr];
6087 // For now just keep one store per pointer object per lane.
6088 // TODO: Extend this to support multiple stores per pointer per lane
6089 if (StoresVec.size() > Lane)
6090 continue;
6091 // Skip if in different BBs.
6092 if (!StoresVec.empty() &&
6093 SI->getParent() != StoresVec.back()->getParent())
6094 continue;
6095 // Make sure that the stores are of the same type.
6096 if (!StoresVec.empty() &&
6097 SI->getValueOperand()->getType() !=
6098 StoresVec.back()->getValueOperand()->getType())
6099 continue;
6100 StoresVec.push_back(SI);
6101 }
6102 }
6103 return PtrToStoresMap;
6104}
6105
6106bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6107 OrdersType &ReorderIndices) const {
6108 // We check whether the stores in StoresVec can form a vector by sorting them
6109 // and checking whether they are consecutive.
6110
6111 // To avoid calling getPointersDiff() while sorting we create a vector of
6112 // pairs {store, offset from first} and sort this instead.
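// Illustrative note (not in the original source): for stores to p+1, p+0,
// p+3, p+2 the offsets relative to the first store are {0, -1, 2, 1}; after
// sorting by offset the accesses are consecutive, and ReorderIndices below
// becomes {1, 0, 3, 2}.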
6113 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
6114 StoreInst *S0 = StoresVec[0];
6115 StoreOffsetVec[0] = {S0, 0};
6116 Type *S0Ty = S0->getValueOperand()->getType();
6117 Value *S0Ptr = S0->getPointerOperand();
6118 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6119 StoreInst *SI = StoresVec[Idx];
6120 std::optional<int> Diff =
6121 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6122 SI->getPointerOperand(), *DL, *SE,
6123 /*StrictCheck=*/true);
6124 // We failed to compare the pointers so just abandon this StoresVec.
6125 if (!Diff)
6126 return false;
6127 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
6128 }
6129
6130 // Sort the vector based on the pointers. We create a copy because we may
6131 // need the original later for calculating the reorder (shuffle) indices.
6132 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
6133 const std::pair<StoreInst *, int> &Pair2) {
6134 int Offset1 = Pair1.second;
6135 int Offset2 = Pair2.second;
6136 return Offset1 < Offset2;
6137 });
6138
6139 // Check if the stores are consecutive by checking if their difference is 1.
6140 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
6141 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
6142 return false;
6143
6144 // Calculate the shuffle indices according to their offset against the sorted
6145 // StoreOffsetVec.
6146 ReorderIndices.reserve(StoresVec.size());
6147 for (StoreInst *SI : StoresVec) {
6148 unsigned Idx = find_if(StoreOffsetVec,
6149 [SI](const std::pair<StoreInst *, int> &Pair) {
6150 return Pair.first == SI;
6151 }) -
6152 StoreOffsetVec.begin();
6153 ReorderIndices.push_back(Idx);
6154 }
6155 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6156 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6157 // same convention here.
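// Illustrative note (not in the original source): if the stores already
// appear in increasing address order, ReorderIndices would be {0, 1, 2, ...}
// and is cleared below, so the identity order is represented by an empty
// vector.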
6158 auto IsIdentityOrder = [](const OrdersType &Order) {
6159 for (unsigned Idx : seq<unsigned>(0, Order.size()))
6160 if (Idx != Order[Idx])
6161 return false;
6162 return true;
6163 };
6164 if (IsIdentityOrder(ReorderIndices))
6165 ReorderIndices.clear();
6166
6167 return true;
6168}
6169
6170#ifndef NDEBUG
6171LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6172 for (unsigned Idx : Order)
6173 dbgs() << Idx << ", ";
6174 dbgs() << "\n";
6175}
6176#endif
6177
6178SmallVector<BoUpSLP::OrdersType, 1>
6179BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6180 unsigned NumLanes = TE->Scalars.size();
6181
6182 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6183 collectUserStores(TE);
6184
6185 // Holds the reorder indices for each candidate store vector that is a user of
6186 // the current TreeEntry.
6187 SmallVector<OrdersType, 1> ExternalReorderIndices;
6188
6189 // Now inspect the stores collected per pointer and look for vectorization
6190 // candidates. For each candidate calculate the reorder index vector and push
6191 // it into `ExternalReorderIndices`
6192 for (const auto &Pair : PtrToStoresMap) {
6193 auto &StoresVec = Pair.second;
6194 // If we have fewer than NumLanes stores, then we can't form a vector.
6195 if (StoresVec.size() != NumLanes)
6196 continue;
6197
6198 // If the stores are not consecutive then abandon this StoresVec.
6199 OrdersType ReorderIndices;
6200 if (!canFormVector(StoresVec, ReorderIndices))
6201 continue;
6202
6203 // We now know that the scalars in StoresVec can form a vector instruction,
6204 // so set the reorder indices.
6205 ExternalReorderIndices.push_back(ReorderIndices);
6206 }
6207 return ExternalReorderIndices;
6208}
6209
6210void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6211 const SmallDenseSet<Value *> &UserIgnoreLst) {
6212 deleteTree();
6213 UserIgnoreList = &UserIgnoreLst;
6214 if (!allSameType(Roots))
6215 return;
6216 buildTree_rec(Roots, 0, EdgeInfo());
6217}
6218
6219void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6220 deleteTree();
6221 if (!allSameType(Roots))
6222 return;
6223 buildTree_rec(Roots, 0, EdgeInfo());
6224}
6225
6226/// \return true if the specified list of values has only one instruction that
6227/// requires scheduling, false otherwise.
6228#ifndef NDEBUG
6229static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6230 Value *NeedsScheduling = nullptr;
6231 for (Value *V : VL) {
6232 if (doesNotNeedToBeScheduled(V))
6233 continue;
6234 if (!NeedsScheduling) {
6235 NeedsScheduling = V;
6236 continue;
6237 }
6238 return false;
6239 }
6240 return NeedsScheduling;
6241}
6242#endif
6243
6244/// Generates a key/subkey pair for the given value to provide effective
6245/// sorting of the values and better detection of vectorizable value
6246/// sequences. The keys are used to sort the values themselves, the subkeys to
6247/// sort values within the resulting subgroups.
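/// Illustrative note (not in the original source): e.g. two simple loads get
/// the same key (derived from the Load opcode and their type), while their
/// subkeys come from \p LoadsSubkeyGenerator, which lets the caller group
/// loads further, e.g. by the distance between their pointers.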
6248static std::pair<size_t, size_t> generateKeySubkey(
6249 Value *V, const TargetLibraryInfo *TLI,
6250 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6251 bool AllowAlternate) {
6252 hash_code Key = hash_value(V->getValueID() + 2);
6253 hash_code SubKey = hash_value(0);
6254 // Sort the loads by the distance between the pointers.
6255 if (auto *LI = dyn_cast<LoadInst>(V)) {
6256 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
6257 if (LI->isSimple())
6258 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
6259 else
6260 Key = SubKey = hash_value(LI);
6261 } else if (isVectorLikeInstWithConstOps(V)) {
6262 // Sort extracts by the vector operands.
6263 if (isa<ExtractElementInst, UndefValue>(V))
6264 Key = hash_value(Value::UndefValueVal + 1);
6265 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
6266 if (!isUndefVector(EI->getVectorOperand()).all() &&
6267 !isa<UndefValue>(EI->getIndexOperand()))
6268 SubKey = hash_value(EI->getVectorOperand());
6269 }
6270 } else if (auto *I = dyn_cast<Instruction>(V)) {
6271 // Sort other instructions just by the opcodes except for CMPInst.
6272 // For CMP also sort by the predicate kind.
6273 if ((isa<BinaryOperator, CastInst>(I)) &&
6274 isValidForAlternation(I->getOpcode())) {
6275 if (AllowAlternate)
6276 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
6277 else
6278 Key = hash_combine(hash_value(I->getOpcode()), Key);
6279 SubKey = hash_combine(
6280 hash_value(I->getOpcode()), hash_value(I->getType()),
6281 hash_value(isa<BinaryOperator>(I)
6282 ? I->getType()
6283 : cast<CastInst>(I)->getOperand(0)->getType()));
6284 // For casts, look through the only operand to improve compile time.
6285 if (isa<CastInst>(I)) {
6286 std::pair<size_t, size_t> OpVals =
6287 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
6288 /*AllowAlternate=*/true);
6289 Key = hash_combine(OpVals.first, Key);
6290 SubKey = hash_combine(OpVals.first, SubKey);
6291 }
6292 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
6293 CmpInst::Predicate Pred = CI->getPredicate();
6294 if (CI->isCommutative())
6295 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
6296 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
6297 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
6298 hash_value(SwapPred),
6299 hash_value(CI->getOperand(0)->getType()));
6300 } else if (auto *Call = dyn_cast<CallInst>(I)) {
6301 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
6302 if (isTriviallyVectorizable(ID)) {
6303 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
6304 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
6305 SubKey = hash_combine(hash_value(I->getOpcode()),
6306 hash_value(Call->getCalledFunction()));
6307 } else {
6308 Key = hash_combine(hash_value(Call), Key);
6309 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
6310 }
6311 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6312 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
6313 hash_value(Op.Tag), SubKey);
6314 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
6315 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
6316 SubKey = hash_value(Gep->getPointerOperand());
6317 else
6318 SubKey = hash_value(Gep);
6319 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
6320 !isa<ConstantInt>(I->getOperand(1))) {
6321 // Do not try to vectorize instructions with potentially high cost.
6322 SubKey = hash_value(I);
6323 } else {
6324 SubKey = hash_value(I->getOpcode());
6325 }
6326 Key = hash_combine(hash_value(I->getParent()), Key);
6327 }
6328 return std::make_pair(Key, SubKey);
6329}
6330
6331/// Checks if the specified instruction \p I is an alternate operation for
6332/// the given \p MainOp and \p AltOp instructions.
6333static bool isAlternateInstruction(const Instruction *I,
6334 const Instruction *MainOp,
6335 const Instruction *AltOp,
6336 const TargetLibraryInfo &TLI);
6337
6338bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6339 ArrayRef<Value *> VL) const {
6340 unsigned Opcode0 = S.getOpcode();
6341 unsigned Opcode1 = S.getAltOpcode();
6342 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6343 // If this pattern is supported by the target then consider it profitable.
6344 if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
6345 Opcode0, Opcode1, OpcodeMask))
6346 return true;
6347 SmallVector<ValueList> Operands;
6348 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6349 Operands.emplace_back();
6350 // Prepare the operand vector.
6351 for (Value *V : VL)
6352 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6353 }
6354 if (Operands.size() == 2) {
6355 // Try to find the best operand candidates.
6356 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6357 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6358 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6359 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6360 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6361 std::optional<int> Res = findBestRootPair(Candidates);
6362 switch (Res.value_or(0)) {
6363 case 0:
6364 break;
6365 case 1:
6366 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6367 break;
6368 case 2:
6369 std::swap(Operands[0][I], Operands[1][I]);
6370 break;
6371 default:
6372 llvm_unreachable("Unexpected index.");
6373 }
6374 }
6375 }
6376 DenseSet<unsigned> UniqueOpcodes;
6377 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6378 unsigned NonInstCnt = 0;
6379 // Estimate number of instructions, required for the vectorized node and for
6380 // the buildvector node.
6381 unsigned UndefCnt = 0;
6382 // Count the number of extra shuffles, required for vector nodes.
6383 unsigned ExtraShuffleInsts = 0;
6384 // Check whether the operand lists contain the same values, forming either a
6385 // perfect diamond match or a shuffled match, so they are not counted twice.
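// Illustrative note (not in the original source): for VL == {a + a, b - b}
// both operand lists are {a, b} (a perfect diamond) and only one is kept;
// for VL == {a + b, b - a} the second list is a permutation of the first, so
// one list is dropped and an extra shuffle is accounted for instead.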
6386 if (Operands.size() == 2) {
6387 // Do not count same operands twice.
6388 if (Operands.front() == Operands.back()) {
6389 Operands.erase(Operands.begin());
6390 } else if (!allConstant(Operands.front()) &&
6391 all_of(Operands.front(), [&](Value *V) {
6392 return is_contained(Operands.back(), V);
6393 })) {
6394 Operands.erase(Operands.begin());
6395 ++ExtraShuffleInsts;
6396 }
6397 }
6398 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6399 // Vectorize the node if:
6400 // 1. At least one operand is constant or a splat.
6401 // 2. Operands have many loop invariants (while the instructions themselves
6402 // are not loop invariant).
6403 // 3. At least one unique operand is expected to be vectorized.
6404 return none_of(Operands,
6405 [&](ArrayRef<Value *> Op) {
6406 if (allConstant(Op) ||
6407 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6408 getSameOpcode(Op, *TLI).MainOp))
6409 return false;
6410 DenseMap<Value *, unsigned> Uniques;
6411 for (Value *V : Op) {
6412 if (isa<Constant, ExtractElementInst>(V) ||
6413 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6414 if (isa<UndefValue>(V))
6415 ++UndefCnt;
6416 continue;
6417 }
6418 auto Res = Uniques.try_emplace(V, 0);
6419 // Found first duplicate - need to add shuffle.
6420 if (!Res.second && Res.first->second == 1)
6421 ++ExtraShuffleInsts;
6422 ++Res.first->getSecond();
6423 if (auto *I = dyn_cast<Instruction>(V))
6424 UniqueOpcodes.insert(I->getOpcode());
6425 else if (Res.second)
6426 ++NonInstCnt;
6427 }
6428 return none_of(Uniques, [&](const auto &P) {
6429 return P.first->hasNUsesOrMore(P.second + 1) &&
6430 none_of(P.first->users(), [&](User *U) {
6431 return getTreeEntry(U) || Uniques.contains(U);
6432 });
6433 });
6434 }) ||
6435 // Do not vectorize node, if estimated number of vector instructions is
6436 // more than estimated number of buildvector instructions. Number of
6437 // vector operands is number of vector instructions + number of vector
6438 // instructions for operands (buildvectors). Number of buildvector
6439 // instructions is just number_of_operands * number_of_scalars.
6440 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6441 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6442 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6443}
6444
6445BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6446 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6447 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6448 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6449
6450 unsigned ShuffleOrOp =
6451 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6452 auto *VL0 = cast<Instruction>(S.OpValue);
6453 switch (ShuffleOrOp) {
6454 case Instruction::PHI: {
6455 // Too many operands - gather, most probably won't be vectorized.
6456 if (VL0->getNumOperands() > MaxPHINumOperands)
6457 return TreeEntry::NeedToGather;
6458 // Check for terminator values (e.g. invoke).
6459 for (Value *V : VL)
6460 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6461 Instruction *Term = dyn_cast<Instruction>(Incoming);
6462 if (Term && Term->isTerminator()) {
6463 LLVM_DEBUG(dbgs()
6464 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6465 return TreeEntry::NeedToGather;
6466 }
6467 }
6468
6469 return TreeEntry::Vectorize;
6470 }
6471 case Instruction::ExtractValue:
6472 case Instruction::ExtractElement: {
6473 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6474 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6475 if (!isPowerOf2_32(VL.size()))
6476 return TreeEntry::NeedToGather;
6477 if (Reuse || !CurrentOrder.empty())
6478 return TreeEntry::Vectorize;
6479 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6480 return TreeEntry::NeedToGather;
6481 }
6482 case Instruction::InsertElement: {
6483 // Check that we have a buildvector and not a shuffle of 2 or more
6484 // different vectors.
6485 ValueSet SourceVectors;
6486 for (Value *V : VL) {
6487 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6488 assert(getElementIndex(V) != std::nullopt &&
6489 "Non-constant or undef index?");
6490 }
6491
6492 if (count_if(VL, [&SourceVectors](Value *V) {
6493 return !SourceVectors.contains(V);
6494 }) >= 2) {
6495 // Found 2nd source vector - cancel.
6496 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6497 "different source vectors.\n");
6498 return TreeEntry::NeedToGather;
6499 }
6500
6501 if (any_of(VL, [&SourceVectors](Value *V) {
6502 // The last InsertElement can have multiple uses.
6503 return SourceVectors.contains(V) && !V->hasOneUse();
6504 })) {
6505 assert(SLPReVec && "Only supported by REVEC.");
6506 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6507 "multiple uses.\n");
6508 return TreeEntry::NeedToGather;
6509 }
6510
6511 return TreeEntry::Vectorize;
6512 }
6513 case Instruction::Load: {
6514 // Check that a vectorized load would load the same memory as a scalar
6515 // load. For example, we don't want to vectorize loads that are smaller
6516 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6517 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6518 // from such a struct, we read/write packed bits disagreeing with the
6519 // unvectorized version.
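// Illustrative note (not in the original source): an i2 value has a type
// size of 2 bits but an alloc size of 8 bits, so a vector of i2 would not
// cover the same memory as the corresponding scalar accesses.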
6520 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6521 case LoadsState::Vectorize:
6522 return TreeEntry::Vectorize;
6523 case LoadsState::ScatterVectorize:
6524 return TreeEntry::ScatterVectorize;
6525 case LoadsState::StridedVectorize:
6526 return TreeEntry::StridedVectorize;
6527 case LoadsState::Gather:
6528#ifndef NDEBUG
6529 Type *ScalarTy = VL0->getType();
6530 if (DL->getTypeSizeInBits(ScalarTy) !=
6531 DL->getTypeAllocSizeInBits(ScalarTy))
6532 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6533 else if (any_of(VL,
6534 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6535 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6536 else
6537 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6538#endif // NDEBUG
6539 return TreeEntry::NeedToGather;
6540 }
6541 llvm_unreachable("Unexpected state of loads");
6542 }
6543 case Instruction::ZExt:
6544 case Instruction::SExt:
6545 case Instruction::FPToUI:
6546 case Instruction::FPToSI:
6547 case Instruction::FPExt:
6548 case Instruction::PtrToInt:
6549 case Instruction::IntToPtr:
6550 case Instruction::SIToFP:
6551 case Instruction::UIToFP:
6552 case Instruction::Trunc:
6553 case Instruction::FPTrunc:
6554 case Instruction::BitCast: {
6555 Type *SrcTy = VL0->getOperand(0)->getType();
6556 for (Value *V : VL) {
6557 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6558 if (Ty != SrcTy || !isValidElementType(Ty)) {
6559 LLVM_DEBUG(
6560 dbgs() << "SLP: Gathering casts with different src types.\n");
6561 return TreeEntry::NeedToGather;
6562 }
6563 }
6564 return TreeEntry::Vectorize;
6565 }
6566 case Instruction::ICmp:
6567 case Instruction::FCmp: {
6568 // Check that all of the compares have the same predicate.
6569 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6570 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6571 Type *ComparedTy = VL0->getOperand(0)->getType();
6572 for (Value *V : VL) {
6573 CmpInst *Cmp = cast<CmpInst>(V);
6574 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6575 Cmp->getOperand(0)->getType() != ComparedTy) {
6576 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6577 return TreeEntry::NeedToGather;
6578 }
6579 }
6580 return TreeEntry::Vectorize;
6581 }
6582 case Instruction::Select:
6583 case Instruction::FNeg:
6584 case Instruction::Add:
6585 case Instruction::FAdd:
6586 case Instruction::Sub:
6587 case Instruction::FSub:
6588 case Instruction::Mul:
6589 case Instruction::FMul:
6590 case Instruction::UDiv:
6591 case Instruction::SDiv:
6592 case Instruction::FDiv:
6593 case Instruction::URem:
6594 case Instruction::SRem:
6595 case Instruction::FRem:
6596 case Instruction::Shl:
6597 case Instruction::LShr:
6598 case Instruction::AShr:
6599 case Instruction::And:
6600 case Instruction::Or:
6601 case Instruction::Xor:
6602 case Instruction::Freeze:
6603 return TreeEntry::Vectorize;
6604 case Instruction::GetElementPtr: {
6605 // We don't combine GEPs with complicated (nested) indexing.
6606 for (Value *V : VL) {
6607 auto *I = dyn_cast<GetElementPtrInst>(V);
6608 if (!I)
6609 continue;
6610 if (I->getNumOperands() != 2) {
6611 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6612 return TreeEntry::NeedToGather;
6613 }
6614 }
6615
6616 // We can't combine several GEPs into one vector if they operate on
6617 // different types.
6618 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6619 for (Value *V : VL) {
6620 auto *GEP = dyn_cast<GEPOperator>(V);
6621 if (!GEP)
6622 continue;
6623 Type *CurTy = GEP->getSourceElementType();
6624 if (Ty0 != CurTy) {
6625 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6626 return TreeEntry::NeedToGather;
6627 }
6628 }
6629
6630 // We don't combine GEPs with non-constant indexes.
6631 Type *Ty1 = VL0->getOperand(1)->getType();
6632 for (Value *V : VL) {
6633 auto *I = dyn_cast<GetElementPtrInst>(V);
6634 if (!I)
6635 continue;
6636 auto *Op = I->getOperand(1);
6637 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6638 (Op->getType() != Ty1 &&
6639 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6640 Op->getType()->getScalarSizeInBits() >
6641 DL->getIndexSizeInBits(
6642 V->getType()->getPointerAddressSpace())))) {
6643 LLVM_DEBUG(
6644 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6645 return TreeEntry::NeedToGather;
6646 }
6647 }
6648
6649 return TreeEntry::Vectorize;
6650 }
6651 case Instruction::Store: {
6652 // Check if the stores are consecutive or if we need to swizzle them.
6653 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6654 // Avoid types that are padded when being allocated as scalars, while
6655 // being packed together in a vector (such as i1).
6656 if (DL->getTypeSizeInBits(ScalarTy) !=
6657 DL->getTypeAllocSizeInBits(ScalarTy)) {
6658 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6659 return TreeEntry::NeedToGather;
6660 }
6661 // Make sure all stores in the bundle are simple - we can't vectorize
6662 // atomic or volatile stores.
6663 for (Value *V : VL) {
6664 auto *SI = cast<StoreInst>(V);
6665 if (!SI->isSimple()) {
6666 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6667 return TreeEntry::NeedToGather;
6668 }
6669 PointerOps.push_back(SI->getPointerOperand());
6670 }
6671
6672 // Check the order of pointer operands.
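// Illustrative note (not in the original source): CurrentOrder lists the
// element indices in increasing address order, e.g. for pointer offsets
// {2, 0, 1, 3} it becomes {1, 2, 0, 3}, so its first/last entries name the
// lowest/highest addressed pointers used below.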
6673 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6674 Value *Ptr0;
6675 Value *PtrN;
6676 if (CurrentOrder.empty()) {
6677 Ptr0 = PointerOps.front();
6678 PtrN = PointerOps.back();
6679 } else {
6680 Ptr0 = PointerOps[CurrentOrder.front()];
6681 PtrN = PointerOps[CurrentOrder.back()];
6682 }
6683 std::optional<int> Dist =
6684 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6685 // Check that the sorted pointer operands are consecutive.
6686 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6687 return TreeEntry::Vectorize;
6688 }
6689
6690 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6691 return TreeEntry::NeedToGather;
6692 }
6693 case Instruction::Call: {
6694 // Check if the calls are all to the same vectorizable intrinsic or
6695 // library function.
6696 CallInst *CI = cast<CallInst>(VL0);
6697 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6698
6699 VFShape Shape = VFShape::get(
6700 CI->getFunctionType(),
6701 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6702 false /*HasGlobalPred*/);
6703 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6704
6705 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6706 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6707 return TreeEntry::NeedToGather;
6708 }
6709 Function *F = CI->getCalledFunction();
6710 unsigned NumArgs = CI->arg_size();
6711 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6712 for (unsigned J = 0; J != NumArgs; ++J)
6713 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6714 ScalarArgs[J] = CI->getArgOperand(J);
6715 for (Value *V : VL) {
6716 CallInst *CI2 = dyn_cast<CallInst>(V);
6717 if (!CI2 || CI2->getCalledFunction() != F ||
6718 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6719 (VecFunc &&
6720 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6721 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6722 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6723 << "\n");
6724 return TreeEntry::NeedToGather;
6725 }
6726 // Some intrinsics have scalar arguments, which must be the same across the
6727 // bundle for them to be vectorized.
6728 for (unsigned J = 0; J != NumArgs; ++J) {
6729 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6730 Value *A1J = CI2->getArgOperand(J);
6731 if (ScalarArgs[J] != A1J) {
6732 LLVM_DEBUG(dbgs()
6733 << "SLP: mismatched arguments in call:" << *CI
6734 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6735 return TreeEntry::NeedToGather;
6736 }
6737 }
6738 }
6739 // Verify that the bundle operands are identical between the two calls.
6740 if (CI->hasOperandBundles() &&
6741 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6742 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6743 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6744 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6745 << "!=" << *V << '\n');
6746 return TreeEntry::NeedToGather;
6747 }
6748 }
6749
6750 return TreeEntry::Vectorize;
6751 }
6752 case Instruction::ShuffleVector: {
6753 if (!S.isAltShuffle()) {
6754 // REVEC can support non-alternate shuffles.
6755 if (SLPReVec && getShufflevectorNumGroups(VL))
6756 return TreeEntry::Vectorize;
6757 // If this is not an alternate sequence of opcode like add-sub
6758 // then do not vectorize this instruction.
6759 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6760 return TreeEntry::NeedToGather;
6761 }
6762 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6763 LLVM_DEBUG(
6764 dbgs()
6765 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6766 "the whole alt sequence is not profitable.\n");
6767 return TreeEntry::NeedToGather;
6768 }
6769
6770 return TreeEntry::Vectorize;
6771 }
6772 default:
6773 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6774 return TreeEntry::NeedToGather;
6775 }
6776}
6777
6778namespace {
6779/// Allows correct handling of the operands of PHI nodes, based on the order of
6780/// incoming basic blocks/values of the \p Main PHINode.
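/// Illustrative note (not in the original source): when another PHI lists the
/// same predecessors in a different order, its incoming values are looked up
/// by the incoming block of \p Main rather than by operand position.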
6781class PHIHandler {
6782 DominatorTree &DT;
6783 PHINode *Main = nullptr;
6784 SmallVector<Value *> Phis;
6785 SmallVector<SmallVector<Value *>> Operands;
6786
6787public:
6788 PHIHandler() = delete;
6789 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6790 : DT(DT), Main(Main), Phis(Phis),
6791 Operands(Main->getNumIncomingValues(),
6792 SmallVector<Value *>(Phis.size(), nullptr)) {}
6793 void buildOperands() {
6794 constexpr unsigned FastLimit = 4;
6795 if (Main->getNumIncomingValues() <= FastLimit) {
6796 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6797 BasicBlock *InBB = Main->getIncomingBlock(I);
6798 if (!DT.isReachableFromEntry(InBB)) {
6799 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6800 continue;
6801 }
6802 // Prepare the operand vector.
6803 for (auto [Idx, V] : enumerate(Phis)) {
6804 auto *P = cast<PHINode>(V);
6805 if (P->getIncomingBlock(I) == InBB)
6806 Operands[I][Idx] = P->getIncomingValue(I);
6807 else
6808 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6809 }
6810 }
6811 return;
6812 }
6813 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6814 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6815 BasicBlock *InBB = Main->getIncomingBlock(I);
6816 if (!DT.isReachableFromEntry(InBB)) {
6817 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6818 continue;
6819 }
6820 Blocks.try_emplace(InBB).first->second.push_back(I);
6821 }
6822 for (auto [Idx, V] : enumerate(Phis)) {
6823 auto *P = cast<PHINode>(V);
6824 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6825 BasicBlock *InBB = P->getIncomingBlock(I);
6826 if (InBB == Main->getIncomingBlock(I)) {
6827 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6828 continue;
6829 Operands[I][Idx] = P->getIncomingValue(I);
6830 continue;
6831 }
6832 auto It = Blocks.find(InBB);
6833 if (It == Blocks.end())
6834 continue;
6835 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6836 }
6837 }
6838 for (const auto &P : Blocks) {
6839 if (P.getSecond().size() <= 1)
6840 continue;
6841 unsigned BasicI = P.getSecond().front();
6842 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6843 assert(all_of(enumerate(Operands[I]),
6844 [&](const auto &Data) {
6845 return !Data.value() ||
6846 Data.value() == Operands[BasicI][Data.index()];
6847 }) &&
6848 "Expected empty operands list.");
6849 Operands[I] = Operands[BasicI];
6850 }
6851 }
6852 }
6853 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6854};
6855} // namespace
6856
6857void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6858 const EdgeInfo &UserTreeIdx) {
6859 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6860
6861 SmallVector<int> ReuseShuffleIndices;
6862 SmallVector<Value *> UniqueValues;
6863 SmallVector<Value *> NonUniqueValueVL;
6864 auto TryToFindDuplicates = [&](const InstructionsState &S,
6865 bool DoNotFail = false) {
6866 // Check that every instruction appears once in this bundle.
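// Illustrative note (not in the original source): for non-constant scalars
// VL == {a, b, a, c} this produces UniqueValues == {a, b, c} and
// ReuseShuffleIndices == {0, 1, 0, 2}, i.e. a shuffle mask that rebuilds the
// original bundle from the deduplicated one.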
6867 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6868 for (Value *V : VL) {
6869 if (isConstant(V)) {
6870 ReuseShuffleIndices.emplace_back(
6871 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6872 UniqueValues.emplace_back(V);
6873 continue;
6874 }
6875 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6876 ReuseShuffleIndices.emplace_back(Res.first->second);
6877 if (Res.second)
6878 UniqueValues.emplace_back(V);
6879 }
6880 size_t NumUniqueScalarValues = UniqueValues.size();
6881 if (NumUniqueScalarValues == VL.size()) {
6882 ReuseShuffleIndices.clear();
6883 } else {
6884 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6885 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6886 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6887 "for nodes with padding.\n");
6888 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6889 return false;
6890 }
6891 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6892 if (NumUniqueScalarValues <= 1 ||
6893 (UniquePositions.size() == 1 && all_of(UniqueValues,
6894 [](Value *V) {
6895 return isa<UndefValue>(V) ||
6896 !isConstant(V);
6897 })) ||
6898 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6899 if (DoNotFail && UniquePositions.size() > 1 &&
6900 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6901 all_of(UniqueValues, [=](Value *V) {
6902 return isa<ExtractElementInst>(V) ||
6903 areAllUsersVectorized(cast<Instruction>(V),
6904 UserIgnoreList);
6905 })) {
6906 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6907 if (PWSz == VL.size()) {
6908 ReuseShuffleIndices.clear();
6909 } else {
6910 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6911 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6912 UniqueValues.back());
6913 VL = NonUniqueValueVL;
6914 }
6915 return true;
6916 }
6917 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6918 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6919 return false;
6920 }
6921 VL = UniqueValues;
6922 }
6923 return true;
6924 };
6925
6926 InstructionsState S = getSameOpcode(VL, *TLI);
6927
6928 // Don't vectorize ephemeral values.
6929 if (!EphValues.empty()) {
6930 for (Value *V : VL) {
6931 if (EphValues.count(V)) {
6932 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6933 << ") is ephemeral.\n");
6934 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6935 return;
6936 }
6937 }
6938 }
6939
6940 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6941 // a load), in which case peek through to include it in the tree, without
6942 // ballooning over-budget.
6943 if (Depth >= RecursionMaxDepth &&
6944 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6945 VL.size() >= 4 &&
6946 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6947 return match(I,
6948 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6949 cast<Instruction>(I)->getOpcode() ==
6950 cast<Instruction>(S.MainOp)->getOpcode();
6951 })))) {
6952 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6953 if (TryToFindDuplicates(S))
6954 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6955 ReuseShuffleIndices);
6956 return;
6957 }
6958
6959 // Don't handle scalable vectors
6960 if (S.getOpcode() == Instruction::ExtractElement &&
6961 isa<ScalableVectorType>(
6962 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6963 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6964 if (TryToFindDuplicates(S))
6965 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6966 ReuseShuffleIndices);
6967 return;
6968 }
6969
6970 // Don't handle vectors.
6971 if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6972 !isa<InsertElementInst>(S.OpValue)) {
6973 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6974 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6975 return;
6976 }
6977
6978 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6979 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6980 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6981 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6982 return;
6983 }
6984
6985 // If all of the operands are identical or constant we have a simple solution.
6986 // If we deal with insert/extract instructions, they all must have constant
6987 // indices, otherwise we should gather them, not try to vectorize.
6988 // If this is an alternate-op node with 2 elements whose operands would be
6989 // gathered - do not vectorize.
6990 auto &&NotProfitableForVectorization = [&S, this,
6991 Depth](ArrayRef<Value *> VL) {
6992 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6993 return false;
6994 if (VectorizableTree.size() < MinTreeSize)
6995 return false;
6996 if (Depth >= RecursionMaxDepth - 1)
6997 return true;
6998 // Check if all operands are extracts, part of vector node or can build a
6999 // regular vectorize node.
7000 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
7001 for (Value *V : VL) {
7002 auto *I = cast<Instruction>(V);
7003 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
7004 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
7005 }));
7006 }
7007 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
7008 if ((IsCommutative &&
7009 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
7010 (!IsCommutative &&
7011 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
7012 return true;
7013 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
7014 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
7015 auto *I1 = cast<Instruction>(VL.front());
7016 auto *I2 = cast<Instruction>(VL.back());
7017 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
7018 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
7019 I2->getOperand(Op));
7020 if (static_cast<unsigned>(count_if(
7021 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
7022 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
7023 })) >= S.MainOp->getNumOperands() / 2)
7024 return false;
7025 if (S.MainOp->getNumOperands() > 2)
7026 return true;
7027 if (IsCommutative) {
7028 // Check permuted operands.
7029 Candidates.clear();
7030 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
7031 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
7032 I2->getOperand((Op + 1) % E));
7033 if (any_of(
7034 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
7035 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
7036 }))
7037 return false;
7038 }
7039 return true;
7040 };
7041 SmallVector<unsigned> SortedIndices;
7042 BasicBlock *BB = nullptr;
7043 bool IsScatterVectorizeUserTE =
7044 UserTreeIdx.UserTE &&
7045 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
7046 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
7047 bool AreScatterAllGEPSameBlock =
7048 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
7049 VL.size() > 2 &&
7050 all_of(VL,
7051 [&BB](Value *V) {
7052 auto *I = dyn_cast<GetElementPtrInst>(V);
7053 if (!I)
7054 return doesNotNeedToBeScheduled(V);
7055 if (!BB)
7056 BB = I->getParent();
7057 return BB == I->getParent() && I->getNumOperands() == 2;
7058 }) &&
7059 BB &&
7060 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
7061 SortedIndices));
7062 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
7063 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
7064 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
7065 S.OpValue) &&
7066 !all_of(VL, isVectorLikeInstWithConstOps)) ||
7067 NotProfitableForVectorization(VL)) {
7068 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
7069 if (TryToFindDuplicates(S))
7070 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7071 ReuseShuffleIndices);
7072 return;
7073 }
7074
7075 // We now know that this is a vector of instructions of the same type from
7076 // the same block.
7077
7078 // Check if this is a duplicate of another entry.
7079 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
7080 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
7081 if (!E->isSame(VL)) {
7082 auto It = MultiNodeScalars.find(S.OpValue);
7083 if (It != MultiNodeScalars.end()) {
7084 auto *TEIt = find_if(It->getSecond(),
7085 [&](TreeEntry *ME) { return ME->isSame(VL); });
7086 if (TEIt != It->getSecond().end())
7087 E = *TEIt;
7088 else
7089 E = nullptr;
7090 } else {
7091 E = nullptr;
7092 }
7093 }
7094 if (!E) {
7095 if (!doesNotNeedToBeScheduled(S.OpValue)) {
7096 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
7097 if (TryToFindDuplicates(S))
7098 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7099 ReuseShuffleIndices);
7100 return;
7101 }
7102 SmallPtrSet<const TreeEntry *, 4> Nodes;
7103 Nodes.insert(getTreeEntry(S.OpValue));
7104 for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
7105 Nodes.insert(E);
7106 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
7107 if (any_of(Nodes, [&](const TreeEntry *E) {
7108 return all_of(E->Scalars,
7109 [&](Value *V) { return Values.contains(V); });
7110 })) {
7111 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
7112 if (TryToFindDuplicates(S))
7113 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7114 ReuseShuffleIndices);
7115 return;
7116 }
7117 } else {
7118 // Record the reuse of the tree node. FIXME, currently this is only used
7119 // to properly draw the graph rather than for the actual vectorization.
7120 E->UserTreeIndices.push_back(UserTreeIdx);
7121 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
7122 << ".\n");
7123 return;
7124 }
7125 }
7126
7127 // Check that none of the instructions in the bundle are already in the tree.
7128 for (Value *V : VL) {
7129 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
7130 doesNotNeedToBeScheduled(V))
7131 continue;
7132 if (getTreeEntry(V)) {
7133 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
7134 << ") is already in tree.\n");
7135 if (TryToFindDuplicates(S))
7136 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7137 ReuseShuffleIndices);
7138 return;
7139 }
7140 }
7141
7142 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
7143 if (UserIgnoreList && !UserIgnoreList->empty()) {
7144 for (Value *V : VL) {
7145 if (UserIgnoreList && UserIgnoreList->contains(V)) {
7146 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
7147 if (TryToFindDuplicates(S))
7148 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7149 ReuseShuffleIndices);
7150 return;
7151 }
7152 }
7153 }
7154
7155 // Special processing for sorted pointers for ScatterVectorize node with
7156 // constant indices only.
7157 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
7158 assert(S.OpValue->getType()->isPointerTy() &&
7159 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
7160 "Expected pointers only.");
7161 // Reset S to make it GetElementPtr kind of node.
7162 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
7163 assert(It != VL.end() && "Expected at least one GEP.");
7164 S = getSameOpcode(*It, *TLI);
7165 }
7166
7167 // Check that all of the users of the scalars that we want to vectorize are
7168 // schedulable.
7169 auto *VL0 = cast<Instruction>(S.OpValue);
7170 BB = VL0->getParent();
7171
7172 if (!DT->isReachableFromEntry(BB)) {
7173 // Don't go into unreachable blocks. They may contain instructions with
7174 // dependency cycles which confuse the final scheduling.
7175 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
7176 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7177 return;
7178 }
7179
7180 // Don't go into catchswitch blocks, which can happen with PHIs.
7181 // Such blocks can only have PHIs and the catchswitch. There is no
7182 // place to insert a shuffle if we need to, so just avoid that issue.
7183 if (isa<CatchSwitchInst>(BB->getTerminator())) {
7184 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7185 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7186 return;
7187 }
7188
7189 // Check that every instruction appears once in this bundle.
7190 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7191 return;
7192
7193 // Perform specific checks for each particular instruction kind.
7194 OrdersType CurrentOrder;
7195 SmallVector<Value *> PointerOps;
7196 TreeEntry::EntryState State = getScalarsVectorizationState(
7197 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7198 if (State == TreeEntry::NeedToGather) {
7199 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7200 ReuseShuffleIndices);
7201 return;
7202 }
7203
7204 auto &BSRef = BlocksSchedules[BB];
7205 if (!BSRef)
7206 BSRef = std::make_unique<BlockScheduling>(BB);
7207
7208 BlockScheduling &BS = *BSRef;
7209
7210 std::optional<ScheduleData *> Bundle =
7211 BS.tryScheduleBundle(UniqueValues, this, S);
7212#ifdef EXPENSIVE_CHECKS
7213 // Make sure we didn't break any internal invariants
7214 BS.verify();
7215#endif
7216 if (!Bundle) {
7217 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7218 assert((!BS.getScheduleData(VL0) ||
7219 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7220 "tryScheduleBundle should cancelScheduling on failure");
7221 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7222 ReuseShuffleIndices);
7223 NonScheduledFirst.insert(VL.front());
7224 return;
7225 }
7226 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7227
7228 unsigned ShuffleOrOp = S.isAltShuffle() ?
7229 (unsigned) Instruction::ShuffleVector : S.getOpcode();
7230 switch (ShuffleOrOp) {
7231 case Instruction::PHI: {
7232 auto *PH = cast<PHINode>(VL0);
7233
7234 TreeEntry *TE =
7235 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7236 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7237
7238 // Keeps the reordered operands to avoid code duplication.
7239 PHIHandler Handler(*DT, PH, VL);
7240 Handler.buildOperands();
7241 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7242 TE->setOperand(I, Handler.getOperands(I));
7243 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7244 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
7245 return;
7246 }
7247 case Instruction::ExtractValue:
7248 case Instruction::ExtractElement: {
7249 if (CurrentOrder.empty()) {
7250 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7251 } else {
7252 LLVM_DEBUG({
7253 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7254 "with order";
7255 for (unsigned Idx : CurrentOrder)
7256 dbgs() << " " << Idx;
7257 dbgs() << "\n";
7258 });
7259 fixupOrderingIndices(CurrentOrder);
7260 }
7261 // Insert new order with initial value 0, if it does not exist,
7262 // otherwise return the iterator to the existing one.
7263 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7264 ReuseShuffleIndices, CurrentOrder);
7265 // This is a special case, as it does not gather, but at the same time
7266 // we are not extending buildTree_rec() towards the operands.
7267 ValueList Op0;
7268 Op0.assign(VL.size(), VL0->getOperand(0));
7269 VectorizableTree.back()->setOperand(0, Op0);
7270 return;
7271 }
7272 case Instruction::InsertElement: {
7273 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7274
7275 auto OrdCompare = [](const std::pair<int, int> &P1,
7276 const std::pair<int, int> &P2) {
7277 return P1.first > P2.first;
7278 };
7279 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7280 decltype(OrdCompare)>
7281 Indices(OrdCompare);
7282 for (int I = 0, E = VL.size(); I < E; ++I) {
7283 unsigned Idx = *getElementIndex(VL[I]);
7284 Indices.emplace(Idx, I);
7285 }
7286 OrdersType CurrentOrder(VL.size(), VL.size());
7287 bool IsIdentity = true;
7288 for (int I = 0, E = VL.size(); I < E; ++I) {
7289 CurrentOrder[Indices.top().second] = I;
7290 IsIdentity &= Indices.top().second == I;
7291 Indices.pop();
7292 }
7293 if (IsIdentity)
7294 CurrentOrder.clear();
7295 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7296 std::nullopt, CurrentOrder);
7297 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7298
7299 TE->setOperandsInOrder();
7300 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7301 return;
7302 }
7303 case Instruction::Load: {
7304 // Check that a vectorized load would load the same memory as a scalar
7305 // load. For example, we don't want to vectorize loads that are smaller
7306 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7307 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7308 // from such a struct, we read/write packed bits disagreeing with the
7309 // unvectorized version.
7310 TreeEntry *TE = nullptr;
7311 fixupOrderingIndices(CurrentOrder);
7312 switch (State) {
7313 case TreeEntry::Vectorize:
7314 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7315 ReuseShuffleIndices, CurrentOrder);
7316 if (CurrentOrder.empty())
7317 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7318 else
7319 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7320 TE->setOperandsInOrder();
7321 break;
7322 case TreeEntry::StridedVectorize:
7323 // Vectorizing non-consecutive loads with a constant stride.
7324 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7325 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7326 TE->setOperandsInOrder();
7327 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7328 break;
7329 case TreeEntry::ScatterVectorize:
7330 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7331 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7332 UserTreeIdx, ReuseShuffleIndices);
7333 TE->setOperandsInOrder();
7334 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7335 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7336 break;
7337 case TreeEntry::CombinedVectorize:
7338 case TreeEntry::NeedToGather:
7339 llvm_unreachable("Unexpected loads state.");
7340 }
7341 return;
7342 }
7343 case Instruction::ZExt:
7344 case Instruction::SExt:
7345 case Instruction::FPToUI:
7346 case Instruction::FPToSI:
7347 case Instruction::FPExt:
7348 case Instruction::PtrToInt:
7349 case Instruction::IntToPtr:
7350 case Instruction::SIToFP:
7351 case Instruction::UIToFP:
7352 case Instruction::Trunc:
7353 case Instruction::FPTrunc:
7354 case Instruction::BitCast: {
7355 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7356 std::make_pair(std::numeric_limits<unsigned>::min(),
7357 std::numeric_limits<unsigned>::max()));
7358 if (ShuffleOrOp == Instruction::ZExt ||
7359 ShuffleOrOp == Instruction::SExt) {
7360 CastMaxMinBWSizes = std::make_pair(
7361 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7362 PrevMaxBW),
7363 std::min<unsigned>(
7364 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7365 PrevMinBW));
7366 } else if (ShuffleOrOp == Instruction::Trunc) {
7367 CastMaxMinBWSizes = std::make_pair(
7368 std::max<unsigned>(
7369 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7370 PrevMaxBW),
7371 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7372 PrevMinBW));
7373 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7374 } else if (ShuffleOrOp == Instruction::SIToFP ||
7375 ShuffleOrOp == Instruction::UIToFP) {
7376 unsigned NumSignBits =
7377 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7378 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7379 APInt Mask = DB->getDemandedBits(OpI);
7380 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7381 }
7382 if (NumSignBits * 2 >=
7383 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7384 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7385 }
7386 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7387 ReuseShuffleIndices);
7388 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7389
7390 TE->setOperandsInOrder();
7391 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7392 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7393 return;
7394 }
7395 case Instruction::ICmp:
7396 case Instruction::FCmp: {
7397 // Check that all of the compares have the same predicate.
7398 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7399 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7400 ReuseShuffleIndices);
7401 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7402
7403 ValueList Left, Right;
7404 if (cast<CmpInst>(VL0)->isCommutative()) {
7405 // Commutative predicate - collect + sort operands of the instructions
7406 // so that each side is more likely to have the same opcode.
7407 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7408 "Commutative Predicate mismatch");
7409 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7410 } else {
7411 // Collect operands - commute if it uses the swapped predicate.
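// Illustrative note (not in the original source): if P0 is "icmp slt" and a
// lane uses the swapped predicate "icmp sgt", its operands are exchanged
// here so that every lane evaluates the same "slt" predicate.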
7412 for (Value *V : VL) {
7413 auto *Cmp = cast<CmpInst>(V);
7414 Value *LHS = Cmp->getOperand(0);
7415 Value *RHS = Cmp->getOperand(1);
7416 if (Cmp->getPredicate() != P0)
7417 std::swap(LHS, RHS);
7418 Left.push_back(LHS);
7419 Right.push_back(RHS);
7420 }
7421 }
7422 TE->setOperand(0, Left);
7423 TE->setOperand(1, Right);
7424 buildTree_rec(Left, Depth + 1, {TE, 0});
7425 buildTree_rec(Right, Depth + 1, {TE, 1});
7426 if (ShuffleOrOp == Instruction::ICmp) {
7427 unsigned NumSignBits0 =
7428 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7429 if (NumSignBits0 * 2 >=
7430 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7431 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7432 unsigned NumSignBits1 =
7433 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7434 if (NumSignBits1 * 2 >=
7435 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7436 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7437 }
7438 return;
7439 }
7440 case Instruction::Select:
7441 case Instruction::FNeg:
7442 case Instruction::Add:
7443 case Instruction::FAdd:
7444 case Instruction::Sub:
7445 case Instruction::FSub:
7446 case Instruction::Mul:
7447 case Instruction::FMul:
7448 case Instruction::UDiv:
7449 case Instruction::SDiv:
7450 case Instruction::FDiv:
7451 case Instruction::URem:
7452 case Instruction::SRem:
7453 case Instruction::FRem:
7454 case Instruction::Shl:
7455 case Instruction::LShr:
7456 case Instruction::AShr:
7457 case Instruction::And:
7458 case Instruction::Or:
7459 case Instruction::Xor:
7460 case Instruction::Freeze: {
7461 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7462 ReuseShuffleIndices);
7463 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7464
7465 // Sort operands of the instructions so that each side is more likely to
7466 // have the same opcode.
7467 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7468 ValueList Left, Right;
7469 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7470 TE->setOperand(0, Left);
7471 TE->setOperand(1, Right);
7472 buildTree_rec(Left, Depth + 1, {TE, 0});
7473 buildTree_rec(Right, Depth + 1, {TE, 1});
7474 return;
7475 }
7476
7477 TE->setOperandsInOrder();
7478 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7479 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7480 return;
7481 }
7482 case Instruction::GetElementPtr: {
7483 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7484 ReuseShuffleIndices);
7485 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7486 SmallVector<ValueList, 2> Operands(2);
7487 // Prepare the operand vector for pointer operands.
7488 for (Value *V : VL) {
7489 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7490 if (!GEP) {
7491 Operands.front().push_back(V);
7492 continue;
7493 }
7494 Operands.front().push_back(GEP->getPointerOperand());
7495 }
7496 TE->setOperand(0, Operands.front());
7497 // Need to cast all indices to the same type before vectorization to
7498 // avoid crash.
7499 // Required to be able to find correct matches between different gather
7500 // nodes and reuse the vectorized values rather than trying to gather them
7501 // again.
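// Illustrative note (not in the original source): if some GEPs index with
// i32 constants and others with i64, the common type chosen below is the
// DataLayout index type of the pointer, and the constant indices are folded
// to that type.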
7502 int IndexIdx = 1;
7503 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7504 Type *Ty = all_of(VL,
7505 [VL0Ty, IndexIdx](Value *V) {
7506 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7507 if (!GEP)
7508 return true;
7509 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7510 })
7511 ? VL0Ty
7512 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7513 ->getPointerOperandType()
7514 ->getScalarType());
7515 // Prepare the operand vector.
7516 for (Value *V : VL) {
7517 auto *I = dyn_cast<GetElementPtrInst>(V);
7518 if (!I) {
7519 Operands.back().push_back(
7520 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7521 continue;
7522 }
7523 auto *Op = I->getOperand(IndexIdx);
7524 auto *CI = dyn_cast<ConstantInt>(Op);
7525 if (!CI)
7526 Operands.back().push_back(Op);
7527 else
7528 Operands.back().push_back(ConstantFoldIntegerCast(
7529 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7530 }
7531 TE->setOperand(IndexIdx, Operands.back());
7532
7533 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7534 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7535 return;
7536 }
7537 case Instruction::Store: {
7538 bool Consecutive = CurrentOrder.empty();
7539 if (!Consecutive)
7540 fixupOrderingIndices(CurrentOrder);
7541 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7542 ReuseShuffleIndices, CurrentOrder);
7543 TE->setOperandsInOrder();
7544 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7545 if (Consecutive)
7546 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7547 else
7548 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7549 return;
7550 }
7551 case Instruction::Call: {
7552 // Check if the calls are all to the same vectorizable intrinsic or
7553 // library function.
7554 CallInst *CI = cast<CallInst>(VL0);
7555 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7556
7557 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7558 ReuseShuffleIndices);
7559 // Sort operands of the instructions so that each side is more likely to
7560 // have the same opcode.
7561 if (isCommutative(VL0)) {
7562 ValueList Left, Right;
7563 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7564 TE->setOperand(0, Left);
7565 TE->setOperand(1, Right);
7566 SmallVector<ValueList> Operands;
7567 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7568 Operands.emplace_back();
7569 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7570 continue;
7571 for (Value *V : VL) {
7572 auto *CI2 = cast<CallInst>(V);
7573 Operands.back().push_back(CI2->getArgOperand(I));
7574 }
7575 TE->setOperand(I, Operands.back());
7576 }
7577 buildTree_rec(Left, Depth + 1, {TE, 0});
7578 buildTree_rec(Right, Depth + 1, {TE, 1});
7579 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7580 if (Operands[I - 2].empty())
7581 continue;
7582 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7583 }
7584 return;
7585 }
7586 TE->setOperandsInOrder();
7587 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7588 // For scalar operands there is no need to create an entry since they do
7589 // not need to be vectorized.
7590 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7591 continue;
7592 ValueList Operands;
7593 // Prepare the operand vector.
7594 for (Value *V : VL) {
7595 auto *CI2 = cast<CallInst>(V);
7596 Operands.push_back(CI2->getArgOperand(I));
7597 }
7598 buildTree_rec(Operands, Depth + 1, {TE, I});
7599 }
7600 return;
7601 }
7602 case Instruction::ShuffleVector: {
7603 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7604 ReuseShuffleIndices);
7605 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7606
7607 // Reorder operands if reordering would enable vectorization.
7608 auto *CI = dyn_cast<CmpInst>(VL0);
7609 if (isa<BinaryOperator>(VL0) || CI) {
7610 ValueList Left, Right;
7611 if (!CI || all_of(VL, [](Value *V) {
7612 return cast<CmpInst>(V)->isCommutative();
7613 })) {
7614 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7615 } else {
7616 auto *MainCI = cast<CmpInst>(S.MainOp);
7617 auto *AltCI = cast<CmpInst>(S.AltOp);
7618 CmpInst::Predicate MainP = MainCI->getPredicate();
7619 CmpInst::Predicate AltP = AltCI->getPredicate();
7620 assert(MainP != AltP &&
7621 "Expected different main/alternate predicates.");
7622 // Collect operands - commute if it uses the swapped predicate or
7623 // alternate operation.
7624 for (Value *V : VL) {
7625 auto *Cmp = cast<CmpInst>(V);
7626 Value *LHS = Cmp->getOperand(0);
7627 Value *RHS = Cmp->getOperand(1);
7628
7629 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7630 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7631 std::swap(LHS, RHS);
7632 } else {
7633 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7634 std::swap(LHS, RHS);
7635 }
7636 Left.push_back(LHS);
7637 Right.push_back(RHS);
7638 }
7639 }
7640 TE->setOperand(0, Left);
7641 TE->setOperand(1, Right);
7642 buildTree_rec(Left, Depth + 1, {TE, 0});
7643 buildTree_rec(Right, Depth + 1, {TE, 1});
7644 return;
7645 }
7646
7647 TE->setOperandsInOrder();
7648 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7649 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7650 return;
7651 }
7652 default:
7653 break;
7654 }
7655 llvm_unreachable("Unexpected vectorization of the instructions.");
7656}
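// The commutative binary-operator and call cases above reorder per-lane
// operands so that each side of the bundle (Left/Right) is more likely to
// contain the same opcode, which helps the recursive buildTree_rec calls
// vectorize. A standalone sketch of that idea (plain C++, invented names; not
// the pass's reorderInputsAccordingToOpcode):
namespace slp_operand_reorder_sketch {
// One lane of a commutative bundle: the opcodes feeding its two operands.
struct LaneOps {
  unsigned LHSOpcode, RHSOpcode;
};
// Swap operands lane-wise so that, where possible, the opcode feeding lane 0's
// left operand also appears on the left in every other lane.
inline void groupSameOpcodeLeft(LaneOps *Lanes, unsigned NumLanes) {
  if (NumLanes == 0)
    return;
  const unsigned Leading = Lanes[0].LHSOpcode;
  for (unsigned I = 1; I < NumLanes; ++I)
    if (Lanes[I].LHSOpcode != Leading && Lanes[I].RHSOpcode == Leading) {
      // Commutative operation, so swapping the operands is always legal.
      unsigned Tmp = Lanes[I].LHSOpcode;
      Lanes[I].LHSOpcode = Lanes[I].RHSOpcode;
      Lanes[I].RHSOpcode = Tmp;
    }
}
} // namespace slp_operand_reorder_sketch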
7657
7658unsigned BoUpSLP::canMapToVector(Type *T) const {
7659 unsigned N = 1;
7660 Type *EltTy = T;
7661
7662 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7663 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7664 // Check that struct is homogeneous.
7665 for (const auto *Ty : ST->elements())
7666 if (Ty != *ST->element_begin())
7667 return 0;
7668 N *= ST->getNumElements();
7669 EltTy = *ST->element_begin();
7670 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7671 N *= AT->getNumElements();
7672 EltTy = AT->getElementType();
7673 } else {
7674 auto *VT = cast<FixedVectorType>(EltTy);
7675 N *= VT->getNumElements();
7676 EltTy = VT->getElementType();
7677 }
7678 }
7679
7680 if (!isValidElementType(EltTy))
7681 return 0;
7682 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
7683 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7684 VTSize != DL->getTypeStoreSizeInBits(T))
7685 return 0;
7686 return N;
7687}
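// Illustration of the flattening performed by canMapToVector above: a
// homogeneous struct/array/vector nest maps to N elements of its scalar
// element type, and the result is accepted only if the widened type fits the
// vector register limits. A standalone sketch with an invented shape
// descriptor (not an LLVM Type):
namespace slp_flatten_sketch {
// One aggregate level: Count sub-elements, each of shape *Inner; a null Inner
// marks the scalar leaf. E.g. [2 x <4 x float>] is {2, &{4, nullptr}}.
struct AggLevel {
  unsigned Count;
  const AggLevel *Inner;
};
// Mirrors the loop in canMapToVector: multiply the counts of all levels.
inline unsigned flattenedEltCount(const AggLevel *Shape) {
  unsigned N = 1;
  for (const AggLevel *L = Shape; L; L = L->Inner)
    N *= L->Count;
  return N; // For [2 x <4 x float>] this yields 8 scalar elements.
}
} // namespace slp_flatten_sketch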
7688
7689bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7690 SmallVectorImpl<unsigned> &CurrentOrder,
7691 bool ResizeAllowed) const {
7692 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7693 assert(It != VL.end() && "Expected at least one extract instruction.");
7694 auto *E0 = cast<Instruction>(*It);
7695 assert(
7696 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7697 "Invalid opcode");
7698 // Check if all of the extracts come from the same vector and from the
7699 // correct offset.
7700 Value *Vec = E0->getOperand(0);
7701
7702 CurrentOrder.clear();
7703
7704 // We have to extract from a vector/aggregate with the same number of elements.
7705 unsigned NElts;
7706 if (E0->getOpcode() == Instruction::ExtractValue) {
7707 NElts = canMapToVector(Vec->getType());
7708 if (!NElts)
7709 return false;
7710 // Check if load can be rewritten as load of vector.
7711 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7712 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7713 return false;
7714 } else {
7715 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7716 }
7717
7718 unsigned E = VL.size();
7719 if (!ResizeAllowed && NElts != E)
7720 return false;
7721 SmallVector<int> Indices(E, PoisonMaskElem);
7722 unsigned MinIdx = NElts, MaxIdx = 0;
7723 for (auto [I, V] : enumerate(VL)) {
7724 auto *Inst = dyn_cast<Instruction>(V);
7725 if (!Inst)
7726 continue;
7727 if (Inst->getOperand(0) != Vec)
7728 return false;
7729 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7730 if (isa<UndefValue>(EE->getIndexOperand()))
7731 continue;
7732 std::optional<unsigned> Idx = getExtractIndex(Inst);
7733 if (!Idx)
7734 return false;
7735 const unsigned ExtIdx = *Idx;
7736 if (ExtIdx >= NElts)
7737 continue;
7738 Indices[I] = ExtIdx;
7739 if (MinIdx > ExtIdx)
7740 MinIdx = ExtIdx;
7741 if (MaxIdx < ExtIdx)
7742 MaxIdx = ExtIdx;
7743 }
7744 if (MaxIdx - MinIdx + 1 > E)
7745 return false;
7746 if (MaxIdx + 1 <= E)
7747 MinIdx = 0;
7748
7749 // Check that all of the indices extract from the correct offset.
7750 bool ShouldKeepOrder = true;
7751 // Assign to all items the initial value E so we can check if the extract
7752 // instruction index was used already.
7753 // Also, later we can check that all the indices are used and we have a
7754 // consecutive access in the extract instructions, by checking that no
7755 // element of CurrentOrder still has value E.
7756 CurrentOrder.assign(E, E);
7757 for (unsigned I = 0; I < E; ++I) {
7758 if (Indices[I] == PoisonMaskElem)
7759 continue;
7760 const unsigned ExtIdx = Indices[I] - MinIdx;
7761 if (CurrentOrder[ExtIdx] != E) {
7762 CurrentOrder.clear();
7763 return false;
7764 }
7765 ShouldKeepOrder &= ExtIdx == I;
7766 CurrentOrder[ExtIdx] = I;
7767 }
7768 if (ShouldKeepOrder)
7769 CurrentOrder.clear();
7770
7771 return ShouldKeepOrder;
7772}
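// The index bookkeeping above, reduced to plain integers (standalone sketch,
// not the pass's implementation; it assumes every lane has a valid, distinct
// extract index): the used indices must fit a window no wider than the
// bundle, and no reordering is needed exactly when each extract already sits
// in its source position.
namespace slp_extract_order_sketch {
// Returns true when Indices[I] - MinIdx == I for all I, i.e. the extracts are
// consecutive and already in source order. E.g. {4, 5, 6, 7} is in order once
// rebased to MinIdx == 4, while {2, 3, 0, 1} fits the window but needs a
// reordering mask.
inline bool extractsAreInOrder(const unsigned *Indices, unsigned E) {
  assert(E > 0 && "expected at least one extract index");
  unsigned MinIdx = Indices[0], MaxIdx = Indices[0];
  for (unsigned I = 1; I < E; ++I) {
    MinIdx = std::min(MinIdx, Indices[I]);
    MaxIdx = std::max(MaxIdx, Indices[I]);
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false; // The used lanes do not fit a contiguous window.
  for (unsigned I = 0; I < E; ++I)
    if (Indices[I] - MinIdx != I)
      return false; // Inside the window but permuted: a reorder is required.
  return true;
}
} // namespace slp_extract_order_sketch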
7773
7774bool BoUpSLP::areAllUsersVectorized(
7775 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7776 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7777 all_of(I->users(), [this](User *U) {
7778 return ScalarToTreeEntry.contains(U) ||
7779 isVectorLikeInstWithConstOps(U) ||
7780 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7781 });
7782}
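// A minimal sketch of the decision above (invented data layout, not the
// pass's data structures): a scalar needs no extractelement for external
// users only if every one of its users is itself part of the vectorized tree.
namespace slp_external_use_sketch {
inline bool needsExtractForExternalUsers(const bool *UserIsInTree,
                                         unsigned NumUsers) {
  for (unsigned I = 0; I < NumUsers; ++I)
    if (!UserIsInTree[I])
      return true; // At least one external user keeps the scalar alive.
  return false;
}
} // namespace slp_external_use_sketch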
7783
7784static std::pair<InstructionCost, InstructionCost>
7787 ArrayRef<Type *> ArgTys) {
7789
7790 // Calculate the cost of the scalar and vector calls.
7791 FastMathFlags FMF;
7792 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7793 FMF = FPCI->getFastMathFlags();
7795 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7796 dyn_cast<IntrinsicInst>(CI));
7797 auto IntrinsicCost =
7799
7800 auto Shape = VFShape::get(CI->getFunctionType(),
7802 false /*HasGlobalPred*/);
7803 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7804 auto LibCost = IntrinsicCost;
7805 if (!CI->isNoBuiltin() && VecFunc) {
7806 // Calculate the cost of the vector library call.
7807 // If the corresponding vector call is cheaper, return its cost.
7808 LibCost =
7809 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7810 }
7811 return {IntrinsicCost, LibCost};
7812}
7813
7814void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7815 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7816 SmallVectorImpl<Value *> *OpScalars,
7817 SmallVectorImpl<Value *> *AltScalars) const {
7818 unsigned Sz = Scalars.size();
7819 Mask.assign(Sz, PoisonMaskElem);
7820 SmallVector<int> OrderMask;
7821 if (!ReorderIndices.empty())
7822 inversePermutation(ReorderIndices, OrderMask);
7823 for (unsigned I = 0; I < Sz; ++I) {
7824 unsigned Idx = I;
7825 if (!ReorderIndices.empty())
7826 Idx = OrderMask[I];
7827 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7828 if (IsAltOp(OpInst)) {
7829 Mask[I] = Sz + Idx;
7830 if (AltScalars)
7831 AltScalars->push_back(OpInst);
7832 } else {
7833 Mask[I] = Idx;
7834 if (OpScalars)
7835 OpScalars->push_back(OpInst);
7836 }
7837 }
7838 if (!ReuseShuffleIndices.empty()) {
7839 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7840 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7841 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7842 });
7843 Mask.swap(NewMask);
7844 }
7845}
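// A worked example of the mask built above (illustration only, ignoring the
// reorder and reuse indices that the real code also folds in): for an
// alternating node with Scalars = {a0+b0, a1-b1, a2+b2, a3-b3}, IsAltOp
// selects the subtractions, so lanes take elements from either the "main"
// vector (indices 0..Sz-1) or the "alternate" vector (indices Sz..2*Sz-1).
namespace slp_altop_mask_sketch {
// Lane I reads element I of the main vector when IsAlt[I] is false, and
// element Sz + I of the alternate vector otherwise. For {add, sub, add, sub}
// this produces the mask {0, 5, 2, 7}.
inline void buildAltMask(const bool *IsAlt, unsigned Sz, int *Mask) {
  for (unsigned I = 0; I < Sz; ++I)
    Mask[I] = IsAlt[I] ? static_cast<int>(Sz + I) : static_cast<int>(I);
}
} // namespace slp_altop_mask_sketch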
7846
7847static bool isAlternateInstruction(const Instruction *I,
7848 const Instruction *MainOp,
7849 const Instruction *AltOp,
7850 const TargetLibraryInfo &TLI) {
7851 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7852 auto *AltCI = cast<CmpInst>(AltOp);
7853 CmpInst::Predicate MainP = MainCI->getPredicate();
7854 CmpInst::Predicate AltP = AltCI->getPredicate();
7855 assert(MainP != AltP && "Expected different main/alternate predicates.");
7856 auto *CI = cast<CmpInst>(I);
7857 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7858 return false;
7859 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7860 return true;
7861 CmpInst::Predicate P = CI->getPredicate();
7862 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7863
7864 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7865 "CmpInst expected to match either main or alternate predicate or "
7866 "their swap.");
7867 (void)AltP;
7868 return MainP != P && MainP != SwappedP;
7869 }
7870 return I->getOpcode() == AltOp->getOpcode();
7871}
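// A concrete illustration of the final predicate check above (assumed
// example, with an invented mini predicate set instead of CmpInst
// predicates): with MainP == slt, an icmp whose predicate is neither slt nor
// its swapped form sgt is classified as the alternate operation.
namespace slp_cmp_alt_sketch {
enum class Pred { SLT, SGT, SLE, SGE };
inline Pred swapped(Pred P) {
  switch (P) {
  case Pred::SLT:
    return Pred::SGT;
  case Pred::SGT:
    return Pred::SLT;
  case Pred::SLE:
    return Pred::SGE;
  case Pred::SGE:
    return Pred::SLE;
  }
  return P;
}
// Mirrors "MainP != P && MainP != SwappedP" from the function above.
inline bool isAltPredicate(Pred P, Pred MainP) {
  return P != MainP && swapped(P) != MainP;
}
} // namespace slp_cmp_alt_sketch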
7872
7873TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7874 assert(!Ops.empty());
7875 const auto *Op0 = Ops.front();
7876
7877 const bool IsConstant = all_of(Ops, [](Value *V) {
7878 // TODO: We should allow undef elements here
7879 return isConstant(V) && !isa<UndefValue>(V);
7880 });
7881 const bool IsUniform = all_of(Ops, [=](Value *V) {
7882 // TODO: We should allow undef elements here
7883 return V == Op0;
7884 });
7885 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7886 // TODO: We should allow undef elements here
7887 if (auto *CI = dyn_cast<ConstantInt>(V))
7888 return CI->getValue().isPowerOf2();
7889 return false;
7890 });
7891 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7892 // TODO: We should allow undef elements here
7893 if (auto *CI = dyn_cast<ConstantInt>(V))
7894 return CI->getValue().isNegatedPowerOf2();
7895 return false;
7896 });
7897
7898 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7899 if (IsConstant && IsUniform)
7900 VK = TTI::OK_UniformConstantValue;
7901 else if (IsConstant)
7902 VK = TTI::OK_NonUniformConstantValue;
7903 else if (IsUniform)
7904 VK = TTI::OK_UniformValue;
7905
7906 TTI::OperandValueProperties VP = TTI::OP_None;
7907 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7908 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7909
7910 return {VK, VP};
7911}
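// A standalone sketch of the operand classification above (plain integers and
// invented enum names; not the TTI types): identical constants across lanes
// are "uniform constant", differing constants are "non-uniform constant",
// identical non-constants are "uniform", and a power-of-two property can be
// attached when every lane is a power of two.
namespace slp_operand_info_sketch {
enum class Kind { Any, UniformConstant, NonUniformConstant, Uniform };
// IsConstant[I] says whether lane I holds a constant; Vals[I] is its value
// (only meaningful for the uniformity check).
inline Kind classify(const long long *Vals, const bool *IsConstant,
                     unsigned N) {
  bool AllConst = true, AllSame = true;
  for (unsigned I = 0; I < N; ++I) {
    AllConst = AllConst && IsConstant[I];
    AllSame = AllSame && Vals[I] == Vals[0];
  }
  if (AllConst && AllSame)
    return Kind::UniformConstant;
  if (AllConst)
    return Kind::NonUniformConstant;
  if (AllSame)
    return Kind::Uniform;
  return Kind::Any;
}
// True when every lane is a positive power of two, e.g. {4, 4, 8, 16}.
inline bool allPowerOfTwo(const unsigned long long *Vals, unsigned N) {
  for (unsigned I = 0; I < N; ++I)
    if (Vals[I] == 0 || (Vals[I] & (Vals[I] - 1)) != 0)
      return false;
  return true;
}
} // namespace slp_operand_info_sketch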
7912
7913namespace {
7914/// The base class for shuffle instruction emission and shuffle cost estimation.
7915class BaseShuffleAnalysis {
7916protected:
7917 Type *ScalarTy = nullptr;
7918
7919 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
7920
7921 /// V is expected to be a vectorized value.
7922 /// When REVEC is disabled, there is no difference between VF and
7923 /// VNumElements.
7924 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
7925 /// e.g., if ScalarTy is <4 x Ty> and V is <8 x Ty>, 2 is returned instead
7926 /// of 8.
7927 unsigned getVF(Value *V) const {
7928 assert(V && "V cannot be nullptr");
7929 assert(isa<FixedVectorType>(V->getType()) &&
7930 "V does not have FixedVectorType");
7931 assert(ScalarTy && "ScalarTy cannot be nullptr");
7932 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
7933 unsigned VNumElements =
7934 cast<FixedVectorType>(V->getType())->getNumElements();
7935 assert(VNumElements > ScalarTyNumElements &&
7936 "the number of elements of V is not large enough");
7937 assert(VNumElements % ScalarTyNumElements == 0 &&
7938 "the number of elements of V is not a vectorized value");
7939 return VNumElements / ScalarTyNumElements;
7940 }
7941
7942 /// Checks if the mask is an identity mask.
7943 /// \param IsStrict if true, the function returns false if the mask size
7944 /// does not match the vector size.
7945 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7946 bool IsStrict) {
7947 int Limit = Mask.size();
7948 int VF = VecTy->getNumElements();
7949 int Index = -1;
7950 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7951 return true;
7952 if (!IsStrict) {
7953 // Consider extract subvector starting from index 0.
7954 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7955 Index == 0)
7956 return true;
7957 // All VF-size submasks are identity (e.g.
7958 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7959 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7960 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7961 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7962 ShuffleVectorInst::isIdentityMask(Slice, VF);
7963 }))
7964 return true;
7965 }
7966 return false;
7967 }
7968
7969 /// Tries to combine 2 different masks into a single one.
7970 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7971 /// change the size of the vector, \p LocalVF is the original size of the
7972 /// shuffled vector.
7973 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7974 ArrayRef<int> ExtMask) {
7975 unsigned VF = Mask.size();
7976 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7977 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7978 if (ExtMask[I] == PoisonMaskElem)
7979 continue;
7980 int MaskedIdx = Mask[ExtMask[I] % VF];
7981 NewMask[I] =
7982 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7983 }
7984 Mask.swap(NewMask);
7985 }
7986
7987 /// Looks through shuffles trying to reduce the final number of shuffles in
7988 /// the code. The function looks through the previously emitted shuffle
7989 /// instructions and properly marks indices in the mask as undef.
7990 /// For example, given the code
7991 /// \code
7992 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7993 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7994 /// \endcode
7995 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7996 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7997 /// <0, 1, 2, 3> for the shuffle.
7998 /// If 2 operands are of different size, the smallest one will be resized and
7999 /// the mask recalculated properly.
8000 /// For example, given the code
8001 /// \code
8002 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
8003 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
8004 /// \endcode
8005 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
8006 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
8007 /// <0, 1, 2, 3> for the shuffle.
8008 /// So, it tries to transform permutations to simple vector merge, if
8009 /// possible.
8010 /// \param V The input vector which must be shuffled using the given \p Mask.
8011 /// If the better candidate is found, \p V is set to this best candidate
8012 /// vector.
8013 /// \param Mask The input mask for the shuffle. If the best candidate is found
8014 /// during looking-through-shuffles attempt, it is updated accordingly.
8015 /// \param SinglePermute true if the shuffle operation is originally a
8016 /// single-value-permutation. In this case the look-through-shuffles procedure
8017 /// may look for resizing shuffles as the best candidates.
8018 /// \return true if the shuffle results in the non-resizing identity shuffle
8019 /// (and thus can be ignored), false - otherwise.
8020 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
8021 bool SinglePermute) {
8022 Value *Op = V;
8023 ShuffleVectorInst *IdentityOp = nullptr;
8024 SmallVector<int> IdentityMask;
8025 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
8026 // Exit if not a fixed vector type or changing size shuffle.
8027 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
8028 if (!SVTy)
8029 break;
8030 // Remember the identity or broadcast mask, if it is not a resizing
8031 // shuffle. If no better candidates are found, this Op and Mask will be
8032 // used in the final shuffle.
8033 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
8034 if (!IdentityOp || !SinglePermute ||
8035 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
8036 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
8037 IdentityMask.size()))) {
8038 IdentityOp = SV;
8039 // Store the current mask in IdentityMask so we do not lose this info
8040 // later if IdentityOp is selected as the best candidate for the
8041 // permutation.
8042 IdentityMask.assign(Mask);
8043 }
8044 }
8045 // Remember the broadcast mask. If no better candidates are found, this Op
8046 // and Mask will be used in the final shuffle.
8047 // Zero splat can be used as identity too, since it might be used with
8048 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
8049 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
8050 // is expensive, and the analysis finds out that the source vector is just
8051 // a broadcast, the original mask can be transformed to the identity mask
8052 // <0, 1, 2, 3>.
8053 // \code
8054 // %0 = shuffle %v, poison, zeroinitalizer
8055 // %res = shuffle %0, poison, <3, 1, 2, 0>
8056 // \endcode
8057 // may be transformed to
8058 // \code
8059 // %0 = shuffle %v, poison, zeroinitalizer
8060 // %res = shuffle %0, poison, <0, 1, 2, 3>
8061 // \endcode
8062 if (SV->isZeroEltSplat()) {
8063 IdentityOp = SV;
8064 IdentityMask.assign(Mask);
8065 }
8066 int LocalVF = Mask.size();
8067 if (auto *SVOpTy =
8068 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
8069 LocalVF = SVOpTy->getNumElements();
8070 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
8071 for (auto [Idx, I] : enumerate(Mask)) {
8072 if (I == PoisonMaskElem ||
8073 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
8074 continue;
8075 ExtMask[Idx] = SV->getMaskValue(I);
8076 }
8077 bool IsOp1Undef =
8078 isUndefVector(SV->getOperand(0),
8079 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
8080 .all();
8081 bool IsOp2Undef =
8082 isUndefVector(SV->getOperand(1),
8083 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
8084 .all();
8085 if (!IsOp1Undef && !IsOp2Undef) {
8086 // Update mask and mark undef elems.
8087 for (int &I : Mask) {
8088 if (I == PoisonMaskElem)
8089 continue;
8090 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
8091 PoisonMaskElem)
8092 I = PoisonMaskElem;
8093 }
8094 break;
8095 }
8096 SmallVector<int> ShuffleMask(SV->getShuffleMask());
8097 combineMasks(LocalVF, ShuffleMask, Mask);
8098 Mask.swap(ShuffleMask);
8099 if (IsOp2Undef)
8100 Op = SV->getOperand(0);
8101 else
8102 Op = SV->getOperand(1);
8103 }
8104 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
8105 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
8106 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
8107 if (IdentityOp) {
8108 V = IdentityOp;
8109 assert(Mask.size() == IdentityMask.size() &&
8110 "Expected masks of same sizes.");
8111 // Clear known poison elements.
8112 for (auto [I, Idx] : enumerate(Mask))
8113 if (Idx == PoisonMaskElem)
8114 IdentityMask[I] = PoisonMaskElem;
8115 Mask.swap(IdentityMask);
8116 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
8117 return SinglePermute &&
8118 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
8119 /*IsStrict=*/true) ||
8120 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
8121 Shuffle->isZeroEltSplat() &&
8122 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
8123 }
8124 V = Op;
8125 return false;
8126 }
8127 V = Op;
8128 return true;
8129 }
8130
8131 /// Smart shuffle instruction emission, walks through shuffles trees and
8132 /// tries to find the best matching vector for the actual shuffle
8133 /// instruction.
8134 template <typename T, typename ShuffleBuilderTy>
8135 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
8136 ShuffleBuilderTy &Builder) {
8137 assert(V1 && "Expected at least one vector value.");
8138 if (V2)
8139 Builder.resizeToMatch(V1, V2);
8140 int VF = Mask.size();
8141 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
8142 VF = FTy->getNumElements();
8143 if (V2 &&
8144 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
8145 // Peek through shuffles.
8146 Value *Op1 = V1;
8147 Value *Op2 = V2;
8148 int VF =
8149 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8150 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
8151 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
8152 for (int I = 0, E = Mask.size(); I < E; ++I) {
8153 if (Mask[I] < VF)
8154 CombinedMask1[I] = Mask[I];
8155 else
8156 CombinedMask2[I] = Mask[I] - VF;
8157 }
8158 Value *PrevOp1;
8159 Value *PrevOp2;
8160 do {
8161 PrevOp1 = Op1;
8162 PrevOp2 = Op2;
8163 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
8164 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
8165 // Check if we have 2 resizing shuffles - need to peek through operands
8166 // again.
8167 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
8168 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
8169 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
8170 for (auto [Idx, I] : enumerate(CombinedMask1)) {
8171 if (I == PoisonMaskElem)
8172 continue;
8173 ExtMask1[Idx] = SV1->getMaskValue(I);
8174 }
8175 SmallBitVector UseMask1 = buildUseMask(
8176 cast<FixedVectorType>(SV1->getOperand(1)->getType())
8177 ->getNumElements(),
8178 ExtMask1, UseMask::SecondArg);
8179 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
8180 for (auto [Idx, I] : enumerate(CombinedMask2)) {
8181 if (I == PoisonMaskElem)
8182 continue;
8183 ExtMask2[Idx] = SV2->getMaskValue(I);
8184 }
8185 SmallBitVector UseMask2 = buildUseMask(
8186 cast<FixedVectorType>(SV2->getOperand(1)->getType())
8187 ->getNumElements(),
8188 ExtMask2, UseMask::SecondArg);
8189 if (SV1->getOperand(0)->getType() ==
8190 SV2->getOperand(0)->getType() &&
8191 SV1->getOperand(0)->getType() != SV1->getType() &&
8192 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
8193 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
8194 Op1 = SV1->getOperand(0);
8195 Op2 = SV2->getOperand(0);
8196 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
8197 int LocalVF = ShuffleMask1.size();
8198 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
8199 LocalVF = FTy->getNumElements();
8200 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
8201 CombinedMask1.swap(ShuffleMask1);
8202 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
8203 LocalVF = ShuffleMask2.size();
8204 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
8205 LocalVF = FTy->getNumElements();
8206 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
8207 CombinedMask2.swap(ShuffleMask2);
8208 }
8209 }
8210 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8211 Builder.resizeToMatch(Op1, Op2);
8212 VF = std::max(cast<VectorType>(Op1->getType())
8213 ->getElementCount()
8214 .getKnownMinValue(),
8215 cast<VectorType>(Op2->getType())
8216 ->getElementCount()
8217 .getKnownMinValue());
8218 for (int I = 0, E = Mask.size(); I < E; ++I) {
8219 if (CombinedMask2[I] != PoisonMaskElem) {
8220 assert(CombinedMask1[I] == PoisonMaskElem &&
8221 "Expected undefined mask element");
8222 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8223 }
8224 }
8225 if (Op1 == Op2 &&
8226 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
8227 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
8228 isa<ShuffleVectorInst>(Op1) &&
8229 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
8230 ArrayRef(CombinedMask1))))
8231 return Builder.createIdentity(Op1);
8232 return Builder.createShuffleVector(
8233 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
8234 CombinedMask1);
8235 }
8236 if (isa<PoisonValue>(V1))
8237 return Builder.createPoison(
8238 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
8239 SmallVector<int> NewMask(Mask);
8240 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
8241 assert(V1 && "Expected non-null value after looking through shuffles.");
8242
8243 if (!IsIdentity)
8244 return Builder.createShuffleVector(V1, NewMask);
8245 return Builder.createIdentity(V1);
8246 }
8247};
8248} // namespace
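// A worked illustration of the mask folding used by combineMasks and
// peekThroughShuffles above (standalone sketch over plain ints; this is the
// single-operand simplification, whereas the real combineMasks also handles
// two-operand masks via the modulo on LocalVF): composing an outer mask with
// the mask of the shuffle it reads from yields one shuffle of the original
// operand, e.g. applying <1, 0> on top of a <1, 0> shuffle gives the identity
// <0, 1>.
namespace slp_mask_compose_sketch {
constexpr int SketchPoison = -1; // stands in for PoisonMaskElem
// NewMask[I] = Inner[Outer[I]]: the result of shuffling an already shuffled
// vector, expressed directly over the original operand. Poison or
// out-of-range lanes stay poison.
inline void composeMasks(const int *Inner, unsigned InnerSize,
                         const int *Outer, unsigned OuterSize, int *NewMask) {
  for (unsigned I = 0; I < OuterSize; ++I) {
    if (Outer[I] == SketchPoison ||
        static_cast<unsigned>(Outer[I]) >= InnerSize) {
      NewMask[I] = SketchPoison;
      continue;
    }
    NewMask[I] = Inner[Outer[I]];
  }
}
} // namespace slp_mask_compose_sketch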
8249
8250/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
8251static std::pair<InstructionCost, InstructionCost>
8253 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8254 Type *ScalarTy, VectorType *VecTy) {
8255 InstructionCost ScalarCost = 0;
8256 InstructionCost VecCost = 0;
8257 // Here we differentiate two cases: (1) when Ptrs represent a regular
8258 // vectorization tree node (as they are pointer arguments of scattered
8259 // loads) or (2) when Ptrs are the arguments of loads or stores being
8260 // vectorized as a plain wide unit-stride load/store, since all the
8261 // loads/stores are known to be from/to adjacent locations.
8262 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8263 // Case 2: estimate costs for pointer related costs when vectorizing to
8264 // a wide load/store.
8265 // Scalar cost is estimated as a set of pointers with known relationship
8266 // between them.
8267 // For vector code we will use BasePtr as argument for the wide load/store
8268 // but we also need to account for all the instructions which are going to
8269 // stay in vectorized code due to uses outside of these scalar
8270 // loads/stores.
8271 ScalarCost = TTI.getPointersChainCost(
8272 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8273 CostKind);
8274
8275 SmallVector<const Value *> PtrsRetainedInVecCode;
8276 for (Value *V : Ptrs) {
8277 if (V == BasePtr) {
8278 PtrsRetainedInVecCode.push_back(V);
8279 continue;
8280 }
8281 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8282 // For simplicity, assume Ptr stays in vectorized code if it's not a
8283 // GEP instruction. We don't care, since its cost is considered free.
8284 // TODO: We should check for any uses outside of vectorizable tree
8285 // rather than just single use.
8286 if (!Ptr || !Ptr->hasOneUse())
8287 PtrsRetainedInVecCode.push_back(V);
8288 }
8289
8290 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8291 // If all pointers stay in vectorized code then we don't have
8292 // any savings on that.
8293 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8294 }
8295 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8296 TTI::PointersChainInfo::getKnownStride(),
8297 VecTy, CostKind);
8298 } else {
8299 // Case 1: Ptrs are the arguments of loads that we are going to transform
8300 // into a masked gather load intrinsic.
8301 // All the scalar GEPs will be removed as a result of vectorization.
8302 // For any external uses of some lanes, extractelement instructions will
8303 // be generated (their cost is estimated separately).
8304 TTI::PointersChainInfo PtrsInfo =
8305 all_of(Ptrs,
8306 [](const Value *V) {
8307 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8308 return Ptr && !Ptr->hasAllConstantIndices();
8309 })
8310 ? TTI::PointersChainInfo::getUnknownStride()
8311 : TTI::PointersChainInfo::getKnownStride();
8312
8313 ScalarCost =
8314 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8315 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8316 if (!BaseGEP) {
8317 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8318 if (It != Ptrs.end())
8319 BaseGEP = cast<GEPOperator>(*It);
8320 }
8321 if (BaseGEP) {
8322 SmallVector<const Value *> Indices(BaseGEP->indices());
8323 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8324 BaseGEP->getPointerOperand(), Indices, VecTy,
8325 CostKind);
8326 }
8327 }
8328
8329 return std::make_pair(ScalarCost, VecCost);
8330}
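// The intuition behind the scalar-vs-vector comparison above, as a standalone
// sketch (assumed per-address costs, no TTI involved): the scalar form pays
// for every pointer in the chain, while the vectorized form pays only for the
// base pointer plus the GEPs that must stay alive for other users, so there
// is a saving only if some GEPs actually disappear.
namespace slp_gep_cost_sketch {
// Negative result means vectorizing the pointer chain is profitable.
// NumPtrs counts all scalar addresses; RetainedPtrs counts the base pointer
// plus GEPs that keep scalar uses after vectorization; UnitCost is an assumed
// per-address cost.
inline int chainCostDelta(unsigned NumPtrs, unsigned RetainedPtrs,
                          int UnitCost) {
  const int ScalarCost = static_cast<int>(NumPtrs) * UnitCost;
  const int VecCost = static_cast<int>(RetainedPtrs) * UnitCost;
  return VecCost - ScalarCost;
}
} // namespace slp_gep_cost_sketch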
8331
8334 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8335 TreeEntry &E = *TE;
8336 switch (E.getOpcode()) {
8337 case Instruction::Load: {
8338 // No need to reorder masked gather loads, just reorder the scalar
8339 // operands.
8340 if (E.State != TreeEntry::Vectorize)
8341 break;
8342 Type *ScalarTy = E.getMainOp()->getType();
8343 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8344 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8345 // Check if profitable to represent consecutive load + reverse as strided
8346 // load with stride -1.
8347 if (isReverseOrder(E.ReorderIndices) &&
8348 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8349 SmallVector<int> Mask;
8350 inversePermutation(E.ReorderIndices, Mask);
8351 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8352 InstructionCost OriginalVecCost =
8353 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8358 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8359 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8360 if (StridedCost < OriginalVecCost)
8361 // Strided load is more profitable than consecutive load + reverse -
8362 // transform the node to strided load.
8363 E.State = TreeEntry::StridedVectorize;
8364 }
8365 break;
8366 }
8367 case Instruction::Store: {
8368 Type *ScalarTy =
8369 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8370 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8371 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8372 // Check if profitable to represent consecutive store + reverse as strided
8373 // store with stride -1.
8374 if (isReverseOrder(E.ReorderIndices) &&
8375 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8376 SmallVector<int> Mask;
8377 inversePermutation(E.ReorderIndices, Mask);
8378 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8379 InstructionCost OriginalVecCost =
8380 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8385 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8386 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8387 if (StridedCost < OriginalVecCost)
8388 // Strided store is more profitable than consecutive store + reverse -
8389 // transform the node to strided store.
8390 E.State = TreeEntry::StridedVectorize;
8391 }
8392 break;
8393 }
8394 case Instruction::Select: {
8395 if (E.State != TreeEntry::Vectorize)
8396 break;
8397 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
8398 if (MinMaxID == Intrinsic::not_intrinsic)
8399 break;
8400 // This node is a minmax node.
8401 E.CombinedOp = TreeEntry::MinMax;
8402 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
8403 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
8404 CondEntry->State == TreeEntry::Vectorize) {
8405 // The condition node is part of the combined minmax node.
8406 CondEntry->State = TreeEntry::CombinedVectorize;
8407 }
8408 break;
8409 }
8410 default:
8411 break;
8412 }
8413 }
8414}
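// The load/store cases above compare two lowerings of a reversed bundle: a
// consecutive wide memory access followed by a reverse shuffle, versus a
// single strided access with stride -1. A standalone sketch of that decision
// (all costs are assumed inputs, no TTI involved):
namespace slp_strided_sketch {
// Mirrors the "StridedCost < OriginalVecCost" checks above: pick the strided
// form only when it is strictly cheaper than wide access + reverse shuffle.
inline bool preferStridedAccess(int WideAccessCost, int ReverseShuffleCost,
                                int StridedAccessCost) {
  const int OriginalVecCost = WideAccessCost + ReverseShuffleCost;
  return StridedAccessCost < OriginalVecCost;
}
} // namespace slp_strided_sketch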
8415
8416/// Merges shuffle masks and emits final shuffle instruction, if required. It
8417/// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
8418/// when the actual shuffle instruction is generated only if this is actually
8419/// required. Otherwise, the shuffle instruction emission is delayed till the
8420/// end of the process, to reduce the number of emitted instructions and further
8421/// analysis/transformations.
8422class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8423 bool IsFinalized = false;
8424 SmallVector<int> CommonMask;
8425 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8426 const TargetTransformInfo &TTI;
8427 InstructionCost Cost = 0;
8428 SmallDenseSet<Value *> VectorizedVals;
8429 BoUpSLP &R;
8430 SmallPtrSetImpl<Value *> &CheckedExtracts;
8431 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8432 /// While set, we are still trying to estimate the cost for the same nodes
8433 /// and can delay the actual cost estimation (virtual shuffle instruction
8434 /// emission). This may help to better estimate the cost if the same nodes
8435 /// must be permuted, and allows moving most of the long shuffle cost estimation to TTI.
8436 bool SameNodesEstimated = true;
8437
8438 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8439 if (Ty->getScalarType()->isPointerTy()) {
8443 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8444 Ty->getScalarType());
8445 if (auto *VTy = dyn_cast<VectorType>(Ty))
8446 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8447 return Res;
8448 }
8449 return Constant::getAllOnesValue(Ty);
8450 }
8451
8452 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8453 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8454 return TTI::TCC_Free;
8455 auto *VecTy = getWidenedType(ScalarTy, VL.size());
8456 InstructionCost GatherCost = 0;
8457 SmallVector<Value *> Gathers(VL);
8458 // Improve gather cost for gather of loads, if we can group some of the
8459 // loads into vector loads.
8460 InstructionsState S = getSameOpcode(VL, *R.TLI);
8461 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8462 unsigned MinVF = R.getMinVF(2 * Sz);
8463 if (VL.size() > 2 &&
8464 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8465 (InVectors.empty() &&
8466 any_of(seq<unsigned>(0, VL.size() / MinVF),
8467 [&](unsigned Idx) {
8468 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8469 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8470 return S.getOpcode() == Instruction::Load &&
8471 !S.isAltShuffle();
8472 }))) &&
8473 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8474 !isSplat(Gathers)) {
8475 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8476 SetVector<Value *> VectorizedLoads;
8477 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8478 SmallVector<unsigned> ScatterVectorized;
8479 unsigned StartIdx = 0;
8480 unsigned VF = VL.size() / 2;
8481 for (; VF >= MinVF; VF /= 2) {
8482 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8483 Cnt += VF) {
8484 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8485 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8486 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8487 if (SliceS.getOpcode() != Instruction::Load ||
8488 SliceS.isAltShuffle())
8489 continue;
8490 }
8491 if (!VectorizedLoads.count(Slice.front()) &&
8492 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8493 SmallVector<Value *> PointerOps;
8494 OrdersType CurrentOrder;
8495 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8496 CurrentOrder, PointerOps);
8497 switch (LS) {
8498 case LoadsState::Vectorize:
8499 case LoadsState::ScatterVectorize:
8500 case LoadsState::StridedVectorize:
8501 // Mark the vectorized loads so that we don't vectorize them
8502 // again.
8503 // TODO: better handling of loads with reorders.
8504 if (((LS == LoadsState::Vectorize ||
8505 LS == LoadsState::StridedVectorize) &&
8506 CurrentOrder.empty()) ||
8507 (LS == LoadsState::StridedVectorize &&
8508 isReverseOrder(CurrentOrder)))
8509 VectorizedStarts.emplace_back(Cnt, LS);
8510 else
8511 ScatterVectorized.push_back(Cnt);
8512 VectorizedLoads.insert(Slice.begin(), Slice.end());
8513 // If we vectorized initial block, no need to try to vectorize
8514 // it again.
8515 if (Cnt == StartIdx)
8516 StartIdx += VF;
8517 break;
8518 case LoadsState::Gather:
8519 break;
8520 }
8521 }
8522 }
8523 // Check if the whole array was vectorized already - exit.
8524 if (StartIdx >= VL.size())
8525 break;
8526 // Found vectorizable parts - exit.
8527 if (!VectorizedLoads.empty())
8528 break;
8529 }
8530 if (!VectorizedLoads.empty()) {
8531 unsigned NumParts = TTI.getNumberOfParts(VecTy);
8532 bool NeedInsertSubvectorAnalysis =
8533 !NumParts || (VL.size() / VF) > NumParts;
8534 // Get the cost for gathered loads.
8535 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8536 if (VectorizedLoads.contains(VL[I]))
8537 continue;
8538 GatherCost +=
8539 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8540 }
8541 // Exclude potentially vectorized loads from list of gathered
8542 // scalars.
8543 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8544 // The cost for vectorized loads.
8545 InstructionCost ScalarsCost = 0;
8546 for (Value *V : VectorizedLoads) {
8547 auto *LI = cast<LoadInst>(V);
8548 ScalarsCost +=
8549 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8550 LI->getAlign(), LI->getPointerAddressSpace(),
8551 CostKind, TTI::OperandValueInfo(), LI);
8552 }
8553 auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
8554 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8555 auto *LI = cast<LoadInst>(VL[P.first]);
8556 Align Alignment = LI->getAlign();
8557 GatherCost +=
8558 P.second == LoadsState::Vectorize
8559 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8560 LI->getPointerAddressSpace(), CostKind,
8563 Instruction::Load, LoadTy, LI->getPointerOperand(),
8564 /*VariableMask=*/false, Alignment, CostKind, LI);
8565 // Add external uses costs.
8566 for (auto [Idx, V] : enumerate(VL.slice(
8567 P.first, std::min<unsigned>(VL.size() - P.first, VF))))
8568 if (!R.areAllUsersVectorized(cast<Instruction>(V)))
8569 GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
8570 LoadTy, CostKind, Idx);
8571 // Estimate GEP cost.
8572 SmallVector<Value *> PointerOps(VF);
8573 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8574 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8575 auto [ScalarGEPCost, VectorGEPCost] =
8576 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8577 Instruction::Load, CostKind, LI->getType(), LoadTy);
8578 GatherCost += VectorGEPCost - ScalarGEPCost;
8579 }
8580 for (unsigned P : ScatterVectorized) {
8581 auto *LI0 = cast<LoadInst>(VL[P]);
8582 ArrayRef<Value *> Slice = VL.slice(P, VF);
8583 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8584 GatherCost += TTI.getGatherScatterOpCost(
8585 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8586 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8587 // Estimate GEP cost.
8588 SmallVector<Value *> PointerOps(VF);
8589 for (auto [I, V] : enumerate(Slice))
8590 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8591 OrdersType Order;
8592 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8593 Order)) {
8594 // TODO: improve checks if GEPs can be vectorized.
8595 Value *Ptr0 = PointerOps.front();
8596 Type *ScalarTy = Ptr0->getType();
8597 auto *VecTy = getWidenedType(ScalarTy, VF);
8598 auto [ScalarGEPCost, VectorGEPCost] =
8599 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8600 CostKind, ScalarTy, VecTy);
8601 GatherCost += VectorGEPCost - ScalarGEPCost;
8602 if (!Order.empty()) {
8603 SmallVector<int> Mask;
8604 inversePermutation(Order, Mask);
8605 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8606 VecTy, Mask, CostKind);
8607 }
8608 } else {
8609 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8610 PointerOps.front()->getType());
8611 }
8612 }
8613 if (NeedInsertSubvectorAnalysis) {
8614 // Add the cost for the subvectors insert.
8615 SmallVector<int> ShuffleMask(VL.size());
8616 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8617 for (unsigned Idx : seq<unsigned>(0, E))
8618 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8619 GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
8620 ShuffleMask, CostKind, I, LoadTy);
8621 }
8622 }
8623 GatherCost -= ScalarsCost;
8624 }
8625 GatherCost = std::min(BaseCost, GatherCost);
8626 } else if (!Root && isSplat(VL)) {
8627 // Found the broadcasting of the single scalar, calculate the cost as
8628 // the broadcast.
8629 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8630 assert(It != VL.end() && "Expected at least one non-undef value.");
8631 // Add broadcast for non-identity shuffle only.
8632 bool NeedShuffle =
8633 count(VL, *It) > 1 &&
8634 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8635 if (!NeedShuffle)
8636 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8637 CostKind, std::distance(VL.begin(), It),
8638 PoisonValue::get(VecTy), *It);
8639
8640 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8641 transform(VL, ShuffleMask.begin(), [](Value *V) {
8642 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8643 });
8644 InstructionCost InsertCost =
8645 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8646 PoisonValue::get(VecTy), *It);
8647 return InsertCost + ::getShuffleCost(TTI,
8648 TTI::SK_Broadcast,
8649 VecTy, ShuffleMask, CostKind,
8650 /*Index=*/0, /*SubTp=*/nullptr,
8651 /*Args=*/*It);
8652 }
8653 return GatherCost +
8654 (all_of(Gathers, IsaPred<UndefValue>)
8655 ? TTI::TCC_Free
8656 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8657 ScalarTy));
8658 };
8659
8660 /// Compute the cost of creating a vector containing the extracted values from
8661 /// \p VL.
8663 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8664 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8665 unsigned NumParts) {
8666 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8667 unsigned NumElts =
8668 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8669 auto *EE = dyn_cast<ExtractElementInst>(V);
8670 if (!EE)
8671 return Sz;
8672 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8673 if (!VecTy)
8674 return Sz;
8675 return std::max(Sz, VecTy->getNumElements());
8676 });
8677 // FIXME: this must be moved to TTI for better estimation.
8678 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
8679 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8680 SmallVectorImpl<unsigned> &Indices)
8681 -> std::optional<TTI::ShuffleKind> {
8682 if (NumElts <= EltsPerVector)
8683 return std::nullopt;
8684 int OffsetReg0 =
8685 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8686 [](int S, int I) {
8687 if (I == PoisonMaskElem)
8688 return S;
8689 return std::min(S, I);
8690 }),
8691 EltsPerVector);
8692 int OffsetReg1 = OffsetReg0;
8693 DenseSet<int> RegIndices;
8694 // Check if this is a permutation of the same one or two input vectors.
8695 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8696 int FirstRegId = -1;
8697 Indices.assign(1, OffsetReg0);
8698 for (auto [Pos, I] : enumerate(Mask)) {
8699 if (I == PoisonMaskElem)
8700 continue;
8701 int Idx = I - OffsetReg0;
8702 int RegId =
8703 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8704 if (FirstRegId < 0)
8705 FirstRegId = RegId;
8706 RegIndices.insert(RegId);
8707 if (RegIndices.size() > 2)
8708 return std::nullopt;
8709 if (RegIndices.size() == 2) {
8710 ShuffleKind = TTI::SK_PermuteTwoSrc;
8711 if (Indices.size() == 1) {
8712 OffsetReg1 = alignDown(
8713 std::accumulate(
8714 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8715 [&](int S, int I) {
8716 if (I == PoisonMaskElem)
8717 return S;
8718 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8719 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8720 if (RegId == FirstRegId)
8721 return S;
8722 return std::min(S, I);
8723 }),
8724 EltsPerVector);
8725 Indices.push_back(OffsetReg1 % NumElts);
8726 }
8727 Idx = I - OffsetReg1;
8728 }
8729 I = (Idx % NumElts) % EltsPerVector +
8730 (RegId == FirstRegId ? 0 : EltsPerVector);
8731 }
8732 return ShuffleKind;
8733 };
8735
8736 // Process extracts in blocks of EltsPerVector to check if the source vector
8737 // operand can be re-used directly. If not, add the cost of creating a
8738 // shuffle to extract the values into a vector register.
8739 for (unsigned Part : seq<unsigned>(NumParts)) {
8740 if (!ShuffleKinds[Part])
8741 continue;
8742 ArrayRef<int> MaskSlice = Mask.slice(
8743 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
8744 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8745 copy(MaskSlice, SubMask.begin());
8746 SmallVector<unsigned> Indices;
8747 std::optional<TTI::ShuffleKind> RegShuffleKind =
8748 CheckPerRegistersShuffle(SubMask, Indices);
8749 if (!RegShuffleKind) {
8750 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8751 !ShuffleVectorInst::isIdentityMask(
8752 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
8753 Cost +=
8754 ::getShuffleCost(TTI, *ShuffleKinds[Part],
8755 getWidenedType(ScalarTy, NumElts), MaskSlice);
8756 continue;
8757 }
8758 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8759 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8760 Cost +=
8761 ::getShuffleCost(TTI, *RegShuffleKind,
8762 getWidenedType(ScalarTy, EltsPerVector), SubMask);
8763 }
8764 for (unsigned Idx : Indices) {
8765 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8766 "SK_ExtractSubvector index out of range");
8767 Cost += ::getShuffleCost(
8768 TTI, TTI::SK_ExtractSubvector,
8769 getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
8770 std::nullopt, CostKind, Idx,
8771 getWidenedType(ScalarTy, EltsPerVector));
8772 }
8773 // Second attempt to check, if just a permute is better estimated than
8774 // subvector extract.
8775 SubMask.assign(NumElts, PoisonMaskElem);
8776 copy(MaskSlice, SubMask.begin());
8777 InstructionCost OriginalCost = ::getShuffleCost(
8778 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
8779 if (OriginalCost < Cost)
8780 Cost = OriginalCost;
8781 }
8782 return Cost;
8783 }
8784 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8785 /// shuffle emission.
8786 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8787 ArrayRef<int> Mask) {
8788 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8789 if (Mask[Idx] != PoisonMaskElem)
8790 CommonMask[Idx] = Idx;
8791 }
8792 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
8793 /// given mask \p Mask, for register number \p Part, which includes
8794 /// \p SliceSize elements.
8795 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8796 ArrayRef<int> Mask, unsigned Part,
8797 unsigned SliceSize) {
8798 if (SameNodesEstimated) {
8799 // Delay the cost estimation if the same nodes are being reshuffled.
8800 // If we already requested the cost of reshuffling E1 and E2 before, there
8801 // is no need to estimate another cost with the sub-Mask; instead, include
8802 // this sub-Mask into the CommonMask to estimate it later and avoid double
8803 // cost estimation.
8804 if ((InVectors.size() == 2 &&
8805 InVectors.front().get<const TreeEntry *>() == &E1 &&
8806 InVectors.back().get<const TreeEntry *>() == E2) ||
8807 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8808 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
8809 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8810 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8811 "Expected all poisoned elements.");
8812 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
8813 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8814 return;
8815 }
8816 // Found non-matching nodes - need to estimate the cost for the matched
8817 // nodes and transform the mask.
8818 Cost += createShuffle(InVectors.front(),
8819 InVectors.size() == 1 ? nullptr : InVectors.back(),
8820 CommonMask);
8821 transformMaskAfterShuffle(CommonMask, CommonMask);
8822 }
8823 SameNodesEstimated = false;
8824 if (!E2 && InVectors.size() == 1) {
8825 unsigned VF = E1.getVectorFactor();
8826 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8827 VF = std::max(VF,
8828 cast<FixedVectorType>(V1->getType())->getNumElements());
8829 } else {
8830 const auto *E = InVectors.front().get<const TreeEntry *>();
8831 VF = std::max(VF, E->getVectorFactor());
8832 }
8833 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8834 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8835 CommonMask[Idx] = Mask[Idx] + VF;
8836 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8837 transformMaskAfterShuffle(CommonMask, CommonMask);
8838 } else {
8839 Cost += createShuffle(&E1, E2, Mask);
8840 transformMaskAfterShuffle(CommonMask, Mask);
8841 }
8842 }
8843
8844 class ShuffleCostBuilder {
8845 const TargetTransformInfo &TTI;
8846
8847 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8848 int Index = -1;
8849 return Mask.empty() ||
8850 (VF == Mask.size() &&
8851 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8852 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8853 Index == 0);
8854 }
8855
8856 public:
8857 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8858 ~ShuffleCostBuilder() = default;
8859 InstructionCost createShuffleVector(Value *V1, Value *,
8860 ArrayRef<int> Mask) const {
8861 // An empty mask or an identity mask is free.
8862 unsigned VF =
8863 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8864 if (isEmptyOrIdentity(Mask, VF))
8865 return TTI::TCC_Free;
8866 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8867 cast<VectorType>(V1->getType()), Mask);
8868 }
8869 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8870 // An empty mask or an identity mask is free.
8871 unsigned VF =
8872 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8873 if (isEmptyOrIdentity(Mask, VF))
8874 return TTI::TCC_Free;
8875 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8876 cast<VectorType>(V1->getType()), Mask);
8877 }
8878 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8879 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8880 return TTI::TCC_Free;
8881 }
8882 void resizeToMatch(Value *&, Value *&) const {}
8883 };
8884
8885 /// Smart shuffle instruction emission, walks through shuffles trees and
8886 /// tries to find the best matching vector for the actual shuffle
8887 /// instruction.
8888 InstructionCost
8889 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8890 const PointerUnion<Value *, const TreeEntry *> &P2,
8891 ArrayRef<int> Mask) {
8892 ShuffleCostBuilder Builder(TTI);
8893 SmallVector<int> CommonMask(Mask);
8894 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8895 unsigned CommonVF = Mask.size();
8896 InstructionCost ExtraCost = 0;
8897 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8898 unsigned VF) -> InstructionCost {
8899 if (E.isGather() && allConstant(E.Scalars))
8900 return TTI::TCC_Free;
8901 Type *EScalarTy = E.Scalars.front()->getType();
8902 bool IsSigned = true;
8903 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8904 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8905 IsSigned = It->second.second;
8906 }
8907 if (EScalarTy != ScalarTy) {
8908 unsigned CastOpcode = Instruction::Trunc;
8909 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8910 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8911 if (DstSz > SrcSz)
8912 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8913 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
8914 getWidenedType(EScalarTy, VF),
8915 TTI::CastContextHint::None, CostKind);
8916 }
8917 return TTI::TCC_Free;
8918 };
8919 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8920 if (isa<Constant>(V))
8921 return TTI::TCC_Free;
8922 auto *VecTy = cast<VectorType>(V->getType());
8923 Type *EScalarTy = VecTy->getElementType();
8924 if (EScalarTy != ScalarTy) {
8925 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8926 unsigned CastOpcode = Instruction::Trunc;
8927 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8928 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8929 if (DstSz > SrcSz)
8930 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8931 return TTI.getCastInstrCost(
8932 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8933 VecTy, TTI::CastContextHint::None, CostKind);
8934 }
8935 return TTI::TCC_Free;
8936 };
8937 if (!V1 && !V2 && !P2.isNull()) {
8938 // Shuffle 2 entry nodes.
8939 const TreeEntry *E = P1.get<const TreeEntry *>();
8940 unsigned VF = E->getVectorFactor();
8941 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8942 CommonVF = std::max(VF, E2->getVectorFactor());
8943 assert(all_of(Mask,
8944 [=](int Idx) {
8945 return Idx < 2 * static_cast<int>(CommonVF);
8946 }) &&
8947 "All elements in mask must be less than 2 * CommonVF.");
8948 if (E->Scalars.size() == E2->Scalars.size()) {
8949 SmallVector<int> EMask = E->getCommonMask();
8950 SmallVector<int> E2Mask = E2->getCommonMask();
8951 if (!EMask.empty() || !E2Mask.empty()) {
8952 for (int &Idx : CommonMask) {
8953 if (Idx == PoisonMaskElem)
8954 continue;
8955 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8956 Idx = EMask[Idx];
8957 else if (Idx >= static_cast<int>(CommonVF))
8958 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8959 E->Scalars.size();
8960 }
8961 }
8962 CommonVF = E->Scalars.size();
8963 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8964 GetNodeMinBWAffectedCost(*E2, CommonVF);
8965 } else {
8966 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8967 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8968 }
8969 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8970 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8971 } else if (!V1 && P2.isNull()) {
8972 // Shuffle single entry node.
8973 const TreeEntry *E = P1.get<const TreeEntry *>();
8974 unsigned VF = E->getVectorFactor();
8975 CommonVF = VF;
8976 assert(
8977 all_of(Mask,
8978 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8979 "All elements in mask must be less than CommonVF.");
8980 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8981 SmallVector<int> EMask = E->getCommonMask();
8982 assert(!EMask.empty() && "Expected non-empty common mask.");
8983 for (int &Idx : CommonMask) {
8984 if (Idx != PoisonMaskElem)
8985 Idx = EMask[Idx];
8986 }
8987 CommonVF = E->Scalars.size();
8988 }
8989 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8990 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8991 // Not identity/broadcast? Try to see if the original vector is better.
8992 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8993 CommonVF == CommonMask.size() &&
8994 any_of(enumerate(CommonMask),
8995 [](const auto &&P) {
8996 return P.value() != PoisonMaskElem &&
8997 static_cast<unsigned>(P.value()) != P.index();
8998 }) &&
8999 any_of(CommonMask,
9000 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
9001 SmallVector<int> ReorderMask;
9002 inversePermutation(E->ReorderIndices, ReorderMask);
9003 ::addMask(CommonMask, ReorderMask);
9004 }
9005 } else if (V1 && P2.isNull()) {
9006 // Shuffle single vector.
9007 ExtraCost += GetValueMinBWAffectedCost(V1);
9008 CommonVF = getVF(V1);
9009 assert(
9010 all_of(Mask,
9011 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
9012 "All elements in mask must be less than CommonVF.");
9013 } else if (V1 && !V2) {
9014 // Shuffle vector and tree node.
9015 unsigned VF = getVF(V1);
9016 const TreeEntry *E2 = P2.get<const TreeEntry *>();
9017 CommonVF = std::max(VF, E2->getVectorFactor());
9018 assert(all_of(Mask,
9019 [=](int Idx) {
9020 return Idx < 2 * static_cast<int>(CommonVF);
9021 }) &&
9022 "All elements in mask must be less than 2 * CommonVF.");
9023 if (E2->Scalars.size() == VF && VF != CommonVF) {
9024 SmallVector<int> E2Mask = E2->getCommonMask();
9025 assert(!E2Mask.empty() && "Expected non-empty common mask.");
9026 for (int &Idx : CommonMask) {
9027 if (Idx == PoisonMaskElem)
9028 continue;
9029 if (Idx >= static_cast<int>(CommonVF))
9030 Idx = E2Mask[Idx - CommonVF] + VF;
9031 }
9032 CommonVF = VF;
9033 }
9034 ExtraCost += GetValueMinBWAffectedCost(V1);
9035 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9036 ExtraCost += GetNodeMinBWAffectedCost(
9037 *E2, std::min(CommonVF, E2->getVectorFactor()));
9038 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9039 } else if (!V1 && V2) {
9040 // Shuffle vector and tree node.
9041 unsigned VF = getVF(V2);
9042 const TreeEntry *E1 = P1.get<const TreeEntry *>();
9043 CommonVF = std::max(VF, E1->getVectorFactor());
9044 assert(all_of(Mask,
9045 [=](int Idx) {
9046 return Idx < 2 * static_cast<int>(CommonVF);
9047 }) &&
9048 "All elements in mask must be less than 2 * CommonVF.");
9049 if (E1->Scalars.size() == VF && VF != CommonVF) {
9050 SmallVector<int> E1Mask = E1->getCommonMask();
9051 assert(!E1Mask.empty() && "Expected non-empty common mask.");
9052 for (int &Idx : CommonMask) {
9053 if (Idx == PoisonMaskElem)
9054 continue;
9055 if (Idx >= static_cast<int>(CommonVF))
9056 Idx = E1Mask[Idx - CommonVF] + VF;
9057 else
9058 Idx = E1Mask[Idx];
9059 }
9060 CommonVF = VF;
9061 }
9062 ExtraCost += GetNodeMinBWAffectedCost(
9063 *E1, std::min(CommonVF, E1->getVectorFactor()));
9064 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9065 ExtraCost += GetValueMinBWAffectedCost(V2);
9066 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9067 } else {
9068 assert(V1 && V2 && "Expected both vectors.");
9069 unsigned VF = getVF(V1);
9070 CommonVF = std::max(VF, getVF(V2));
9071 assert(all_of(Mask,
9072 [=](int Idx) {
9073 return Idx < 2 * static_cast<int>(CommonVF);
9074 }) &&
9075 "All elements in mask must be less than 2 * CommonVF.");
9076 ExtraCost +=
9077 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
9078 if (V1->getType() != V2->getType()) {
9079 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9080 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9081 } else {
9082 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
9083 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9084 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
9085 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9086 }
9087 }
9088 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9089 assert(SLPReVec && "FixedVectorType is not expected.");
9091 CommonMask);
9092 }
9093 InVectors.front() =
9094 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
9095 if (InVectors.size() == 2)
9096 InVectors.pop_back();
9097 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
9098 V1, V2, CommonMask, Builder);
9099 }
9100
9101public:
9103 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
9104 SmallPtrSetImpl<Value *> &CheckedExtracts)
9105 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
9106 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
9107 CheckedExtracts(CheckedExtracts) {}
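/// Estimates the cost of a gather of extractelements: extracts whose users
/// are all going to be vectorized get a cost credit (they become dead), and
/// the common vector base is returned so the node can be modeled as a
/// shuffle of the original source vectors.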
9108 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
9109 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
9110 unsigned NumParts, bool &UseVecBaseAsInput) {
9111 UseVecBaseAsInput = false;
9112 if (Mask.empty())
9113 return nullptr;
9114 Value *VecBase = nullptr;
9115 ArrayRef<Value *> VL = E->Scalars;
9116 // If the resulting type is scalarized, do not adjust the cost.
9117 if (NumParts == VL.size())
9118 return nullptr;
9119 // Check if the extracts can be considered reused, i.e. if the same
9120 // extractelements were already vectorized.
9121 bool PrevNodeFound = any_of(
9122 ArrayRef(R.VectorizableTree).take_front(E->Idx),
9123 [&](const std::unique_ptr<TreeEntry> &TE) {
9124 return ((!TE->isAltShuffle() &&
9125 TE->getOpcode() == Instruction::ExtractElement) ||
9126 TE->isGather()) &&
9127 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
9128 return VL.size() > Data.index() &&
9129 (Mask[Data.index()] == PoisonMaskElem ||
9130 isa<UndefValue>(VL[Data.index()]) ||
9131 Data.value() == VL[Data.index()]);
9132 });
9133 });
9134 SmallPtrSet<Value *, 4> UniqueBases;
9135 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
9136 for (unsigned Part : seq<unsigned>(NumParts)) {
9137 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
9138 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
9139 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
9140 // Ignore non-extractelement scalars.
9141 if (isa<UndefValue>(V) ||
9142 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
9143 continue;
9144 // If all users of the instruction are going to be vectorized and this
9145 // instruction itself is not going to be vectorized, consider this
9146 // instruction as dead and remove its cost from the final cost of the
9147 // vectorized tree.
9148 // Also, avoid adjusting the cost for extractelements with multiple uses
9149 // in different graph entries.
9150 auto *EE = cast<ExtractElementInst>(V);
9151 VecBase = EE->getVectorOperand();
9152 UniqueBases.insert(VecBase);
9153 const TreeEntry *VE = R.getTreeEntry(V);
9154 if (!CheckedExtracts.insert(V).second ||
9155 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
9156 any_of(EE->users(),
9157 [&](User *U) {
9158 return isa<GetElementPtrInst>(U) &&
9159 !R.areAllUsersVectorized(cast<Instruction>(U),
9160 &VectorizedVals);
9161 }) ||
9162 (VE && VE != E))
9163 continue;
9164 std::optional<unsigned> EEIdx = getExtractIndex(EE);
9165 if (!EEIdx)
9166 continue;
9167 unsigned Idx = *EEIdx;
9168 // Take credit for the instruction that will become dead.
9169 if (EE->hasOneUse() || !PrevNodeFound) {
9170 Instruction *Ext = EE->user_back();
9171 if (isa<SExtInst, ZExtInst>(Ext) &&
9172 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9173 // Use getExtractWithExtendCost() to calculate the cost of
9174 // extractelement/ext pair.
9175 Cost -=
9176 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
9177 EE->getVectorOperandType(), Idx);
9178 // Add back the cost of s|zext which is subtracted separately.
9179 Cost += TTI.getCastInstrCost(
9180 Ext->getOpcode(), Ext->getType(), EE->getType(),
9181 TTI::getCastContextHint(Ext), CostKind, Ext);
9182 continue;
9183 }
9184 }
9185 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
9186 CostKind, Idx);
9187 }
9188 }
9189 // Check that the gather of extractelements can be represented as just a
9190 // shuffle of one or two vectors from which the scalars are extracted.
9191 // We found the bunch of extractelement instructions that must be gathered
9192 // into a vector and that can be represented as a permutation of the elements
9193 // of a single input vector or of 2 input vectors.
9194 // The extract cost is not added again for reuses, i.e. if the same extractelements were vectorized already.
9195 if (!PrevNodeFound)
9196 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
9197 InVectors.assign(1, E);
9198 CommonMask.assign(Mask.begin(), Mask.end());
9199 transformMaskAfterShuffle(CommonMask, CommonMask);
9200 SameNodesEstimated = false;
9201 if (NumParts != 1 && UniqueBases.size() != 1) {
9202 UseVecBaseAsInput = true;
9203 VecBase =
9204 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
9205 }
9206 return VecBase;
9207 }
9208 /// Checks if the specified entry \p E needs to be delayed because of its
9209 /// dependency nodes.
9210 std::optional<InstructionCost>
9211 needToDelay(const TreeEntry *,
9213 // No need to delay the cost estimation during analysis.
9214 return std::nullopt;
9215 }
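/// Adds 2 input tree entries and the mask for their shuffling to the cost
/// estimation.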
9216 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9217 if (&E1 == &E2) {
9218 assert(all_of(Mask,
9219 [&](int Idx) {
9220 return Idx < static_cast<int>(E1.getVectorFactor());
9221 }) &&
9222 "Expected single vector shuffle mask.");
9223 add(E1, Mask);
9224 return;
9225 }
9226 if (InVectors.empty()) {
9227 CommonMask.assign(Mask.begin(), Mask.end());
9228 InVectors.assign({&E1, &E2});
9229 return;
9230 }
9231 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9232 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9233 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9234 if (NumParts == 0 || NumParts >= Mask.size())
9235 NumParts = 1;
9236 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9237 const auto *It =
9238 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9239 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9240 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9241 }
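/// Adds a single input tree entry and its shuffle mask to the cost estimation.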
9242 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9243 if (InVectors.empty()) {
9244 CommonMask.assign(Mask.begin(), Mask.end());
9245 InVectors.assign(1, &E1);
9246 return;
9247 }
9248 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9249 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9250 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9251 if (NumParts == 0 || NumParts >= Mask.size())
9252 NumParts = 1;
9253 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9254 const auto *It =
9255 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9256 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9257 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
9258 if (!SameNodesEstimated && InVectors.size() == 1)
9259 InVectors.emplace_back(&E1);
9260 }
9261 /// Adds 2 input vectors and the mask for their shuffling.
9262 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
9263 // May come only for shuffling of 2 vectors with extractelements, already
9264 // handled in adjustExtracts.
9265 assert(InVectors.size() == 1 &&
9266 all_of(enumerate(CommonMask),
9267 [&](auto P) {
9268 if (P.value() == PoisonMaskElem)
9269 return Mask[P.index()] == PoisonMaskElem;
9270 auto *EI =
9271 cast<ExtractElementInst>(InVectors.front()
9272 .get<const TreeEntry *>()
9273 ->Scalars[P.index()]);
9274 return EI->getVectorOperand() == V1 ||
9275 EI->getVectorOperand() == V2;
9276 }) &&
9277 "Expected extractelement vectors.");
9278 }
9279 /// Adds one more input vector and the mask for the shuffling.
9280 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9281 if (InVectors.empty()) {
9282 assert(CommonMask.empty() && !ForExtracts &&
9283 "Expected empty input mask/vectors.");
9284 CommonMask.assign(Mask.begin(), Mask.end());
9285 InVectors.assign(1, V1);
9286 return;
9287 }
9288 if (ForExtracts) {
9289 // No need to add vectors here, they were already handled in adjustExtracts.
9290 assert(InVectors.size() == 1 &&
9291 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9292 all_of(enumerate(CommonMask),
9293 [&](auto P) {
9294 Value *Scalar = InVectors.front()
9295 .get<const TreeEntry *>()
9296 ->Scalars[P.index()];
9297 if (P.value() == PoisonMaskElem)
9298 return P.value() == Mask[P.index()] ||
9299 isa<UndefValue>(Scalar);
9300 if (isa<Constant>(V1))
9301 return true;
9302 auto *EI = cast<ExtractElementInst>(Scalar);
9303 return EI->getVectorOperand() == V1;
9304 }) &&
9305 "Expected only tree entry for extractelement vectors.");
9306 return;
9307 }
9308 assert(!InVectors.empty() && !CommonMask.empty() &&
9309 "Expected only tree entries from extracts/reused buildvectors.");
9310 unsigned VF = getVF(V1);
9311 if (InVectors.size() == 2) {
9312 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
9313 transformMaskAfterShuffle(CommonMask, CommonMask);
9314 VF = std::max<unsigned>(VF, CommonMask.size());
9315 } else if (const auto *InTE =
9316 InVectors.front().dyn_cast<const TreeEntry *>()) {
9317 VF = std::max(VF, InTE->getVectorFactor());
9318 } else {
9319 VF = std::max(
9320 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
9321 ->getNumElements());
9322 }
9323 InVectors.push_back(V1);
9324 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9325 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9326 CommonMask[Idx] = Mask[Idx] + VF;
9327 }
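/// Estimates the cost of building a vector from the scalars in \p VL and
/// returns a constant placeholder that stands in for the gathered vector
/// during the rest of the cost analysis.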
9328 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9329 Value *Root = nullptr) {
9330 Cost += getBuildVectorCost(VL, Root);
9331 if (!Root) {
9332 // FIXME: Need to find a way to avoid use of getNullValue here.
9334 unsigned VF = VL.size();
9335 if (MaskVF != 0)
9336 VF = std::min(VF, MaskVF);
9337 for (Value *V : VL.take_front(VF)) {
9338 if (isa<UndefValue>(V)) {
9339 Vals.push_back(cast<Constant>(V));
9340 continue;
9341 }
9342 Vals.push_back(Constant::getNullValue(V->getType()));
9343 }
9344 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
9345 assert(SLPReVec && "FixedVectorType is not expected.");
9346 // When REVEC is enabled, we need to expand vector types into scalar
9347 // types.
9348 unsigned VecTyNumElements = VecTy->getNumElements();
9349 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
9350 for (auto [I, V] : enumerate(Vals)) {
9351 Type *ScalarTy = V->getType()->getScalarType();
9352 Constant *NewVal;
9353 if (isa<PoisonValue>(V))
9354 NewVal = PoisonValue::get(ScalarTy);
9355 else if (isa<UndefValue>(V))
9356 NewVal = UndefValue::get(ScalarTy);
9357 else
9358 NewVal = Constant::getNullValue(ScalarTy);
9359 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
9360 NewVal);
9361 }
9362 Vals.swap(NewVals);
9363 }
9364 return ConstantVector::get(Vals);
9365 }
9368 cast<FixedVectorType>(Root->getType())->getNumElements()),
9369 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
9370 }
9372 /// Finalize emission of the shuffles.
9374 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9375 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9376 IsFinalized = true;
9377 if (Action) {
9378 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9379 if (InVectors.size() == 2)
9380 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
9381 else
9382 Cost += createShuffle(Vec, nullptr, CommonMask);
9383 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9384 if (CommonMask[Idx] != PoisonMaskElem)
9385 CommonMask[Idx] = Idx;
9386 assert(VF > 0 &&
9387 "Expected vector length for the final value before action.");
9388 Value *V = Vec.get<Value *>();
9389 Action(V, CommonMask);
9390 InVectors.front() = V;
9391 }
9392 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
9393 if (CommonMask.empty()) {
9394 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9395 return Cost;
9396 }
9397 return Cost +
9398 createShuffle(InVectors.front(),
9399 InVectors.size() == 2 ? InVectors.back() : nullptr,
9400 CommonMask);
9401 }
9402
9404 assert((IsFinalized || CommonMask.empty()) &&
9405 "Shuffle construction must be finalized.");
9406 }
9407};
9408
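/// Returns the tree entry that represents operand \p Idx of entry \p E:
/// either the vectorized node whose user edge matches \p E, one of the
/// multi-node copies of the scalar, or the corresponding gather node.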
9409const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9410 unsigned Idx) const {
9411 Value *Op = E->getOperand(Idx).front();
9412 if (const TreeEntry *TE = getTreeEntry(Op)) {
9413 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9414 return EI.EdgeIdx == Idx && EI.UserTE == E;
9415 }) != TE->UserTreeIndices.end())
9416 return TE;
9417 auto MIt = MultiNodeScalars.find(Op);
9418 if (MIt != MultiNodeScalars.end()) {
9419 for (const TreeEntry *TE : MIt->second) {
9420 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9421 return EI.EdgeIdx == Idx && EI.UserTE == E;
9422 }) != TE->UserTreeIndices.end())
9423 return TE;
9424 }
9425 }
9426 }
9427 const auto *It =
9428 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9429 return TE->isGather() &&
9430 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9431 return EI.EdgeIdx == Idx && EI.UserTE == E;
9432 }) != TE->UserTreeIndices.end();
9433 });
9434 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9435 return It->get();
9436}
9437
9438 TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9439 if (TE.State == TreeEntry::ScatterVectorize ||
9440 TE.State == TreeEntry::StridedVectorize)
9441 return TTI::CastContextHint::GatherScatter;
9442 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9443 !TE.isAltShuffle()) {
9444 if (TE.ReorderIndices.empty())
9445 return TTI::CastContextHint::Normal;
9446 SmallVector<int> Mask;
9447 inversePermutation(TE.ReorderIndices, Mask);
9448 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9449 return TTI::CastContextHint::Reversed;
9450 }
9451 return TTI::CastContextHint::None;
9452 }
9453
9454 /// Builds the argument types vector for the given call instruction with the
9455/// given \p ID for the specified vector factor.
9457 const Intrinsic::ID ID,
9458 const unsigned VF,
9459 unsigned MinBW) {
9460 SmallVector<Type *> ArgTys;
9461 for (auto [Idx, Arg] : enumerate(CI->args())) {
9464 ArgTys.push_back(Arg->getType());
9465 continue;
9466 }
9467 if (MinBW > 0) {
9468 ArgTys.push_back(
9469 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9470 continue;
9471 }
9472 }
9473 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9474 }
9475 return ArgTys;
9476}
9477
9479BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9480 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9481 ArrayRef<Value *> VL = E->Scalars;
9482
9483 Type *ScalarTy = VL[0]->getType();
9484 if (!E->isGather()) {
9485 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9486 ScalarTy = SI->getValueOperand()->getType();
9487 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9488 ScalarTy = CI->getOperand(0)->getType();
9489 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9490 ScalarTy = IE->getOperand(1)->getType();
9491 }
9492 if (!isValidElementType(ScalarTy))
9493 return InstructionCost::getInvalid();
9494 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9495 
9496 // If we have computed a smaller type for the expression, update VecTy so
9497 // that the costs will be accurate.
9498 auto It = MinBWs.find(E);
9499 Type *OrigScalarTy = ScalarTy;
9500 if (It != MinBWs.end()) {
9501 auto VecTy = dyn_cast<FixedVectorType>(ScalarTy);
9502 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9503 if (VecTy)
9504 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
9505 }
9506 auto *VecTy = getWidenedType(ScalarTy, VL.size());
9507 unsigned EntryVF = E->getVectorFactor();
9508 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
9509
9510 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9511 if (E->isGather()) {
9512 if (allConstant(VL))
9513 return 0;
9514 if (isa<InsertElementInst>(VL[0]))
9515 return InstructionCost::getInvalid();
9516 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9517 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9518 }
9519 InstructionCost CommonCost = 0;
9520 SmallVector<int> Mask;
9521 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9522 if (!E->ReorderIndices.empty() &&
9523 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9524 SmallVector<int> NewMask;
9525 if (E->getOpcode() == Instruction::Store) {
9526 // For stores the order is actually a mask.
9527 NewMask.resize(E->ReorderIndices.size());
9528 copy(E->ReorderIndices, NewMask.begin());
9529 } else {
9530 inversePermutation(E->ReorderIndices, NewMask);
9531 }
9532 ::addMask(Mask, NewMask);
9533 }
9534 if (NeedToShuffleReuses)
9535 ::addMask(Mask, E->ReuseShuffleIndices);
9536 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9537 CommonCost =
9538 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9539 assert((E->State == TreeEntry::Vectorize ||
9540 E->State == TreeEntry::ScatterVectorize ||
9541 E->State == TreeEntry::StridedVectorize) &&
9542 "Unhandled state");
9543 assert(E->getOpcode() &&
9544 ((allSameType(VL) && allSameBlock(VL)) ||
9545 (E->getOpcode() == Instruction::GetElementPtr &&
9546 E->getMainOp()->getType()->isPointerTy())) &&
9547 "Invalid VL");
9548 Instruction *VL0 = E->getMainOp();
9549 unsigned ShuffleOrOp =
9550 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9551 if (E->CombinedOp != TreeEntry::NotCombinedOp)
9552 ShuffleOrOp = E->CombinedOp;
9553 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9554 const unsigned Sz = UniqueValues.size();
9555 SmallBitVector UsedScalars(Sz, false);
9556 for (unsigned I = 0; I < Sz; ++I) {
9557 if (getTreeEntry(UniqueValues[I]) == E)
9558 continue;
9559 UsedScalars.set(I);
9560 }
9561 auto GetCastContextHint = [&](Value *V) {
9562 if (const TreeEntry *OpTE = getTreeEntry(V))
9563 return getCastContextHint(*OpTE);
9564 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9565 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9566 return TTI::CastContextHint::GatherScatter;
9567 return TTI::CastContextHint::None;
9568 };
9569 auto GetCostDiff =
9570 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9571 function_ref<InstructionCost(InstructionCost)> VectorCost) {
9572 // Calculate the cost of this instruction.
9573 InstructionCost ScalarCost = 0;
9574 if (isa<CastInst, CallInst>(VL0)) {
9575 // For some of the instructions there is no need to calculate the cost for
9576 // each particular instruction: we can use the cost of a single instruction
9577 // multiplied by the total number of scalar instructions.
9578 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9579 } else {
9580 for (unsigned I = 0; I < Sz; ++I) {
9581 if (UsedScalars.test(I))
9582 continue;
9583 ScalarCost += ScalarEltCost(I);
9584 }
9585 }
9586
9587 InstructionCost VecCost = VectorCost(CommonCost);
9588 // Check if the current node must be resized, if the parent node is not
9589 // resized.
9590 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9591 const EdgeInfo &EI = E->UserTreeIndices.front();
9592 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9593 EI.EdgeIdx != 0) &&
9594 It != MinBWs.end()) {
9595 auto UserBWIt = MinBWs.find(EI.UserTE);
9596 Type *UserScalarTy =
9597 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9598 if (UserBWIt != MinBWs.end())
9599 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9600 UserBWIt->second.first);
9601 if (ScalarTy != UserScalarTy) {
9602 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9603 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9604 unsigned VecOpcode;
9605 auto *UserVecTy =
9606 getWidenedType(UserScalarTy, E->getVectorFactor());
9607 if (BWSz > SrcBWSz)
9608 VecOpcode = Instruction::Trunc;
9609 else
9610 VecOpcode =
9611 It->second.second ? Instruction::SExt : Instruction::ZExt;
9612 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9613 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9614 CostKind);
9615 }
9616 }
9617 }
9618 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9619 ScalarCost, "Calculated costs for Tree"));
9620 return VecCost - ScalarCost;
9621 };
9622 // Calculate cost difference from vectorizing set of GEPs.
9623 // Negative value means vectorizing is profitable.
9624 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9625 assert((E->State == TreeEntry::Vectorize ||
9626 E->State == TreeEntry::StridedVectorize) &&
9627 "Entry state expected to be Vectorize or StridedVectorize here.");
9628 InstructionCost ScalarCost = 0;
9629 InstructionCost VecCost = 0;
9630 std::tie(ScalarCost, VecCost) = getGEPCosts(
9631 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9632 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9633 "Calculated GEPs cost for Tree"));
9634
9635 return VecCost - ScalarCost;
9636 };
9637
9638 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
9639 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
9640 if (MinMaxID == Intrinsic::not_intrinsic)
9641 return InstructionCost::getInvalid();
9642 Type *CanonicalType = Ty;
9643 if (CanonicalType->isPtrOrPtrVectorTy())
9644 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9645 CanonicalType->getContext(),
9646 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9647
9648 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9649 {CanonicalType, CanonicalType});
9650 InstructionCost IntrinsicCost =
9651 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9652 // If the selects are the only uses of the compares, they will be
9653 // dead and we can adjust the cost by removing their cost.
9654 if (VI && SelectOnly) {
9655 assert(!Ty->isVectorTy() && "Expected only for scalar type.");
9656 auto *CI = cast<CmpInst>(VI->getOperand(0));
9657 IntrinsicCost -=
9658 TTI->getCmpSelInstrCost(CI->getOpcode(), Ty, Builder.getInt1Ty(),
9659 CI->getPredicate(), CostKind, CI);
9660 }
9661 return IntrinsicCost;
9662 };
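// Dispatch on the (possibly combined) opcode. Each case supplies a per-scalar
// cost callback and a vector cost callback to GetCostDiff, which returns the
// vector-minus-scalar cost difference for this node.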
9663 switch (ShuffleOrOp) {
9664 case Instruction::PHI: {
9665 // Count reused scalars.
9666 InstructionCost ScalarCost = 0;
9668 for (Value *V : UniqueValues) {
9669 auto *PHI = dyn_cast<PHINode>(V);
9670 if (!PHI)
9671 continue;
9672
9673 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9674 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9675 Value *Op = PHI->getIncomingValue(I);
9676 Operands[I] = Op;
9677 }
9678 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9679 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9680 if (!OpTE->ReuseShuffleIndices.empty())
9681 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9682 OpTE->Scalars.size());
9683 }
9684
9685 return CommonCost - ScalarCost;
9686 }
9687 case Instruction::ExtractValue:
9688 case Instruction::ExtractElement: {
9689 auto GetScalarCost = [&](unsigned Idx) {
9690 auto *I = cast<Instruction>(UniqueValues[Idx]);
9691 VectorType *SrcVecTy;
9692 if (ShuffleOrOp == Instruction::ExtractElement) {
9693 auto *EE = cast<ExtractElementInst>(I);
9694 SrcVecTy = EE->getVectorOperandType();
9695 } else {
9696 auto *EV = cast<ExtractValueInst>(I);
9697 Type *AggregateTy = EV->getAggregateOperand()->getType();
9698 unsigned NumElts;
9699 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9700 NumElts = ATy->getNumElements();
9701 else
9702 NumElts = AggregateTy->getStructNumElements();
9703 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
9704 }
9705 if (I->hasOneUse()) {
9706 Instruction *Ext = I->user_back();
9707 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9708 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9709 // Use getExtractWithExtendCost() to calculate the cost of
9710 // extractelement/ext pair.
9711 InstructionCost Cost = TTI->getExtractWithExtendCost(
9712 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9713 // Subtract the cost of s|zext which is subtracted separately.
9714 Cost -= TTI->getCastInstrCost(
9715 Ext->getOpcode(), Ext->getType(), I->getType(),
9716 TTI::getCastContextHint(Ext), CostKind, Ext);
9717 return Cost;
9718 }
9719 }
9720 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9721 CostKind, *getExtractIndex(I));
9722 };
9723 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9724 return GetCostDiff(GetScalarCost, GetVectorCost);
9725 }
9726 case Instruction::InsertElement: {
9727 assert(E->ReuseShuffleIndices.empty() &&
9728 "Unique insertelements only are expected.");
9729 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9730 unsigned const NumElts = SrcVecTy->getNumElements();
9731 unsigned const NumScalars = VL.size();
9732
9733 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9734
9735 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9736 unsigned OffsetBeg = *getElementIndex(VL.front());
9737 unsigned OffsetEnd = OffsetBeg;
9738 InsertMask[OffsetBeg] = 0;
9739 for (auto [I, V] : enumerate(VL.drop_front())) {
9740 unsigned Idx = *getElementIndex(V);
9741 if (OffsetBeg > Idx)
9742 OffsetBeg = Idx;
9743 else if (OffsetEnd < Idx)
9744 OffsetEnd = Idx;
9745 InsertMask[Idx] = I + 1;
9746 }
9747 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9748 if (NumOfParts > 0)
9749 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9750 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9751 VecScalarsSz;
9752 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9753 unsigned InsertVecSz = std::min<unsigned>(
9754 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9755 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9756 bool IsWholeSubvector =
9757 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9758 // Check if we can safely insert a subvector. If it is not possible, just
9759 // generate a whole-sized vector and shuffle the source vector and the new
9760 // subvector.
9761 if (OffsetBeg + InsertVecSz > VecSz) {
9762 // Align OffsetBeg to generate correct mask.
9763 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9764 InsertVecSz = VecSz;
9765 }
9766
9767 APInt DemandedElts = APInt::getZero(NumElts);
9768 // TODO: Add support for Instruction::InsertValue.
9770 if (!E->ReorderIndices.empty()) {
9771 inversePermutation(E->ReorderIndices, Mask);
9772 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9773 } else {
9774 Mask.assign(VecSz, PoisonMaskElem);
9775 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9776 }
9777 bool IsIdentity = true;
9778 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9779 Mask.swap(PrevMask);
9780 for (unsigned I = 0; I < NumScalars; ++I) {
9781 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
9782 DemandedElts.setBit(InsertIdx);
9783 IsIdentity &= InsertIdx - OffsetBeg == I;
9784 Mask[InsertIdx - OffsetBeg] = I;
9785 }
9786 assert(Offset < NumElts && "Failed to find vector index offset");
9787
9789 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9790 /*Insert*/ true, /*Extract*/ false,
9791 CostKind);
9792
9793 // First cost - resize to actual vector size if not identity shuffle or
9794 // need to shift the vector.
9795 // Do not calculate the cost if the actual size is the register size and
9796 // we can merge this shuffle with the following SK_Select.
9797 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
9798 if (!IsIdentity)
9800 InsertVecTy, Mask);
9801 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9802 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9803 }));
9804 // Second cost - permutation with subvector, if some elements are from the
9805 // initial vector or inserting a subvector.
9806 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9807 // subvector of ActualVecTy.
9808 SmallBitVector InMask =
9809 isUndefVector(FirstInsert->getOperand(0),
9810 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9811 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9812 if (InsertVecSz != VecSz) {
9813 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
9815 std::nullopt, CostKind, OffsetBeg - Offset,
9816 InsertVecTy);
9817 } else {
9818 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9819 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9820 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9821 I <= End; ++I)
9822 if (Mask[I] != PoisonMaskElem)
9823 Mask[I] = I + VecSz;
9824 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9825 Mask[I] =
9826 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9827 Cost +=
9828 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9829 }
9830 }
9831 return Cost;
9832 }
9833 case Instruction::ZExt:
9834 case Instruction::SExt:
9835 case Instruction::FPToUI:
9836 case Instruction::FPToSI:
9837 case Instruction::FPExt:
9838 case Instruction::PtrToInt:
9839 case Instruction::IntToPtr:
9840 case Instruction::SIToFP:
9841 case Instruction::UIToFP:
9842 case Instruction::Trunc:
9843 case Instruction::FPTrunc:
9844 case Instruction::BitCast: {
9845 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9846 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9847 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9848 unsigned Opcode = ShuffleOrOp;
9849 unsigned VecOpcode = Opcode;
9850 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
9851 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9852 // Check if the values are candidates to demote.
9853 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
9854 if (SrcIt != MinBWs.end()) {
9855 SrcBWSz = SrcIt->second.first;
9856 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
9857 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9858 SrcVecTy =
9859 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
9860 }
9861 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
9862 if (BWSz == SrcBWSz) {
9863 VecOpcode = Instruction::BitCast;
9864 } else if (BWSz < SrcBWSz) {
9865 VecOpcode = Instruction::Trunc;
9866 } else if (It != MinBWs.end()) {
9867 assert(BWSz > SrcBWSz && "Invalid cast!");
9868 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9869 } else if (SrcIt != MinBWs.end()) {
9870 assert(BWSz > SrcBWSz && "Invalid cast!");
9871 VecOpcode =
9872 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9873 }
9874 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9875 !SrcIt->second.second) {
9876 VecOpcode = Instruction::UIToFP;
9877 }
9878 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9879 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9880 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9881 VL0->getOperand(0)->getType(),
9883 };
9884 auto GetVectorCost = [=](InstructionCost CommonCost) {
9885 // Do not count cost here if minimum bitwidth is in effect and it is just
9886 // a bitcast (here it is just a noop).
9887 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9888 return CommonCost;
9889 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9890 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9891 return CommonCost +
9892 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9893 VecOpcode == Opcode ? VI : nullptr);
9894 };
9895 return GetCostDiff(GetScalarCost, GetVectorCost);
9896 }
9897 case Instruction::FCmp:
9898 case Instruction::ICmp:
9899 case Instruction::Select: {
9900 CmpInst::Predicate VecPred, SwappedVecPred;
9901 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9902 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9903 match(VL0, MatchCmp))
9904 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9905 else
9906 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9909 auto GetScalarCost = [&](unsigned Idx) {
9910 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9911 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9914 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9915 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9916 !match(VI, MatchCmp)) ||
9917 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9918 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9921
9923 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
9924 CostKind, VI);
9925 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
9926 if (IntrinsicCost.isValid())
9927 ScalarCost = IntrinsicCost;
9928
9929 return ScalarCost;
9930 };
9931 auto GetVectorCost = [&](InstructionCost CommonCost) {
9932 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9933
9935 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9936 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
9937 auto *CondType =
9938 getWidenedType(SI->getCondition()->getType(), VL.size());
9939 unsigned CondNumElements = CondType->getNumElements();
9940 unsigned VecTyNumElements = getNumElements(VecTy);
9941 assert(VecTyNumElements >= CondNumElements &&
9942 VecTyNumElements % CondNumElements == 0 &&
9943 "Cannot vectorize Instruction::Select");
9944 if (CondNumElements != VecTyNumElements) {
9945 // When the return type is i1 but the source is fixed vector type, we
9946 // need to duplicate the condition value.
9947 VecCost += ::getShuffleCost(
9948 *TTI, TTI::SK_PermuteSingleSrc, CondType,
9949 createReplicatedMask(VecTyNumElements / CondNumElements,
9950 CondNumElements));
9951 }
9952 }
9953 return VecCost + CommonCost;
9954 };
9955 return GetCostDiff(GetScalarCost, GetVectorCost);
9956 }
9957 case TreeEntry::MinMax: {
9958 auto GetScalarCost = [&](unsigned Idx) {
9959 return GetMinMaxCost(OrigScalarTy);
9960 };
9961 auto GetVectorCost = [&](InstructionCost CommonCost) {
9962 InstructionCost VecCost = GetMinMaxCost(VecTy);
9963 return VecCost + CommonCost;
9964 };
9965 return GetCostDiff(GetScalarCost, GetVectorCost);
9966 }
9967 case Instruction::FNeg:
9968 case Instruction::Add:
9969 case Instruction::FAdd:
9970 case Instruction::Sub:
9971 case Instruction::FSub:
9972 case Instruction::Mul:
9973 case Instruction::FMul:
9974 case Instruction::UDiv:
9975 case Instruction::SDiv:
9976 case Instruction::FDiv:
9977 case Instruction::URem:
9978 case Instruction::SRem:
9979 case Instruction::FRem:
9980 case Instruction::Shl:
9981 case Instruction::LShr:
9982 case Instruction::AShr:
9983 case Instruction::And:
9984 case Instruction::Or:
9985 case Instruction::Xor: {
9986 auto GetScalarCost = [&](unsigned Idx) {
9987 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9988 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9989 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9990 TTI::OperandValueInfo Op2Info =
9991 TTI::getOperandInfo(VI->getOperand(OpIdx));
9992 SmallVector<const Value *> Operands(VI->operand_values());
9993 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9994 Op1Info, Op2Info, Operands, VI);
9995 };
9996 auto GetVectorCost = [=](InstructionCost CommonCost) {
9997 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9998 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9999 ArrayRef<Value *> Ops = E->getOperand(I);
10000 if (all_of(Ops, [&](Value *Op) {
10001 auto *CI = dyn_cast<ConstantInt>(Op);
10002 return CI && CI->getValue().countr_one() >= It->second.first;
10003 }))
10004 return CommonCost;
10005 }
10006 }
10007 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
10008 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
10009 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
10010 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
10011 Op2Info, std::nullopt, nullptr, TLI) +
10012 CommonCost;
10013 };
10014 return GetCostDiff(GetScalarCost, GetVectorCost);
10015 }
10016 case Instruction::GetElementPtr: {
10017 return CommonCost + GetGEPCostDiff(VL, VL0);
10018 }
10019 case Instruction::Load: {
10020 auto GetScalarCost = [&](unsigned Idx) {
10021 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
10022 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
10023 VI->getAlign(), VI->getPointerAddressSpace(),
10025 };
10026 auto *LI0 = cast<LoadInst>(VL0);
10027 auto GetVectorCost = [&](InstructionCost CommonCost) {
10028 InstructionCost VecLdCost;
10029 if (E->State == TreeEntry::Vectorize) {
10030 VecLdCost = TTI->getMemoryOpCost(
10031 Instruction::Load, VecTy, LI0->getAlign(),
10032 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
10033 } else if (E->State == TreeEntry::StridedVectorize) {
10034 Align CommonAlignment =
10035 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
10036 VecLdCost = TTI->getStridedMemoryOpCost(
10037 Instruction::Load, VecTy, LI0->getPointerOperand(),
10038 /*VariableMask=*/false, CommonAlignment, CostKind);
10039 } else {
10040 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
10041 Align CommonAlignment =
10042 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
10043 VecLdCost = TTI->getGatherScatterOpCost(
10044 Instruction::Load, VecTy, LI0->getPointerOperand(),
10045 /*VariableMask=*/false, CommonAlignment, CostKind);
10046 }
10047 return VecLdCost + CommonCost;
10048 };
10049
10050 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
10051 // If this node generates a masked gather load then it is not a terminal
10052 // node; hence the address operand cost is estimated separately.
10053 if (E->State == TreeEntry::ScatterVectorize)
10054 return Cost;
10055
10056 // Estimate cost of GEPs since this tree node is a terminator.
10057 SmallVector<Value *> PointerOps(VL.size());
10058 for (auto [I, V] : enumerate(VL))
10059 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
10060 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
10061 }
10062 case Instruction::Store: {
10063 bool IsReorder = !E->ReorderIndices.empty();
10064 auto GetScalarCost = [=](unsigned Idx) {
10065 auto *VI = cast<StoreInst>(VL[Idx]);
10066 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
10067 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
10068 VI->getAlign(), VI->getPointerAddressSpace(),
10069 CostKind, OpInfo, VI);
10070 };
10071 auto *BaseSI =
10072 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
10073 auto GetVectorCost = [=](InstructionCost CommonCost) {
10074 // We know that we can merge the stores. Calculate the cost.
10075 InstructionCost VecStCost;
10076 if (E->State == TreeEntry::StridedVectorize) {
10077 Align CommonAlignment =
10078 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
10079 VecStCost = TTI->getStridedMemoryOpCost(
10080 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10081 /*VariableMask=*/false, CommonAlignment, CostKind);
10082 } else {
10083 assert(E->State == TreeEntry::Vectorize &&
10084 "Expected either strided or consecutive stores.");
10085 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
10086 VecStCost = TTI->getMemoryOpCost(
10087 Instruction::Store, VecTy, BaseSI->getAlign(),
10088 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
10089 }
10090 return VecStCost + CommonCost;
10091 };
10092 SmallVector<Value *> PointerOps(VL.size());
10093 for (auto [I, V] : enumerate(VL)) {
10094 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
10095 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
10096 }
10097
10098 return GetCostDiff(GetScalarCost, GetVectorCost) +
10099 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
10100 }
10101 case Instruction::Call: {
10102 auto GetScalarCost = [&](unsigned Idx) {
10103 auto *CI = cast<CallInst>(UniqueValues[Idx]);
10106 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
10107 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
10108 }
10111 CI->getFunctionType()->params(), CostKind);
10112 };
10113 auto GetVectorCost = [=](InstructionCost CommonCost) {
10114 auto *CI = cast<CallInst>(VL0);
10116 SmallVector<Type *> ArgTys =
10118 It != MinBWs.end() ? It->second.first : 0);
10119 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10120 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
10121 };
10122 return GetCostDiff(GetScalarCost, GetVectorCost);
10123 }
10124 case Instruction::ShuffleVector: {
10125 if (!SLPReVec || E->isAltShuffle())
10126 assert(E->isAltShuffle() &&
10127 ((Instruction::isBinaryOp(E->getOpcode()) &&
10128 Instruction::isBinaryOp(E->getAltOpcode())) ||
10129 (Instruction::isCast(E->getOpcode()) &&
10130 Instruction::isCast(E->getAltOpcode())) ||
10131 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
10132 "Invalid Shuffle Vector Operand");
10133 // Try to find the previous shuffle node with the same operands and same
10134 // main/alternate ops.
10135 auto TryFindNodeWithEqualOperands = [=]() {
10136 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10137 if (TE.get() == E)
10138 break;
10139 if (TE->isAltShuffle() &&
10140 ((TE->getOpcode() == E->getOpcode() &&
10141 TE->getAltOpcode() == E->getAltOpcode()) ||
10142 (TE->getOpcode() == E->getAltOpcode() &&
10143 TE->getAltOpcode() == E->getOpcode())) &&
10144 TE->hasEqualOperands(*E))
10145 return true;
10146 }
10147 return false;
10148 };
10149 auto GetScalarCost = [&](unsigned Idx) {
10150 auto *VI = cast<Instruction>(UniqueValues[Idx]);
10151 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
10152 (void)E;
10153 return TTI->getInstructionCost(VI, CostKind);
10154 };
10155 // Need to clear CommonCost since the final shuffle cost is included into
10156 // vector cost.
10157 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
10158 // VecCost is equal to the sum of the cost of creating 2 vectors
10159 // and the cost of creating the shuffle.
10160 InstructionCost VecCost = 0;
10161 if (TryFindNodeWithEqualOperands()) {
10162 LLVM_DEBUG({
10163 dbgs() << "SLP: diamond match for alternate node found.\n";
10164 E->dump();
10165 });
10166 // No need to add new vector costs here since we're going to reuse
10167 // same main/alternate vector ops, just do different shuffling.
10168 } else if (Instruction::isBinaryOp(E->getOpcode())) {
10169 VecCost =
10170 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
10171 VecCost +=
10172 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
10173 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
10174 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
10175 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
10176 CI0->getPredicate(), CostKind, VL0);
10177 VecCost += TTIRef.getCmpSelInstrCost(
10178 E->getOpcode(), VecTy, MaskTy,
10179 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
10180 E->getAltOp());
10181 } else {
10182 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
10183 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
10184 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
10185 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
10186 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
10187 unsigned SrcBWSz =
10188 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
10189 if (SrcIt != MinBWs.end()) {
10190 SrcBWSz = SrcIt->second.first;
10191 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
10192 SrcTy = getWidenedType(SrcSclTy, VL.size());
10193 }
10194 if (BWSz <= SrcBWSz) {
10195 if (BWSz < SrcBWSz)
10196 VecCost =
10197 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
10199 LLVM_DEBUG({
10200 dbgs()
10201 << "SLP: alternate extension, which should be truncated.\n";
10202 E->dump();
10203 });
10204 return VecCost;
10205 }
10206 }
10207 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
10209 VecCost +=
10210 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
10212 }
10214 E->buildAltOpShuffleMask(
10215 [&](Instruction *I) {
10216 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
10217 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
10218 *TLI);
10219 },
10220 Mask);
10222 FinalVecTy, Mask, CostKind);
10223 // Patterns like [fadd,fsub] can be combined into a single instruction
10224 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
10225 // need to take into account their order when looking for the most used
10226 // order.
10227 unsigned Opcode0 = E->getOpcode();
10228 unsigned Opcode1 = E->getAltOpcode();
10229 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
10230 // If this pattern is supported by the target then we consider the
10231 // order.
10232 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
10233 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
10234 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
10235 return AltVecCost < VecCost ? AltVecCost : VecCost;
10236 }
10237 // TODO: Check the reverse order too.
10238 return VecCost;
10239 };
10240 if (SLPReVec && !E->isAltShuffle())
10241 return GetCostDiff(GetScalarCost, [](InstructionCost) {
10242 // shufflevector will be eliminated by instcombine because the
10243 // shufflevector masks are used in order (guaranteed by
10244 // getShufflevectorNumGroups). The vector cost is 0.
10245 return TTI::TCC_Free;
10246 });
10247 return GetCostDiff(GetScalarCost, GetVectorCost);
10248 }
10249 case Instruction::Freeze:
10250 return CommonCost;
10251 default:
10252 llvm_unreachable("Unknown instruction");
10253 }
10254}
10255
10256bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
10257 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10258 << VectorizableTree.size() << " is fully vectorizable.\n");
10259
10260 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10262 return TE->isGather() &&
10263 !any_of(TE->Scalars,
10264 [this](Value *V) { return EphValues.contains(V); }) &&
10265 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10266 TE->Scalars.size() < Limit ||
10267 ((TE->getOpcode() == Instruction::ExtractElement ||
10268 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10269 isFixedVectorShuffle(TE->Scalars, Mask)) ||
10270 (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10271 !TE->isAltShuffle()));
10272 };
10273
10274 // We only handle trees of heights 1 and 2.
10275 if (VectorizableTree.size() == 1 &&
10276 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10277 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
10278 (ForReduction &&
10279 AreVectorizableGathers(VectorizableTree[0].get(),
10280 VectorizableTree[0]->Scalars.size()) &&
10281 VectorizableTree[0]->getVectorFactor() > 2)))
10282 return true;
10283
10284 if (VectorizableTree.size() != 2)
10285 return false;
10286
10287 // Handle splat and all-constants stores. Also try to vectorize tiny trees
10288 // with the second gather nodes if they have fewer scalar operands than
10289 // the initial tree element (it may be profitable to shuffle the second gather)
10290 // or they are extractelements, which form a shuffle.
10292 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10293 AreVectorizableGathers(VectorizableTree[1].get(),
10294 VectorizableTree[0]->Scalars.size()))
10295 return true;
10296
10297 // Gathering cost would be too much for tiny trees.
10298 if (VectorizableTree[0]->isGather() ||
10299 (VectorizableTree[1]->isGather() &&
10300 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10301 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10302 return false;
10303
10304 return true;
10305}
10306
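/// Checks whether \p Root is the head of an or/shl-of-zext-of-load chain of
/// \p NumElts elements whose total width is a legal integer type, i.e. a
/// pattern that the backend can likely fold via load combining.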
10307static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10309 bool MustMatchOrInst) {
10310 // Look past the root to find a source value. Arbitrarily follow the
10311 // path through operand 0 of any 'or'. Also, peek through optional
10312 // shift-left-by-multiple-of-8-bits.
10313 Value *ZextLoad = Root;
10314 const APInt *ShAmtC;
10315 bool FoundOr = false;
10316 while (!isa<ConstantExpr>(ZextLoad) &&
10317 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10318 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10319 ShAmtC->urem(8) == 0))) {
10320 auto *BinOp = cast<BinaryOperator>(ZextLoad);
10321 ZextLoad = BinOp->getOperand(0);
10322 if (BinOp->getOpcode() == Instruction::Or)
10323 FoundOr = true;
10324 }
10325 // Check if the input is an extended load of the required or/shift expression.
10326 Value *Load;
10327 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10328 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10329 return false;
10330
10331 // Require that the total load bit width is a legal integer type.
10332 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10333 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10334 Type *SrcTy = Load->getType();
10335 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10336 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
10337 return false;
10338
10339 // Everything matched - assume that we can fold the whole sequence using
10340 // load combining.
10341 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10342 << *(cast<Instruction>(Root)) << "\n");
10343
10344 return true;
10345}
10346
10347 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10348 if (RdxKind != RecurKind::Or)
10349 return false;
10350
10351 unsigned NumElts = VectorizableTree[0]->Scalars.size();
10352 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10353 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
10354 /* MatchOr */ false);
10355}
10356
10357 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10358 // Peek through a final sequence of stores and check if all operations are
10359 // likely to be load-combined.
10360 unsigned NumElts = Stores.size();
10361 for (Value *Scalar : Stores) {
10362 Value *X;
10363 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
10364 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
10365 return false;
10366 }
10367 return true;
10368}
10369
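/// Returns true if the tree should not be vectorized: e.g. it only inserts
/// gathered values, consists of PHI nodes and gathers, or is smaller than
/// MinTreeSize and cannot be proven fully vectorizable.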
10370bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10371 // No need to vectorize inserts of gathered values.
10372 if (VectorizableTree.size() == 2 &&
10373 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
10374 VectorizableTree[1]->isGather() &&
10375 (VectorizableTree[1]->getVectorFactor() <= 2 ||
10376 !(isSplat(VectorizableTree[1]->Scalars) ||
10377 allConstant(VectorizableTree[1]->Scalars))))
10378 return true;
10379
10380 // If the graph includes only PHI nodes and gathers, it is definitely not
10381 // profitable for the vectorization; we can skip it if the cost threshold is
10382 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
10383 // gathers/buildvectors.
10384 constexpr int Limit = 4;
10385 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10386 !VectorizableTree.empty() &&
10387 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10388 return (TE->isGather() &&
10389 TE->getOpcode() != Instruction::ExtractElement &&
10390 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
10391 TE->getOpcode() == Instruction::PHI;
10392 }))
10393 return true;
10394
10395 // We can vectorize the tree if its size is greater than or equal to the
10396 // minimum size specified by the MinTreeSize command line option.
10397 if (VectorizableTree.size() >= MinTreeSize)
10398 return false;
10399
10400 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10401 // can vectorize it if we can prove it fully vectorizable.
10402 if (isFullyVectorizableTinyTree(ForReduction))
10403 return false;
10404
10405 // Check if any of the gather nodes forms an insertelement buildvector
10406 // somewhere.
10407 bool IsAllowedSingleBVNode =
10408 VectorizableTree.size() > 1 ||
10409 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10410 !VectorizableTree.front()->isAltShuffle() &&
10411 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10412 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10413 allSameBlock(VectorizableTree.front()->Scalars));
10414 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10415 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
10416 return isa<ExtractElementInst, UndefValue>(V) ||
10417 (IsAllowedSingleBVNode &&
10418 !V->hasNUsesOrMore(UsesLimit) &&
10419 any_of(V->users(), IsaPred<InsertElementInst>));
10420 });
10421 }))
10422 return false;
10423
10424 assert(VectorizableTree.empty()
10425 ? ExternalUses.empty()
10426 : true && "We shouldn't have any external users");
10427
10428 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10429 // vectorizable.
10430 return true;
10431}
10432
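/// Estimates the cost of spills: values that are live across a call between
/// two vectorized tree nodes may have to be spilled and reloaded around it.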
10433 InstructionCost BoUpSLP::getSpillCost() const {
10434 // Walk from the bottom of the tree to the top, tracking which values are
10435 // live. When we see a call instruction that is not part of our tree,
10436 // query TTI to see if there is a cost to keeping values live over it
10437 // (for example, if spills and fills are required).
10438 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10440
10442 Instruction *PrevInst = nullptr;
10443
10444 // The entries in VectorizableTree are not necessarily ordered by their
10445 // position in basic blocks. Collect them and order them by dominance so later
10446 // instructions are guaranteed to be visited first. For instructions in
10447 // different basic blocks, we only scan to the beginning of the block, so
10448 // their order does not matter, as long as all instructions in a basic block
10449 // are grouped together. Using dominance ensures a deterministic order.
10450 SmallVector<Instruction *, 16> OrderedScalars;
10451 for (const auto &TEPtr : VectorizableTree) {
10452 if (TEPtr->State != TreeEntry::Vectorize)
10453 continue;
10454 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10455 if (!Inst)
10456 continue;
10457 OrderedScalars.push_back(Inst);
10458 }
10459 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10460 auto *NodeA = DT->getNode(A->getParent());
10461 auto *NodeB = DT->getNode(B->getParent());
10462 assert(NodeA && "Should only process reachable instructions");
10463 assert(NodeB && "Should only process reachable instructions");
10464 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10465 "Different nodes should have different DFS numbers");
10466 if (NodeA != NodeB)
10467 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10468 return B->comesBefore(A);
10469 });
10470
10471 for (Instruction *Inst : OrderedScalars) {
10472 if (!PrevInst) {
10473 PrevInst = Inst;
10474 continue;
10475 }
10476
10477 // Update LiveValues.
10478 LiveValues.erase(PrevInst);
10479 for (auto &J : PrevInst->operands()) {
10480 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10481 LiveValues.insert(cast<Instruction>(&*J));
10482 }
10483
10484 LLVM_DEBUG({
10485 dbgs() << "SLP: #LV: " << LiveValues.size();
10486 for (auto *X : LiveValues)
10487 dbgs() << " " << X->getName();
10488 dbgs() << ", Looking at ";
10489 Inst->dump();
10490 });
10491
10492 // Now find the sequence of instructions between PrevInst and Inst.
10493 unsigned NumCalls = 0;
10494 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10495 PrevInstIt =
10496 PrevInst->getIterator().getReverse();
10497 while (InstIt != PrevInstIt) {
10498 if (PrevInstIt == PrevInst->getParent()->rend()) {
10499 PrevInstIt = Inst->getParent()->rbegin();
10500 continue;
10501 }
10502
10503 auto NoCallIntrinsic = [this](Instruction *I) {
10504 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10505 if (II->isAssumeLikeIntrinsic())
10506 return true;
10507 FastMathFlags FMF;
10509 for (auto &ArgOp : II->args())
10510 Tys.push_back(ArgOp->getType());
10511 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10512 FMF = FPMO->getFastMathFlags();
10513 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10514 FMF);
10515 InstructionCost IntrCost =
10516 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10517 InstructionCost CallCost = TTI->getCallInstrCost(
10518 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10519 if (IntrCost < CallCost)
10520 return true;
10521 }
10522 return false;
10523 };
10524
10525 // Debug information does not impact spill cost.
10526 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10527 &*PrevInstIt != PrevInst)
10528 NumCalls++;
10529
10530 ++PrevInstIt;
10531 }
10532
10533 if (NumCalls) {
10534 SmallVector<Type *, 4> V;
10535 for (auto *II : LiveValues) {
10536 auto *ScalarTy = II->getType();
10537 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10538 ScalarTy = VectorTy->getElementType();
10539 V.push_back(getWidenedType(ScalarTy, BundleWidth));
10540 }
10541 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10542 }
10543
10544 PrevInst = Inst;
10545 }
10546
10547 return Cost;
10548}
10549
10550/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10551/// the buildvector sequence.
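/// For example (a hypothetical chain, names are illustrative only):
/// \code
///   %ie1 = insertelement <2 x float> poison, float %a, i32 0
///   %ie2 = insertelement <2 x float> %ie1, float %b, i32 1
/// \endcode
/// Here %ie1 is followed by %ie2 in the buildvector sequence, so the function
/// returns true for (IE1 = %ie1, IE2 = %ie2) and false for the swapped
/// arguments.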
10552 static bool isFirstInsertElement(const InsertElementInst *IE1,
10553 const InsertElementInst *IE2) {
10554 if (IE1 == IE2)
10555 return false;
10556 const auto *I1 = IE1;
10557 const auto *I2 = IE2;
10558 const InsertElementInst *PrevI1;
10559 const InsertElementInst *PrevI2;
10560 unsigned Idx1 = *getElementIndex(IE1);
10561 unsigned Idx2 = *getElementIndex(IE2);
10562 do {
10563 if (I2 == IE1)
10564 return true;
10565 if (I1 == IE2)
10566 return false;
10567 PrevI1 = I1;
10568 PrevI2 = I2;
10569 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10570 getElementIndex(I1).value_or(Idx2) != Idx2)
10571 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10572 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10573 getElementIndex(I2).value_or(Idx1) != Idx1)
10574 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10575 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10576 llvm_unreachable("Two different buildvectors not expected.");
10577}
10578
10579namespace {
10580/// Returns incoming Value *, if the requested type is Value * too, or a default
10581/// value, otherwise.
10582struct ValueSelect {
10583 template <typename U>
10584 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10585 return V;
10586 }
10587 template <typename U>
10588 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10589 return U();
10590 }
10591};
10592} // namespace
10593
10594/// Does the analysis of the provided shuffle masks and performs the requested
10595/// actions on the vectors with the given shuffle masks. It tries to do it in
10596/// several steps.
10597/// 1. If the Base vector is not an undef vector, resize the very first mask to
10598/// have a common VF and perform the action for 2 input vectors (including the
10599/// non-undef Base). The other shuffle masks are combined with the result of
10600/// the first stage and processed as a shuffle of 2 vectors.
10601/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10602/// the action only for 1 vector with the given mask, if it is not the identity
10603/// mask.
10604/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10605/// vectors, combining the masks properly between the steps.
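/// An illustrative example (not tied to a particular input): with an undef
/// Base, two input vectors of VF 2 and the destination-sized masks
/// {0, 1, poison, poison} and {poison, poison, 0, 1}, the second mask is
/// folded into the first with its elements offset by the first vector's VF,
/// and Action is invoked once with the two-source mask {0, 1, 2, 3}.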
10606template <typename T>
10607 static T *performExtractsShuffleAction(
10608 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10609 function_ref<unsigned(T *)> GetVF,
10610 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10611 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10612 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10613 SmallVector<int> Mask(ShuffleMask.begin()->second);
10614 auto VMIt = std::next(ShuffleMask.begin());
10615 T *Prev = nullptr;
10616 SmallBitVector UseMask =
10617 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10618 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10619 if (!IsBaseUndef.all()) {
10620 // Base is not undef, need to combine it with the next subvectors.
10621 std::pair<T *, bool> Res =
10622 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10623 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10624 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10625 if (Mask[Idx] == PoisonMaskElem)
10626 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10627 else
10628 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10629 }
10630 auto *V = ValueSelect::get<T *>(Base);
10631 (void)V;
10632 assert((!V || GetVF(V) == Mask.size()) &&
10633 "Expected base vector of VF number of elements.");
10634 Prev = Action(Mask, {nullptr, Res.first});
10635 } else if (ShuffleMask.size() == 1) {
10636 // Base is undef and only 1 vector is shuffled - perform the action only for
10637 // single vector, if the mask is not the identity mask.
10638 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10639 /*ForSingleMask=*/true);
10640 if (Res.second)
10641 // Identity mask is found.
10642 Prev = Res.first;
10643 else
10644 Prev = Action(Mask, {ShuffleMask.begin()->first});
10645 } else {
10646 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10647 // shuffles step by step, combining shuffle between the steps.
10648 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10649 unsigned Vec2VF = GetVF(VMIt->first);
10650 if (Vec1VF == Vec2VF) {
10651 // No need to resize the input vectors since they are of the same size, we
10652 // can shuffle them directly.
10653 ArrayRef<int> SecMask = VMIt->second;
10654 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10655 if (SecMask[I] != PoisonMaskElem) {
10656 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10657 Mask[I] = SecMask[I] + Vec1VF;
10658 }
10659 }
10660 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10661 } else {
10662 // Vectors of different sizes - resize and reshuffle.
10663 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10664 /*ForSingleMask=*/false);
10665 std::pair<T *, bool> Res2 =
10666 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10667 ArrayRef<int> SecMask = VMIt->second;
10668 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10669 if (Mask[I] != PoisonMaskElem) {
10670 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10671 if (Res1.second)
10672 Mask[I] = I;
10673 } else if (SecMask[I] != PoisonMaskElem) {
10674 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10675 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10676 }
10677 }
10678 Prev = Action(Mask, {Res1.first, Res2.first});
10679 }
10680 VMIt = std::next(VMIt);
10681 }
10682 bool IsBaseNotUndef = !IsBaseUndef.all();
10683 (void)IsBaseNotUndef;
10684 // Perform requested actions for the remaining masks/vectors.
10685 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10686 // Shuffle other input vectors, if any.
10687 std::pair<T *, bool> Res =
10688 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10689 ArrayRef<int> SecMask = VMIt->second;
10690 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10691 if (SecMask[I] != PoisonMaskElem) {
10692 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10693 "Multiple uses of scalars.");
10694 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10695 } else if (Mask[I] != PoisonMaskElem) {
10696 Mask[I] = I;
10697 }
10698 }
10699 Prev = Action(Mask, {Prev, Res.first});
10700 }
10701 return Prev;
10702}
10703
10704 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10705 InstructionCost Cost = 0;
10706 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10707 << VectorizableTree.size() << ".\n");
10708
10709 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10710
10711 SmallPtrSet<Value *, 4> CheckedExtracts;
10712 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10713 TreeEntry &TE = *VectorizableTree[I];
10714 // No need to count the cost for combined entries: they are combined into
10715 // other nodes, so just skip their cost.
10716 if (TE.State == TreeEntry::CombinedVectorize) {
10717 LLVM_DEBUG(
10718 dbgs() << "SLP: Skipping cost for combined node that starts with "
10719 << *TE.Scalars[0] << ".\n";
10720 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10721 continue;
10722 }
10723 if (TE.isGather()) {
10724 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10725 E && E->getVectorFactor() == TE.getVectorFactor() &&
10726 E->isSame(TE.Scalars)) {
10727 // Some gather nodes might be absolutely the same as some vectorizable
10728 // nodes after reordering, need to handle it.
10729 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10730 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
10731 << "SLP: Current total cost = " << Cost << "\n");
10732 continue;
10733 }
10734 }
10735
10736 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10737 Cost += C;
10738 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10739 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
10740 << "SLP: Current total cost = " << Cost << "\n");
10741 }
10742
10743 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10744 InstructionCost ExtractCost = 0;
10745 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10746 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10747 SmallVector<APInt> DemandedElts;
10748 SmallDenseSet<Value *, 4> UsedInserts;
10749 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10750 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10751 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
10752 for (ExternalUser &EU : ExternalUses) {
10753 // We only add extract cost once for the same scalar.
10754 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10755 !ExtractCostCalculated.insert(EU.Scalar).second)
10756 continue;
10757
10758 // Uses by ephemeral values are free (because the ephemeral value will be
10759 // removed prior to code generation, and so the extraction will be
10760 // removed as well).
10761 if (EphValues.count(EU.User))
10762 continue;
10763
10764 // No extract cost for vector "scalar"
10765 if (isa<FixedVectorType>(EU.Scalar->getType()))
10766 continue;
10767
10768 // If found user is an insertelement, do not calculate extract cost but try
10769 // to detect it as a final shuffled/identity match.
10770 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10771 VU && VU->getOperand(1) == EU.Scalar) {
10772 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10773 if (!UsedInserts.insert(VU).second)
10774 continue;
10775 std::optional<unsigned> InsertIdx = getElementIndex(VU);
10776 if (InsertIdx) {
10777 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10778 auto *It = find_if(
10779 FirstUsers,
10780 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10781 return areTwoInsertFromSameBuildVector(
10782 VU, cast<InsertElementInst>(Pair.first),
10783 [this](InsertElementInst *II) -> Value * {
10784 Value *Op0 = II->getOperand(0);
10785 if (getTreeEntry(II) && !getTreeEntry(Op0))
10786 return nullptr;
10787 return Op0;
10788 });
10789 });
10790 int VecId = -1;
10791 if (It == FirstUsers.end()) {
10792 (void)ShuffleMasks.emplace_back();
10793 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10794 if (Mask.empty())
10795 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10796 // Find the insertvector, vectorized in tree, if any.
10797 Value *Base = VU;
10798 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10799 if (IEBase != EU.User &&
10800 (!IEBase->hasOneUse() ||
10801 getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10802 break;
10803 // Build the mask for the vectorized insertelement instructions.
10804 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10805 VU = IEBase;
10806 do {
10807 IEBase = cast<InsertElementInst>(Base);
10808 int Idx = *getElementIndex(IEBase);
10809 assert(Mask[Idx] == PoisonMaskElem &&
10810 "InsertElementInstruction used already.");
10811 Mask[Idx] = Idx;
10812 Base = IEBase->getOperand(0);
10813 } while (E == getTreeEntry(Base));
10814 break;
10815 }
10816 Base = cast<InsertElementInst>(Base)->getOperand(0);
10817 }
10818 FirstUsers.emplace_back(VU, ScalarTE);
10819 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10820 VecId = FirstUsers.size() - 1;
10821 auto It = MinBWs.find(ScalarTE);
10822 if (It != MinBWs.end() &&
10823 VectorCasts
10824 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10825 .second) {
10826 unsigned BWSz = It->second.first;
10827 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10828 unsigned VecOpcode;
10829 if (DstBWSz < BWSz)
10830 VecOpcode = Instruction::Trunc;
10831 else
10832 VecOpcode =
10833 It->second.second ? Instruction::SExt : Instruction::ZExt;
10835 InstructionCost C = TTI->getCastInstrCost(
10836 VecOpcode, FTy,
10837 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
10838 FTy->getNumElements()),
10839 TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
10840 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10841 << " for extending externally used vector with "
10842 "non-equal minimum bitwidth.\n");
10843 Cost += C;
10844 }
10845 } else {
10846 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10847 It->first = VU;
10848 VecId = std::distance(FirstUsers.begin(), It);
10849 }
10850 int InIdx = *InsertIdx;
10851 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10852 if (Mask.empty())
10853 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10854 Mask[InIdx] = EU.Lane;
10855 DemandedElts[VecId].setBit(InIdx);
10856 continue;
10857 }
10858 }
10859 }
10860
10861 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10862 // If we plan to rewrite the tree in a smaller type, we will need to sign
10863 // extend the extracted value back to the original type. Here, we account
10864 // for the extract and the added cost of the sign extend if needed.
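 // For example, if the tree was narrowed to i16 but the external user
 // consumes the original i32 scalar, the extract is costed together with the
 // sext/zext back to i32 rather than as a plain extractelement (bit widths
 // here are illustrative).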
10865 InstructionCost ExtraCost = TTI::TCC_Free;
10866 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10867 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
10868 auto It = MinBWs.find(Entry);
10869 if (It != MinBWs.end()) {
10870 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10871 unsigned Extend =
10872 It->second.second ? Instruction::SExt : Instruction::ZExt;
10873 VecTy = getWidenedType(MinTy, BundleWidth);
10874 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10875 VecTy, EU.Lane);
10876 } else {
10877 ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10878 CostKind, EU.Lane);
10879 }
10880 // Leave the scalar instructions as is if they are cheaper than extracts.
10881 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
10882 Entry->getOpcode() == Instruction::Load) {
10883 if (!ValueToExtUses) {
10884 ValueToExtUses.emplace();
10885 for_each(enumerate(ExternalUses), [&](const auto &P) {
10886 // Ignore phis in loops.
10887 if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
10888 auto *I = cast<Instruction>(P.value().Scalar);
10889 const Loop *L = LI->getLoopFor(Phi->getParent());
10890 if (L && (Phi->getParent() == I->getParent() ||
10891 L == LI->getLoopFor(I->getParent())))
10892 return;
10893 }
10894
10895 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10896 });
10897 }
10898 // Can use original instruction, if no operands vectorized or they are
10899 // marked as externally used already.
10900 auto *Inst = cast<Instruction>(EU.Scalar);
10901 bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
10902 if (!getTreeEntry(V)) {
10903 // Some extractelements might be not vectorized, but
10904 // transformed into shuffle and removed from the function,
10905 // consider it here.
10906 if (auto *EE = dyn_cast<ExtractElementInst>(V))
10907 return !EE->hasOneUse() || !MustGather.contains(EE);
10908 return true;
10909 }
10910 return ValueToExtUses->contains(V);
10911 });
10912 if (CanBeUsedAsScalar) {
10913 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
10914 bool KeepScalar = ScalarCost <= ExtraCost;
10915 if (KeepScalar && ScalarCost != TTI::TCC_Free &&
10916 ExtraCost - ScalarCost <= TTI::TCC_Basic) {
10917 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
10918 return ValueToExtUses->contains(V);
10919 });
10920 auto It = ExtractsCount.find(Entry);
10921 if (It != ExtractsCount.end())
10922 ScalarUsesCount -= It->getSecond().size();
10923 // Keep original scalar if number of externally used instructions in
10924 // the same entry is not power of 2. It may help to do some extra
10925 // vectorization for now.
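 // For example (illustrative), if 3 of the 4 scalars in the entry are
 // used externally, 3 is not a power of 2, so the scalars are kept.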
10926 KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
10927 }
10928 if (KeepScalar) {
10929 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
10930 for_each(Inst->operands(), [&](Value *V) {
10931 auto It = ValueToExtUses->find(V);
10932 if (It != ValueToExtUses->end()) {
10933 // Replace all uses to avoid compiler crash.
10934 ExternalUses[It->second].User = nullptr;
10935 }
10936 });
10937 ExtraCost = ScalarCost;
10938 ExtractsCount[Entry].insert(Inst);
10939 }
10940 }
10941 }
10942
10943 ExtractCost += ExtraCost;
10944 }
10945 // Add reduced value cost, if resized.
10946 if (!VectorizedVals.empty()) {
10947 const TreeEntry &Root = *VectorizableTree.front();
10948 auto BWIt = MinBWs.find(&Root);
10949 if (BWIt != MinBWs.end()) {
10950 Type *DstTy = Root.Scalars.front()->getType();
10951 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10952 unsigned SrcSz =
10953 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10954 if (OriginalSz != SrcSz) {
10955 unsigned Opcode = Instruction::Trunc;
10956 if (OriginalSz > SrcSz)
10957 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10958 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10959 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10960 TTI::CastContextHint::None,
10961 TTI::TCK_RecipThroughput);
10962 }
10963 }
10964 }
10965
10966 InstructionCost SpillCost = getSpillCost();
10967 Cost += SpillCost + ExtractCost;
10968 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10969 bool) {
10970 InstructionCost C = 0;
10971 unsigned VF = Mask.size();
10972 unsigned VecVF = TE->getVectorFactor();
10973 if (VF != VecVF &&
10974 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10975 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10976 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10977 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10978 OrigMask.begin());
10979 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
10980 getWidenedType(TE->getMainOp()->getType(), VecVF),
10981 OrigMask);
10982 LLVM_DEBUG(
10983 dbgs() << "SLP: Adding cost " << C
10984 << " for final shuffle of insertelement external users.\n";
10985 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10986 Cost += C;
10987 return std::make_pair(TE, true);
10988 }
10989 return std::make_pair(TE, false);
10990 };
10991 // Calculate the cost of the reshuffled vectors, if any.
10992 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10993 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10994 auto Vector = ShuffleMasks[I].takeVector();
10995 unsigned VF = 0;
10996 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10997 ArrayRef<const TreeEntry *> TEs) {
10998 assert((TEs.size() == 1 || TEs.size() == 2) &&
10999 "Expected exactly 1 or 2 tree entries.");
11000 if (TEs.size() == 1) {
11001 if (VF == 0)
11002 VF = TEs.front()->getVectorFactor();
11003 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
11004 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
11005 !all_of(enumerate(Mask), [=](const auto &Data) {
11006 return Data.value() == PoisonMaskElem ||
11007 (Data.index() < VF &&
11008 static_cast<int>(Data.index()) == Data.value());
11009 })) {
11010 InstructionCost C =
11011 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
11012 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
11013 << " for final shuffle of insertelement "
11014 "external users.\n";
11015 TEs.front()->dump();
11016 dbgs() << "SLP: Current total cost = " << Cost << "\n");
11017 Cost += C;
11018 }
11019 } else {
11020 if (VF == 0) {
11021 if (TEs.front() &&
11022 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
11023 VF = TEs.front()->getVectorFactor();
11024 else
11025 VF = Mask.size();
11026 }
11027 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
11028 InstructionCost C =
11029 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
11030 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
11031 << " for final shuffle of vector node and external "
11032 "insertelement users.\n";
11033 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
11034 dbgs() << "SLP: Current total cost = " << Cost << "\n");
11035 Cost += C;
11036 }
11037 VF = Mask.size();
11038 return TEs.back();
11039 };
11040 (void)performExtractsShuffleAction<const TreeEntry>(
11041 MutableArrayRef(Vector.data(), Vector.size()), Base,
11042 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
11043 EstimateShufflesCost);
11044 InstructionCost InsertCost = TTI->getScalarizationOverhead(
11045 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
11046 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
11047 Cost -= InsertCost;
11048 }
11049
11050 // Add the cost for reduced value resize (if required).
11051 if (ReductionBitWidth != 0) {
11052 assert(UserIgnoreList && "Expected reduction tree.");
11053 const TreeEntry &E = *VectorizableTree.front();
11054 auto It = MinBWs.find(&E);
11055 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
11056 unsigned SrcSize = It->second.first;
11057 unsigned DstSize = ReductionBitWidth;
11058 unsigned Opcode = Instruction::Trunc;
11059 if (SrcSize < DstSize)
11060 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11061 auto *SrcVecTy =
11062 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
11063 auto *DstVecTy =
11064 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
11065 TTI::CastContextHint CCH = getCastContextHint(E);
11066 InstructionCost CastCost;
11067 switch (E.getOpcode()) {
11068 case Instruction::SExt:
11069 case Instruction::ZExt:
11070 case Instruction::Trunc: {
11071 const TreeEntry *OpTE = getOperandEntry(&E, 0);
11072 CCH = getCastContextHint(*OpTE);
11073 break;
11074 }
11075 default:
11076 break;
11077 }
11078 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
11079 TTI::TCK_RecipThroughput);
11080 Cost += CastCost;
11081 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
11082 << " for final resize for reduction from " << SrcVecTy
11083 << " to " << DstVecTy << "\n";
11084 dbgs() << "SLP: Current total cost = " << Cost << "\n");
11085 }
11086 }
11087
11088#ifndef NDEBUG
11089 SmallString<256> Str;
11090 {
11091 raw_svector_ostream OS(Str);
11092 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
11093 << "SLP: Extract Cost = " << ExtractCost << ".\n"
11094 << "SLP: Total Cost = " << Cost << ".\n";
11095 }
11096 LLVM_DEBUG(dbgs() << Str);
11097 if (ViewSLPTree)
11098 ViewGraph(this, "SLP" + F->getName(), false, Str);
11099#endif
11100
11101 return Cost;
11102}
11103
11104/// Tries to find extractelement instructions with constant indices from a
11105/// fixed vector type and gathers such instructions into a group that is very
11106/// likely to be recognized as a shuffle of 1 or 2 input vectors. If this
11107/// attempt is successful, the matched scalars are replaced by poison values in
11108/// \p VL for future analysis.
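/// A hypothetical illustration (names are made up):
/// \code
///   %e0 = extractelement <4 x i32> %v, i32 0
///   %e1 = extractelement <4 x i32> %v, i32 1
///   %e2 = extractelement <4 x i32> %v, i32 2
///   %e3 = extractelement <4 x i32> %v, i32 3
/// \endcode
/// A gather of {%e0, %e1, %e2, %e3} can be represented as a (here identity)
/// shuffle of the single source vector %v instead of four scalar extracts
/// followed by inserts.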
11109std::optional<TTI::ShuffleKind>
11110BoUpSLP::tryToGatherSingleRegisterExtractElements(
11111 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
11112 // Scan list of gathered scalars for extractelements that can be represented
11113 // as shuffles.
11114 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
11115 SmallVector<int> UndefVectorExtracts;
11116 for (int I = 0, E = VL.size(); I < E; ++I) {
11117 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
11118 if (!EI) {
11119 if (isa<UndefValue>(VL[I]))
11120 UndefVectorExtracts.push_back(I);
11121 continue;
11122 }
11123 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
11124 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
11125 continue;
11126 std::optional<unsigned> Idx = getExtractIndex(EI);
11127 // Undefined index.
11128 if (!Idx) {
11129 UndefVectorExtracts.push_back(I);
11130 continue;
11131 }
11132 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
11133 ExtractMask.reset(*Idx);
11134 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
11135 UndefVectorExtracts.push_back(I);
11136 continue;
11137 }
11138 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
11139 }
11140 // Sort the vector operands by the maximum number of uses in extractelements.
11141 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
11142 VectorOpToIdx.takeVector();
11143 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
11144 return P1.second.size() > P2.second.size();
11145 });
11146 // Find the best pair of the vectors or a single vector.
11147 const int UndefSz = UndefVectorExtracts.size();
11148 unsigned SingleMax = 0;
11149 unsigned PairMax = 0;
11150 if (!Vectors.empty()) {
11151 SingleMax = Vectors.front().second.size() + UndefSz;
11152 if (Vectors.size() > 1) {
11153 auto *ItNext = std::next(Vectors.begin());
11154 PairMax = SingleMax + ItNext->second.size();
11155 }
11156 }
11157 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
11158 return std::nullopt;
11159 // Check if better to perform a shuffle of 2 vectors or just of a single
11160 // vector.
11161 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
11162 SmallVector<Value *> GatheredExtracts(
11163 VL.size(), PoisonValue::get(VL.front()->getType()));
11164 if (SingleMax >= PairMax && SingleMax) {
11165 for (int Idx : Vectors.front().second)
11166 std::swap(GatheredExtracts[Idx], VL[Idx]);
11167 } else if (!Vectors.empty()) {
11168 for (unsigned Idx : {0, 1})
11169 for (int Idx : Vectors[Idx].second)
11170 std::swap(GatheredExtracts[Idx], VL[Idx]);
11171 }
11172 // Add extracts from undefs too.
11173 for (int Idx : UndefVectorExtracts)
11174 std::swap(GatheredExtracts[Idx], VL[Idx]);
11175 // Check that gather of extractelements can be represented as just a
11176 // shuffle of a single/two vectors the scalars are extracted from.
11177 std::optional<TTI::ShuffleKind> Res =
11178 isFixedVectorShuffle(GatheredExtracts, Mask);
11179 if (!Res) {
11180 // TODO: try to check other subsets if possible.
11181 // Restore the original VL if attempt was not successful.
11182 copy(SavedVL, VL.begin());
11183 return std::nullopt;
11184 }
11185 // Restore unused scalars from mask, if some of the extractelements were not
11186 // selected for shuffle.
11187 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
11188 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
11189 isa<UndefValue>(GatheredExtracts[I])) {
11190 std::swap(VL[I], GatheredExtracts[I]);
11191 continue;
11192 }
11193 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
11194 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
11195 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
11196 is_contained(UndefVectorExtracts, I))
11197 continue;
11198 }
11199 return Res;
11200}
11201
11202/// Tries to find extractelement instructions with constant indices from a
11203/// fixed vector type and gathers such instructions into a group that is very
11204/// likely to be recognized as a shuffle of 1 or 2 input vectors. If this
11205/// attempt is successful, the matched scalars are replaced by poison values in
11206/// \p VL for future analysis.
11207 SmallVector<std::optional<TTI::ShuffleKind>>
11208 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
11209 SmallVectorImpl<int> &Mask,
11210 unsigned NumParts) const {
11211 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
11212 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
11213 Mask.assign(VL.size(), PoisonMaskElem);
11214 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11215 for (unsigned Part : seq<unsigned>(NumParts)) {
11216 // Scan list of gathered scalars for extractelements that can be represented
11217 // as shuffles.
11218 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
11219 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11220 SmallVector<int> SubMask;
11221 std::optional<TTI::ShuffleKind> Res =
11222 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
11223 ShufflesRes[Part] = Res;
11224 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
11225 }
11226 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
11227 return Res.has_value();
11228 }))
11229 ShufflesRes.clear();
11230 return ShufflesRes;
11231}
11232
11233std::optional<TargetTransformInfo::ShuffleKind>
11234BoUpSLP::isGatherShuffledSingleRegisterEntry(
11235 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
11236 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
11237 Entries.clear();
11238 // TODO: currently checking only for Scalars in the tree entry, need to count
11239 // reused elements too for better cost estimation.
11240 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
11241 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
11242 const BasicBlock *TEInsertBlock = nullptr;
11243 // Main node of PHI entries keeps the correct order of operands/incoming
11244 // blocks.
11245 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
11246 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
11247 TEInsertPt = TEInsertBlock->getTerminator();
11248 } else {
11249 TEInsertBlock = TEInsertPt->getParent();
11250 }
11251 if (!DT->isReachableFromEntry(TEInsertBlock))
11252 return std::nullopt;
11253 auto *NodeUI = DT->getNode(TEInsertBlock);
11254 assert(NodeUI && "Should only process reachable instructions");
11255 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
11256 auto CheckOrdering = [&](const Instruction *InsertPt) {
11257 // Argument InsertPt is an instruction where vector code for some other
11258 // tree entry (one that shares one or more scalars with TE) is going to be
11259 // generated. This lambda returns true if insertion point of vector code
11260 // for the TE dominates that point (otherwise dependency is the other way
11261 // around). The other node is not limited to be of a gather kind. Gather
11262 // nodes are not scheduled and their vector code is inserted before their
11263 // first user. If user is PHI, that is supposed to be at the end of a
11264 // predecessor block. Otherwise it is the last instruction among scalars of
11265 // the user node. So, instead of checking dependency between instructions
11266 // themselves, we check dependency between their insertion points for vector
11267 // code (since each scalar instruction ends up as a lane of a vector
11268 // instruction).
11269 const BasicBlock *InsertBlock = InsertPt->getParent();
11270 auto *NodeEUI = DT->getNode(InsertBlock);
11271 if (!NodeEUI)
11272 return false;
11273 assert((NodeUI == NodeEUI) ==
11274 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
11275 "Different nodes should have different DFS numbers");
11276 // Check the order of the gather nodes users.
11277 if (TEInsertPt->getParent() != InsertBlock &&
11278 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
11279 return false;
11280 if (TEInsertPt->getParent() == InsertBlock &&
11281 TEInsertPt->comesBefore(InsertPt))
11282 return false;
11283 return true;
11284 };
11285 // Find all tree entries used by the gathered values. If no common entries
11286 // found - not a shuffle.
11287 // Here we build a set of tree nodes for each gathered value and try to
11288 // find the intersection between these sets. If we have at least one common
11289 // tree node for each gathered value - we have just a permutation of a
11290 // single vector. If we have 2 different sets, we are in a situation where
11291 // we have a permutation of 2 input vectors.
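 // For example (illustrative), if VL is {a, b, c, d} and {a, b} come only
 // from tree entry E1 while {c, d} come only from entry E2, the gather is
 // modeled as a permutation of the two vectors produced for E1 and E2.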
11292 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
11293 DenseMap<Value *, int> UsedValuesEntry;
11294 for (Value *V : VL) {
11295 if (isConstant(V))
11296 continue;
11297 // Build a list of tree entries where V is used.
11298 SmallPtrSet<const TreeEntry *, 4> VToTEs;
11299 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
11300 if (TEPtr == TE)
11301 continue;
11302 assert(any_of(TEPtr->Scalars,
11303 [&](Value *V) { return GatheredScalars.contains(V); }) &&
11304 "Must contain at least single gathered value.");
11305 assert(TEPtr->UserTreeIndices.size() == 1 &&
11306 "Expected only single user of a gather node.");
11307 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11308
11309 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
11310 const Instruction *InsertPt =
11311 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
11312 : &getLastInstructionInBundle(UseEI.UserTE);
11313 if (TEInsertPt == InsertPt) {
11314 // If 2 gathers are operands of the same entry (regardless of whether
11315 // user is PHI or else), compare operands indices, use the earlier one
11316 // as the base.
11317 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11318 continue;
11319 // If the user instruction is used for some reason in different
11320 // vectorized nodes - make it depend on index.
11321 if (TEUseEI.UserTE != UseEI.UserTE &&
11322 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11323 continue;
11324 }
11325
11326 // Check if the user node of the TE comes after user node of TEPtr,
11327 // otherwise TEPtr depends on TE.
11328 if ((TEInsertBlock != InsertPt->getParent() ||
11329 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11330 !CheckOrdering(InsertPt))
11331 continue;
11332 VToTEs.insert(TEPtr);
11333 }
11334 if (const TreeEntry *VTE = getTreeEntry(V)) {
11335 if (ForOrder) {
11336 if (VTE->State != TreeEntry::Vectorize) {
11337 auto It = MultiNodeScalars.find(V);
11338 if (It == MultiNodeScalars.end())
11339 continue;
11340 VTE = *It->getSecond().begin();
11341 // Iterate through all vectorized nodes.
11342 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
11343 return MTE->State == TreeEntry::Vectorize;
11344 });
11345 if (MIt == It->getSecond().end())
11346 continue;
11347 VTE = *MIt;
11348 }
11349 }
11350 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
11351 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11352 continue;
11353 VToTEs.insert(VTE);
11354 }
11355 if (VToTEs.empty())
11356 continue;
11357 if (UsedTEs.empty()) {
11358 // The first iteration, just insert the list of nodes to vector.
11359 UsedTEs.push_back(VToTEs);
11360 UsedValuesEntry.try_emplace(V, 0);
11361 } else {
11362 // Need to check if there are any previously used tree nodes which use V.
11363 // If there are no such nodes, consider that we have another one input
11364 // vector.
11365 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11366 unsigned Idx = 0;
11367 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11368 // Do we have a non-empty intersection of previously listed tree entries
11369 // and tree entries using current V?
11370 set_intersect(VToTEs, Set);
11371 if (!VToTEs.empty()) {
11372 // Yes, write the new subset and continue analysis for the next
11373 // scalar.
11374 Set.swap(VToTEs);
11375 break;
11376 }
11377 VToTEs = SavedVToTEs;
11378 ++Idx;
11379 }
11380 // No non-empty intersection found - need to add a second set of possible
11381 // source vectors.
11382 if (Idx == UsedTEs.size()) {
11383 // If the number of input vectors is greater than 2 - not a permutation,
11384 // fallback to the regular gather.
11385 // TODO: support multiple reshuffled nodes.
11386 if (UsedTEs.size() == 2)
11387 continue;
11388 UsedTEs.push_back(SavedVToTEs);
11389 Idx = UsedTEs.size() - 1;
11390 }
11391 UsedValuesEntry.try_emplace(V, Idx);
11392 }
11393 }
11394
11395 if (UsedTEs.empty()) {
11396 Entries.clear();
11397 return std::nullopt;
11398 }
11399
11400 unsigned VF = 0;
11401 if (UsedTEs.size() == 1) {
11402 // Keep the order to avoid non-determinism.
11403 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11404 UsedTEs.front().end());
11405 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11406 return TE1->Idx < TE2->Idx;
11407 });
11408 // Try to find the perfect match in another gather node at first.
11409 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
11410 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
11411 });
11412 if (It != FirstEntries.end() &&
11413 ((*It)->getVectorFactor() == VL.size() ||
11414 ((*It)->getVectorFactor() == TE->Scalars.size() &&
11415 TE->ReuseShuffleIndices.size() == VL.size() &&
11416 (*It)->isSame(TE->Scalars)))) {
11417 Entries.push_back(*It);
11418 if ((*It)->getVectorFactor() == VL.size()) {
11419 std::iota(std::next(Mask.begin(), Part * VL.size()),
11420 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
11421 } else {
11422 SmallVector<int> CommonMask = TE->getCommonMask();
11423 copy(CommonMask, Mask.begin());
11424 }
11425 // Clear undef scalars.
11426 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11427 if (isa<PoisonValue>(VL[I]))
11428 Mask[Part * VL.size() + I] = PoisonMaskElem;
11429 return TargetTransformInfo::SK_PermuteSingleSrc;
11430 }
11431 // No perfect match, just shuffle, so choose the first tree node from the
11432 // tree.
11433 Entries.push_back(FirstEntries.front());
11434 } else {
11435 // Try to find nodes with the same vector factor.
11436 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11437 // Keep the order of tree nodes to avoid non-determinism.
11438 DenseMap<unsigned, const TreeEntry *> VFToTE;
11439 for (const TreeEntry *TE : UsedTEs.front()) {
11440 unsigned VF = TE->getVectorFactor();
11441 auto It = VFToTE.find(VF);
11442 if (It != VFToTE.end()) {
11443 if (It->second->Idx > TE->Idx)
11444 It->getSecond() = TE;
11445 continue;
11446 }
11447 VFToTE.try_emplace(VF, TE);
11448 }
11449 // Same, keep the order to avoid non-determinism.
11450 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11451 UsedTEs.back().end());
11452 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11453 return TE1->Idx < TE2->Idx;
11454 });
11455 for (const TreeEntry *TE : SecondEntries) {
11456 auto It = VFToTE.find(TE->getVectorFactor());
11457 if (It != VFToTE.end()) {
11458 VF = It->first;
11459 Entries.push_back(It->second);
11460 Entries.push_back(TE);
11461 break;
11462 }
11463 }
11464 // No 2 source vectors with the same vector factor - just choose 2 with max
11465 // index.
11466 if (Entries.empty()) {
11467 Entries.push_back(*llvm::max_element(
11468 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
11469 return TE1->Idx < TE2->Idx;
11470 }));
11471 Entries.push_back(SecondEntries.front());
11472 VF = std::max(Entries.front()->getVectorFactor(),
11473 Entries.back()->getVectorFactor());
11474 }
11475 }
11476
11477 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
11478 // Checks if the 2 PHIs are compatible in terms of high possibility to be
11479 // vectorized.
11480 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11481 auto *PHI = cast<PHINode>(V);
11482 auto *PHI1 = cast<PHINode>(V1);
11483 // Check that all incoming values are compatible/from same parent (if they
11484 // are instructions).
11485 // The incoming values are compatible if they all are constants, or
11486 // instruction with the same/alternate opcodes from the same basic block.
11487 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11488 Value *In = PHI->getIncomingValue(I);
11489 Value *In1 = PHI1->getIncomingValue(I);
11490 if (isConstant(In) && isConstant(In1))
11491 continue;
11492 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11493 return false;
11494 if (cast<Instruction>(In)->getParent() !=
11495 cast<Instruction>(In1)->getParent())
11496 return false;
11497 }
11498 return true;
11499 };
11500 // Check if the value can be ignored during analysis for shuffled gathers.
11501 // We assume it is better to ignore instructions that do not form splats,
11502 // are not vectorized / are not extractelements (these will be handled by
11503 // the extractelement processing) or that may form a vector node in future.
11504 auto MightBeIgnored = [=](Value *V) {
11505 auto *I = dyn_cast<Instruction>(V);
11506 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11507 !isVectorLikeInstWithConstOps(I) &&
11508 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11509 };
11510 // Check that the neighbor instruction may form a full vector node with the
11511 // current instruction V. It is possible, if they have same/alternate opcode
11512 // and same parent basic block.
11513 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11514 Value *V1 = VL[Idx];
11515 bool UsedInSameVTE = false;
11516 auto It = UsedValuesEntry.find(V1);
11517 if (It != UsedValuesEntry.end())
11518 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11519 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11520 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11521 cast<Instruction>(V)->getParent() ==
11522 cast<Instruction>(V1)->getParent() &&
11523 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11524 };
11525 // Build a shuffle mask for better cost estimation and vector emission.
11526 SmallBitVector UsedIdxs(Entries.size());
11527 SmallVector<std::pair<unsigned, int>> EntryLanes;
11528 for (int I = 0, E = VL.size(); I < E; ++I) {
11529 Value *V = VL[I];
11530 auto It = UsedValuesEntry.find(V);
11531 if (It == UsedValuesEntry.end())
11532 continue;
11533 // Do not try to shuffle scalars, if they are constants, or instructions
11534 // that can be vectorized as a result of the following vector build
11535 // vectorization.
11536 if (isConstant(V) || (MightBeIgnored(V) &&
11537 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11538 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11539 continue;
11540 unsigned Idx = It->second;
11541 EntryLanes.emplace_back(Idx, I);
11542 UsedIdxs.set(Idx);
11543 }
11544 // Iterate through all shuffled scalars and select entries, which can be used
11545 // for final shuffle.
11546 SmallVector<const TreeEntry *> TempEntries;
11547 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11548 if (!UsedIdxs.test(I))
11549 continue;
11550 // Fix the entry number for the given scalar. If it is the first entry, set
11551 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
11552 // These indices are used when calculating final shuffle mask as the vector
11553 // offset.
11554 for (std::pair<unsigned, int> &Pair : EntryLanes)
11555 if (Pair.first == I)
11556 Pair.first = TempEntries.size();
11557 TempEntries.push_back(Entries[I]);
11558 }
11559 Entries.swap(TempEntries);
11560 if (EntryLanes.size() == Entries.size() &&
11561 !VL.equals(ArrayRef(TE->Scalars)
11562 .slice(Part * VL.size(),
11563 std::min<int>(VL.size(), TE->Scalars.size())))) {
11564 // We may have here 1 or 2 entries only. If the number of scalars is equal
11565 // to the number of entries, no need to do the analysis, it is not very
11566 // profitable. Since VL is not the same as TE->Scalars, it means we already
11567 // have some shuffles before. Cut off not profitable case.
11568 Entries.clear();
11569 return std::nullopt;
11570 }
11571 // Build the final mask, check for the identity shuffle, if possible.
11572 bool IsIdentity = Entries.size() == 1;
11573 // Pair.first is the offset to the vector, while Pair.second is the index of
11574 // scalar in the list.
11575 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11576 unsigned Idx = Part * VL.size() + Pair.second;
11577 Mask[Idx] =
11578 Pair.first * VF +
11579 (ForOrder ? std::distance(
11580 Entries[Pair.first]->Scalars.begin(),
11581 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11582 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11583 IsIdentity &= Mask[Idx] == Pair.second;
11584 }
11585 switch (Entries.size()) {
11586 case 1:
11587 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11588 return TargetTransformInfo::SK_PermuteSingleSrc;
11589 break;
11590 case 2:
11591 if (EntryLanes.size() > 2 || VL.size() <= 2)
11592 return TargetTransformInfo::SK_PermuteTwoSrc;
11593 break;
11594 default:
11595 break;
11596 }
11597 Entries.clear();
11598 // Clear the corresponding mask elements.
11599 std::fill(std::next(Mask.begin(), Part * VL.size()),
11600 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11601 return std::nullopt;
11602}
11603
11604 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11605 BoUpSLP::isGatherShuffledEntry(
11606 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11607 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11608 bool ForOrder) {
11609 assert(NumParts > 0 && NumParts < VL.size() &&
11610 "Expected positive number of registers.");
11611 Entries.clear();
11612 // No need to check for the topmost gather node.
11613 if (TE == VectorizableTree.front().get())
11614 return {};
11615 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11616 if (TE->isNonPowOf2Vec())
11617 return {};
11618 Mask.assign(VL.size(), PoisonMaskElem);
11619 assert(TE->UserTreeIndices.size() == 1 &&
11620 "Expected only single user of the gather node.");
11621 assert(VL.size() % NumParts == 0 &&
11622 "Number of scalars must be divisible by NumParts.");
11623 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11624 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11625 for (unsigned Part : seq<unsigned>(NumParts)) {
11626 ArrayRef<Value *> SubVL =
11627 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11628 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11629 std::optional<TTI::ShuffleKind> SubRes =
11630 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11631 ForOrder);
11632 if (!SubRes)
11633 SubEntries.clear();
11634 Res.push_back(SubRes);
11635 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11636 SubEntries.front()->getVectorFactor() == VL.size() &&
11637 (SubEntries.front()->isSame(TE->Scalars) ||
11638 SubEntries.front()->isSame(VL))) {
11639 SmallVector<const TreeEntry *> LocalSubEntries;
11640 LocalSubEntries.swap(SubEntries);
11641 Entries.clear();
11642 Res.clear();
11643 std::iota(Mask.begin(), Mask.end(), 0);
11644 // Clear undef scalars.
11645 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11646 if (isa<PoisonValue>(VL[I]))
11647 Mask[I] = PoisonMaskElem;
11648 Entries.emplace_back(1, LocalSubEntries.front());
11649 Res.push_back(TTI::SK_PermuteSingleSrc);
11650 return Res;
11651 }
11652 }
11653 if (all_of(Res,
11654 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11655 Entries.clear();
11656 return {};
11657 }
11658 return Res;
11659}
11660
11661InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11662 Type *ScalarTy) const {
11663 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11664 bool DuplicateNonConst = false;
11665 // Find the cost of inserting/extracting values from the vector.
11666 // Check if the same elements are inserted several times and count them as
11667 // shuffle candidates.
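 // For example (illustrative values), gathering {x, y, x, z} only needs
 // insert costs for the unique scalars x, y and z; the repeated x becomes a
 // duplicate lane in ShuffleMask and is covered by the final single-source
 // shuffle cost below.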
11668 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
11669 APInt ShuffledElements = APInt::getZero(VecTy->getNumElements());
11670 DenseMap<Value *, unsigned> UniqueElements;
11671 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11672 InstructionCost Cost;
11673 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11674 if (V->getType() != ScalarTy) {
11675 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11676 TTI::CastContextHint::None, CostKind);
11677 V = nullptr;
11678 }
11679 if (!ForPoisonSrc)
11680 Cost +=
11681 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11682 I, Constant::getNullValue(VecTy), V);
11683 };
11684 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11685 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11686 Value *V = VL[I];
11687 // No need to shuffle duplicates for constants.
11688 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11689 ShuffledElements.setBits(I * ScalarTyNumElements,
11690 I * ScalarTyNumElements + ScalarTyNumElements);
11691 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11692 continue;
11693 }
11694
11695 auto Res = UniqueElements.try_emplace(V, I);
11696 if (Res.second) {
11697 EstimateInsertCost(I, V);
11698 ShuffleMask[I] = I;
11699 continue;
11700 }
11701
11702 DuplicateNonConst = true;
11703 ShuffledElements.setBits(I * ScalarTyNumElements,
11704 I * ScalarTyNumElements + ScalarTyNumElements);
11705 ShuffleMask[I] = Res.first->second;
11706 }
11707 if (ForPoisonSrc)
11708 Cost =
11709 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11710 /*Extract*/ false, CostKind);
11711 if (DuplicateNonConst)
11712 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
11713 VecTy, ShuffleMask);
11714 return Cost;
11715}
11716
11717// Perform operand reordering on the instructions in VL and return the reordered
11718// operands in Left and Right.
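// For example (illustrative names), given the two lanes %a0 + %b0 and
// %b1 + %a1, the operands of the second lane may be swapped so that
// Left = {%a0, %a1} and Right = {%b0, %b1}, which gives both operand vectors
// a better chance of being vectorizable.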
11719void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11720 SmallVectorImpl<Value *> &Left,
11721 SmallVectorImpl<Value *> &Right,
11722 const BoUpSLP &R) {
11723 if (VL.empty())
11724 return;
11725 VLOperands Ops(VL, R);
11726 // Reorder the operands in place.
11727 Ops.reorder();
11728 Left = Ops.getVL(0);
11729 Right = Ops.getVL(1);
11730}
11731
11732Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11733 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11734 if (Res.second)
11735 return *Res.second;
11736 // Get the basic block this bundle is in. All instructions in the bundle
11737 // should be in this block (except for extractelement-like instructions with
11738 // constant indices).
11739 auto *Front = E->getMainOp();
11740 auto *BB = Front->getParent();
11741 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11742 if (E->getOpcode() == Instruction::GetElementPtr &&
11743 !isa<GetElementPtrInst>(V))
11744 return true;
11745 auto *I = cast<Instruction>(V);
11746 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11747 isVectorLikeInstWithConstOps(I);
11748 }));
11749
11750 auto FindLastInst = [&]() {
11751 Instruction *LastInst = Front;
11752 for (Value *V : E->Scalars) {
11753 auto *I = dyn_cast<Instruction>(V);
11754 if (!I)
11755 continue;
11756 if (LastInst->getParent() == I->getParent()) {
11757 if (LastInst->comesBefore(I))
11758 LastInst = I;
11759 continue;
11760 }
11761 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11762 !isa<GetElementPtrInst>(I)) ||
11763 (isVectorLikeInstWithConstOps(LastInst) &&
11764 isVectorLikeInstWithConstOps(I))) &&
11765 "Expected vector-like or non-GEP in GEP node insts only.");
11766 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11767 LastInst = I;
11768 continue;
11769 }
11770 if (!DT->isReachableFromEntry(I->getParent()))
11771 continue;
11772 auto *NodeA = DT->getNode(LastInst->getParent());
11773 auto *NodeB = DT->getNode(I->getParent());
11774 assert(NodeA && "Should only process reachable instructions");
11775 assert(NodeB && "Should only process reachable instructions");
11776 assert((NodeA == NodeB) ==
11777 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11778 "Different nodes should have different DFS numbers");
11779 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11780 LastInst = I;
11781 }
11782 BB = LastInst->getParent();
11783 return LastInst;
11784 };
11785
11786 auto FindFirstInst = [&]() {
11787 Instruction *FirstInst = Front;
11788 for (Value *V : E->Scalars) {
11789 auto *I = dyn_cast<Instruction>(V);
11790 if (!I)
11791 continue;
11792 if (FirstInst->getParent() == I->getParent()) {
11793 if (I->comesBefore(FirstInst))
11794 FirstInst = I;
11795 continue;
11796 }
11797 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11798 !isa<GetElementPtrInst>(I)) ||
11799 (isVectorLikeInstWithConstOps(FirstInst) &&
11800 isVectorLikeInstWithConstOps(I))) &&
11801 "Expected vector-like or non-GEP in GEP node insts only.");
11802 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11803 FirstInst = I;
11804 continue;
11805 }
11806 if (!DT->isReachableFromEntry(I->getParent()))
11807 continue;
11808 auto *NodeA = DT->getNode(FirstInst->getParent());
11809 auto *NodeB = DT->getNode(I->getParent());
11810 assert(NodeA && "Should only process reachable instructions");
11811 assert(NodeB && "Should only process reachable instructions");
11812 assert((NodeA == NodeB) ==
11813 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11814 "Different nodes should have different DFS numbers");
11815 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11816 FirstInst = I;
11817 }
11818 return FirstInst;
11819 };
11820
11821 // Set the insert point to the beginning of the basic block if the entry
11822 // should not be scheduled.
11823 if (doesNotNeedToSchedule(E->Scalars) ||
11824 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11825 if ((E->getOpcode() == Instruction::GetElementPtr &&
11826 any_of(E->Scalars,
11827 [](Value *V) {
11828 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11829 })) ||
11830 all_of(E->Scalars,
11831 [](Value *V) {
11832 return !isVectorLikeInstWithConstOps(V) &&
11833 isUsedOutsideBlock(V);
11834 }) ||
11835 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
11836 return isa<ExtractElementInst, UndefValue>(V) ||
11837 areAllOperandsNonInsts(V);
11838 })))
11839 Res.second = FindLastInst();
11840 else
11841 Res.second = FindFirstInst();
11842 return *Res.second;
11843 }
11844
11845 // Find the last instruction. The common case should be that BB has been
11846 // scheduled, and the last instruction is VL.back(). So we start with
11847 // VL.back() and iterate over schedule data until we reach the end of the
11848 // bundle. The end of the bundle is marked by null ScheduleData.
11849 if (BlocksSchedules.count(BB)) {
11850 Value *V = E->isOneOf(E->Scalars.back());
11851 if (doesNotNeedToBeScheduled(V))
11852 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11853 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11854 if (Bundle && Bundle->isPartOfBundle())
11855 for (; Bundle; Bundle = Bundle->NextInBundle)
11856 Res.second = Bundle->Inst;
11857 }
11858
11859 // LastInst can still be null at this point if there's either not an entry
11860 // for BB in BlocksSchedules or there's no ScheduleData available for
11861 // VL.back(). This can be the case if buildTree_rec aborts for various
11862 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11863 // size is reached, etc.). ScheduleData is initialized in the scheduling
11864 // "dry-run".
11865 //
11866 // If this happens, we can still find the last instruction by brute force. We
11867 // iterate forwards from Front (inclusive) until we either see all
11868 // instructions in the bundle or reach the end of the block. If Front is the
11869 // last instruction in program order, LastInst will be set to Front, and we
11870 // will visit all the remaining instructions in the block.
11871 //
11872 // One of the reasons we exit early from buildTree_rec is to place an upper
11873 // bound on compile-time. Thus, taking an additional compile-time hit here is
11874 // not ideal. However, this should be exceedingly rare since it requires that
11875 // we both exit early from buildTree_rec and that the bundle be out-of-order
11876 // (causing us to iterate all the way to the end of the block).
11877 if (!Res.second)
11878 Res.second = FindLastInst();
11879 assert(Res.second && "Failed to find last instruction in bundle");
11880 return *Res.second;
11881}
11882
11883void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11884 auto *Front = E->getMainOp();
11885 Instruction *LastInst = &getLastInstructionInBundle(E);
11886 assert(LastInst && "Failed to find last instruction in bundle");
11887 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11888 // If the instruction is PHI, set the insert point after all the PHIs.
11889 bool IsPHI = isa<PHINode>(LastInst);
11890 if (IsPHI)
11891 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11892 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
11893 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11894 } else {
11895 // Set the insertion point after the last instruction in the bundle. Set the
11896 // debug location to Front.
11897 Builder.SetInsertPoint(
11898 LastInst->getParent(),
11899 std::next(LastInst->getIterator()));
11900 }
11901 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11902}
11903
11904Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11905 // List of instructions/lanes from current block and/or the blocks which are
11906 // part of the current loop. These instructions will be inserted at the end to
11907 // make it possible to optimize loops and hoist invariant instructions out of
11908 // the loop's body with better chances for success.
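 // For example (hypothetical scenario), if the buildvector is emitted inside
 // a loop and one scalar is defined in that loop while the others are
 // loop-invariant, the loop-defined scalar is inserted last so the invariant
 // prefix of the insertelement chain can still be hoisted out of the loop.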
11909 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11910 SmallSet<int, 4> PostponedIndices;
11911 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11912 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11913 SmallPtrSet<BasicBlock *, 4> Visited;
11914 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11915 InsertBB = InsertBB->getSinglePredecessor();
11916 return InsertBB && InsertBB == InstBB;
11917 };
11918 for (int I = 0, E = VL.size(); I < E; ++I) {
11919 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11920 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11921 getTreeEntry(Inst) ||
11922 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11923 PostponedIndices.insert(I).second)
11924 PostponedInsts.emplace_back(Inst, I);
11925 }
11926
11927 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11928 Type *Ty) {
11929 Value *Scalar = V;
11930 if (Scalar->getType() != Ty) {
11931 assert(Scalar->getType()->isIntOrIntVectorTy() &&
11932 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
11933 Value *V = Scalar;
11934 if (auto *CI = dyn_cast<CastInst>(Scalar);
11935 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11936 Value *Op = CI->getOperand(0);
11937 if (auto *IOp = dyn_cast<Instruction>(Op);
11938 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11939 V = Op;
11940 }
11941 Scalar = Builder.CreateIntCast(
11942 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11943 }
11944
11945 Instruction *InsElt;
11946 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
11947 assert(SLPReVec && "FixedVectorType is not expected.");
11948 Vec = InsElt = Builder.CreateInsertVector(
11949 Vec->getType(), Vec, Scalar,
11950 Builder.getInt64(Pos * VecTy->getNumElements()));
11951 auto *II = dyn_cast<IntrinsicInst>(InsElt);
11952 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
11953 return Vec;
11954 } else {
11955 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11956 InsElt = dyn_cast<InsertElementInst>(Vec);
11957 if (!InsElt)
11958 return Vec;
11959 }
11960 GatherShuffleExtractSeq.insert(InsElt);
11961 CSEBlocks.insert(InsElt->getParent());
11962 // Add to our 'need-to-extract' list.
11963 if (isa<Instruction>(V)) {
11964 if (TreeEntry *Entry = getTreeEntry(V)) {
11965 // Find which lane we need to extract.
11966 User *UserOp = nullptr;
11967 if (Scalar != V) {
11968 if (auto *SI = dyn_cast<Instruction>(Scalar))
11969 UserOp = SI;
11970 } else {
11971 UserOp = InsElt;
11972 }
11973 if (UserOp) {
11974 unsigned FoundLane = Entry->findLaneForValue(V);
11975 ExternalUses.emplace_back(V, UserOp, FoundLane);
11976 }
11977 }
11978 }
11979 return Vec;
11980 };
11981 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11982 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11983 SmallVector<int> NonConsts;
11984 // Insert constant values first.
11985 for (int I = 0, E = VL.size(); I < E; ++I) {
11986 if (PostponedIndices.contains(I))
11987 continue;
11988 if (!isConstant(VL[I])) {
11989 NonConsts.push_back(I);
11990 continue;
11991 }
11992 if (Root) {
11993 if (!isa<UndefValue>(VL[I])) {
11994 NonConsts.push_back(I);
11995 continue;
11996 }
11997 if (isa<PoisonValue>(VL[I]))
11998 continue;
11999 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
12000 if (SV->getMaskValue(I) == PoisonMaskElem)
12001 continue;
12002 }
12003 }
12004 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
12005 }
12006 // Insert non-constant values.
12007 for (int I : NonConsts)
12008 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
12009 // Append instructions, which are/may be part of the loop, at the end to make
12010 // it possible to hoist non-loop-based instructions.
12011 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
12012 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
12013
12014 return Vec;
12015}
12016
12017 /// Merges shuffle masks and emits the final shuffle instruction, if required.
12018 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
12019 /// emission: the actual shuffle instruction is generated only if it is really
12020 /// required. Otherwise, the shuffle instruction emission is delayed till the
12021 /// end of the process, to reduce the number of emitted instructions and further
12022 /// analysis/transformations.
12023 /// The class will also look through the previously emitted shuffle instructions
12024 /// and properly mark indices in the mask as undef.
12025/// For example, given the code
12026/// \code
12027/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12028/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12029/// \endcode
12030 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12031/// look through %s1 and %s2 and emit
12032/// \code
12033/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
12034/// \endcode
12035/// instead.
12036 /// If the 2 operands are of different sizes, the smaller one will be resized and
12037 /// the mask recalculated properly.
12038/// For example, given the code
12039/// \code
12040/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12041/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12042/// \endcode
12043 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12044/// look through %s1 and %s2 and emit
12045/// \code
12046/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
12047/// \endcode
12048/// instead.
12049class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
12050 bool IsFinalized = false;
12051 /// Combined mask for all applied operands and masks. It is built during
12052 /// analysis and actual emission of shuffle vector instructions.
12053 SmallVector<int> CommonMask;
12054 /// List of operands for the shuffle vector instruction. It holds at most 2
12055 /// operands; if a 3rd one is going to be added, the first 2 are combined into
12056 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
12057 /// resulting shuffle and the second operand is set to be the newly added
12058 /// operand. The \p CommonMask is transformed in the proper way after that.
12059 SmallVector<Value *, 2> InVectors;
12060 IRBuilderBase &Builder;
12061 BoUpSLP &R;
12062
12063 class ShuffleIRBuilder {
12064 IRBuilderBase &Builder;
12065 /// Holds all of the instructions that we gathered.
12066 SetVector<Instruction *> &GatherShuffleExtractSeq;
12067 /// A list of blocks that we are going to CSE.
12068 DenseSet<BasicBlock *> &CSEBlocks;
12069 /// Data layout.
12070 const DataLayout &DL;
12071
12072 public:
12073 ShuffleIRBuilder(IRBuilderBase &Builder,
12074 SetVector<Instruction *> &GatherShuffleExtractSeq,
12075 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
12076 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
12077 CSEBlocks(CSEBlocks), DL(DL) {}
12078 ~ShuffleIRBuilder() = default;
12079 /// Creates shufflevector for the 2 operands with the given mask.
12080 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
12081 if (V1->getType() != V2->getType()) {
12082 assert(V1->getType()->isIntOrIntVectorTy() &&
12083 V2->getType()->isIntOrIntVectorTy() &&
12084 "Expected integer vector types only.");
12085 if (V1->getType() != V2->getType()) {
12086 if (cast<VectorType>(V2->getType())
12087 ->getElementType()
12088 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
12089 ->getElementType()
12090 ->getIntegerBitWidth())
12091 V2 = Builder.CreateIntCast(
12092 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
12093 else
12094 V1 = Builder.CreateIntCast(
12095 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
12096 }
12097 }
12098 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
12099 if (auto *I = dyn_cast<Instruction>(Vec)) {
12100 GatherShuffleExtractSeq.insert(I);
12101 CSEBlocks.insert(I->getParent());
12102 }
12103 return Vec;
12104 }
12105 /// Creates a permutation of the single vector operand with the given mask, if
12106 /// it is not an identity mask.
12107 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
12108 if (Mask.empty())
12109 return V1;
12110 unsigned VF = Mask.size();
12111 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
12112 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
12113 return V1;
12114 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
12115 if (auto *I = dyn_cast<Instruction>(Vec)) {
12116 GatherShuffleExtractSeq.insert(I);
12117 CSEBlocks.insert(I->getParent());
12118 }
12119 return Vec;
12120 }
12121 Value *createIdentity(Value *V) { return V; }
12122 Value *createPoison(Type *Ty, unsigned VF) {
12123 return PoisonValue::get(getWidenedType(Ty, VF));
12124 }
12125 /// Resizes the 2 input vectors to match their sizes, if they are not equal
12126 /// yet. The smaller vector is resized to the size of the larger vector.
12127 void resizeToMatch(Value *&V1, Value *&V2) {
12128 if (V1->getType() == V2->getType())
12129 return;
12130 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
12131 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
12132 int VF = std::max(V1VF, V2VF);
12133 int MinVF = std::min(V1VF, V2VF);
12134 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
12135 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
12136 0);
12137 Value *&Op = MinVF == V1VF ? V1 : V2;
12138 Op = Builder.CreateShuffleVector(Op, IdentityMask);
12139 if (auto *I = dyn_cast<Instruction>(Op)) {
12140 GatherShuffleExtractSeq.insert(I);
12141 CSEBlocks.insert(I->getParent());
12142 }
12143 if (MinVF == V1VF)
12144 V1 = Op;
12145 else
12146 V2 = Op;
12147 }
12148 };
12149
12150 /// Smart shuffle instruction emission, walks through shuffle trees and
12151 /// tries to find the best matching vector for the actual shuffle
12152 /// instruction.
12153 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
12154 assert(V1 && "Expected at least one vector value.");
12155 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
12156 R.CSEBlocks, *R.DL);
12157 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
12158 ShuffleBuilder);
12159 }
12160
12161 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12162 /// shuffle emission.
12163 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12164 ArrayRef<int> Mask) {
12165 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12166 if (Mask[Idx] != PoisonMaskElem)
12167 CommonMask[Idx] = Idx;
12168 }
12169
12170 /// Cast value \p V to the vector type with the same number of elements, but
12171 /// the base type \p ScalarTy.
12172 Value *castToScalarTyElem(Value *V,
12173 std::optional<bool> IsSigned = std::nullopt) {
12174 auto *VecTy = cast<VectorType>(V->getType());
12175 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
12176 if (VecTy->getElementType() == ScalarTy->getScalarType())
12177 return V;
12178 return Builder.CreateIntCast(
12179 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
12180 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
12181 }
12182
12183 public:
12184 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
12185 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
12186
12187 /// Adjusts extractelements after reusing them.
12188 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
12189 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
12190 unsigned NumParts, bool &UseVecBaseAsInput) {
12191 UseVecBaseAsInput = false;
12192 SmallPtrSet<Value *, 4> UniqueBases;
12193 Value *VecBase = nullptr;
12194 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12195 int Idx = Mask[I];
12196 if (Idx == PoisonMaskElem)
12197 continue;
12198 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12199 VecBase = EI->getVectorOperand();
12200 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
12201 VecBase = TE->VectorizedValue;
12202 assert(VecBase && "Expected vectorized value.");
12203 UniqueBases.insert(VecBase);
12204 // If the only use is vectorized - we can delete the extractelement
12205 // itself.
12206 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
12207 any_of(EI->users(), [&](User *U) {
12208 const TreeEntry *UTE = R.getTreeEntry(U);
12209 return !UTE || R.MultiNodeScalars.contains(U) ||
12210 (isa<GetElementPtrInst>(U) &&
12211 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
12212 count_if(R.VectorizableTree,
12213 [&](const std::unique_ptr<TreeEntry> &TE) {
12214 return any_of(TE->UserTreeIndices,
12215 [&](const EdgeInfo &Edge) {
12216 return Edge.UserTE == UTE;
12217 }) &&
12218 is_contained(TE->Scalars, EI);
12219 }) != 1;
12220 }))
12221 continue;
12222 R.eraseInstruction(EI);
12223 }
12224 if (NumParts == 1 || UniqueBases.size() == 1) {
12225 assert(VecBase && "Expected vectorized value.");
12226 return castToScalarTyElem(VecBase);
12227 }
12228 UseVecBaseAsInput = true;
12229 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
12230 for (auto [I, Idx] : enumerate(Mask))
12231 if (Idx != PoisonMaskElem)
12232 Idx = I;
12233 };
12234 // Perform a multi-register vector shuffle, joining the parts into a single
12235 // virtual long vector.
12236 // Need to shuffle each part independently and then insert all these parts
12237 // into a long virtual vector register, forming the original vector.
12238 Value *Vec = nullptr;
12239 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12240 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12241 for (unsigned Part : seq<unsigned>(NumParts)) {
12242 unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
12243 ArrayRef<Value *> VL =
12244 ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
12245 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
12246 constexpr int MaxBases = 2;
12247 SmallVector<Value *, MaxBases> Bases(MaxBases);
12248 auto VLMask = zip(VL, SubMask);
12249 const unsigned VF = std::accumulate(
12250 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
12251 if (std::get<1>(D) == PoisonMaskElem)
12252 return S;
12253 Value *VecOp =
12254 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
12255 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
12256 VecOp = TE->VectorizedValue;
12257 assert(VecOp && "Expected vectorized value.");
12258 const unsigned Size =
12259 cast<FixedVectorType>(VecOp->getType())->getNumElements();
12260 return std::max(S, Size);
12261 });
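// The accumulated VF is the widest source register seen in this slice;
// dividing a mask index by VF selects which of the (at most two) bases the
// element is taken from.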
12262 for (const auto [V, I] : VLMask) {
12263 if (I == PoisonMaskElem)
12264 continue;
12265 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
12266 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
12267 VecOp = TE->VectorizedValue;
12268 assert(VecOp && "Expected vectorized value.");
12269 VecOp = castToScalarTyElem(VecOp);
12270 Bases[I / VF] = VecOp;
12271 }
12272 if (!Bases.front())
12273 continue;
12274 Value *SubVec;
12275 if (Bases.back()) {
12276 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
12277 TransformToIdentity(SubMask);
12278 } else {
12279 SubVec = Bases.front();
12280 }
12281 if (!Vec) {
12282 Vec = SubVec;
12283 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
12284 [&](unsigned P) {
12285 ArrayRef<int> SubMask =
12286 Mask.slice(P * SliceSize,
12287 getNumElems(Mask.size(),
12288 SliceSize, P));
12289 return all_of(SubMask, [](int Idx) {
12290 return Idx == PoisonMaskElem;
12291 });
12292 })) &&
12293 "Expected first part or all previous parts masked.");
12294 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
12295 } else {
12296 unsigned NewVF =
12297 cast<FixedVectorType>(Vec->getType())->getNumElements();
12298 if (Vec->getType() != SubVec->getType()) {
12299 unsigned SubVecVF =
12300 cast<FixedVectorType>(SubVec->getType())->getNumElements();
12301 NewVF = std::max(NewVF, SubVecVF);
12302 }
12303 // Adjust SubMask.
12304 for (int &Idx : SubMask)
12305 if (Idx != PoisonMaskElem)
12306 Idx += NewVF;
12307 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
12308 Vec = createShuffle(Vec, SubVec, VecMask);
12309 TransformToIdentity(VecMask);
12310 }
12311 }
12312 copy(VecMask, Mask.begin());
12313 return Vec;
12314 }
12315 /// Checks if the specified entry \p E needs to be delayed because of its
12316 /// dependency nodes.
12317 std::optional<Value *>
12318 needToDelay(const TreeEntry *E,
12319 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12320 // No need to delay emission if all deps are ready.
12321 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
12322 return all_of(
12323 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
12324 }))
12325 return std::nullopt;
12326 // Postpone gather emission, will be emitted after the end of the
12327 // process to keep correct order.
12328 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
12329 return Builder.CreateAlignedLoad(
12330 ResVecTy,
12331 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
12332 MaybeAlign());
12333 }
12334 /// Adds 2 input vectors (in form of tree entries) and the mask for their
12335 /// shuffling.
12336 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12337 Value *V1 = E1.VectorizedValue;
12338 if (V1->getType()->isIntOrIntVectorTy())
12339 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12340 return !isKnownNonNegative(
12341 V, SimplifyQuery(*R.DL));
12342 }));
12343 Value *V2 = E2.VectorizedValue;
12344 if (V2->getType()->isIntOrIntVectorTy())
12345 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
12346 return !isKnownNonNegative(
12347 V, SimplifyQuery(*R.DL));
12348 }));
12349 add(V1, V2, Mask);
12350 }
12351 /// Adds single input vector (in form of tree entry) and the mask for its
12352 /// shuffling.
12353 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12354 Value *V1 = E1.VectorizedValue;
12355 if (V1->getType()->isIntOrIntVectorTy())
12356 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12357 return !isKnownNonNegative(
12358 V, SimplifyQuery(*R.DL));
12359 }));
12360 add(V1, Mask);
12361 }
12362 /// Adds 2 input vectors and the mask for their shuffling.
12363 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12364 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12365 assert(isa<FixedVectorType>(V1->getType()) &&
12366 isa<FixedVectorType>(V2->getType()) &&
12367 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
12368 V1 = castToScalarTyElem(V1);
12369 V2 = castToScalarTyElem(V2);
12370 if (InVectors.empty()) {
12371 InVectors.push_back(V1);
12372 InVectors.push_back(V2);
12373 CommonMask.assign(Mask.begin(), Mask.end());
12374 return;
12375 }
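// There are already queued operands: fold them into a single shuffle first,
// then accumulate the new pair on top of the result.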
12376 Value *Vec = InVectors.front();
12377 if (InVectors.size() == 2) {
12378 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12379 transformMaskAfterShuffle(CommonMask, CommonMask);
12380 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
12381 Mask.size()) {
12382 Vec = createShuffle(Vec, nullptr, CommonMask);
12383 transformMaskAfterShuffle(CommonMask, CommonMask);
12384 }
12385 V1 = createShuffle(V1, V2, Mask);
12386 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12387 if (Mask[Idx] != PoisonMaskElem)
12388 CommonMask[Idx] = Idx + Sz;
12389 InVectors.front() = Vec;
12390 if (InVectors.size() == 2)
12391 InVectors.back() = V1;
12392 else
12393 InVectors.push_back(V1);
12394 }
12395 /// Adds one more input vector and the mask for the shuffling.
12396 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12397 assert(isa<FixedVectorType>(V1->getType()) &&
12398 "castToScalarTyElem expects V1 to be FixedVectorType");
12399 V1 = castToScalarTyElem(V1);
12400 if (InVectors.empty()) {
12401 InVectors.push_back(V1);
12402 CommonMask.assign(Mask.begin(), Mask.end());
12403 return;
12404 }
12405 const auto *It = find(InVectors, V1);
12406 if (It == InVectors.end()) {
12407 if (InVectors.size() == 2 ||
12408 InVectors.front()->getType() != V1->getType()) {
12409 Value *V = InVectors.front();
12410 if (InVectors.size() == 2) {
12411 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12412 transformMaskAfterShuffle(CommonMask, CommonMask);
12413 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12414 CommonMask.size()) {
12415 V = createShuffle(InVectors.front(), nullptr, CommonMask);
12416 transformMaskAfterShuffle(CommonMask, CommonMask);
12417 }
12418 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12419 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12420 CommonMask[Idx] =
12421 V->getType() != V1->getType()
12422 ? Idx + Sz
12423 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
12424 ->getNumElements();
12425 if (V->getType() != V1->getType())
12426 V1 = createShuffle(V1, nullptr, Mask);
12427 InVectors.front() = V;
12428 if (InVectors.size() == 2)
12429 InVectors.back() = V1;
12430 else
12431 InVectors.push_back(V1);
12432 return;
12433 }
12434 // Add the second vector only if it supplies elements that are not already
12435 // covered by the first one.
12436 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12437 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12438 InVectors.push_back(V1);
12439 break;
12440 }
12441 }
12442 int VF = getVF(V1);
12443 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12444 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12445 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12446 }
12447 /// Adds one more input vector and the mask for the shuffling.
12448 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12449 SmallVector<int> NewMask;
12450 inversePermutation(Order, NewMask);
12451 add(V1, NewMask);
12452 }
12453 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12454 Value *Root = nullptr) {
12455 return R.gather(VL, Root, ScalarTy);
12456 }
12457 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12458 /// Finalize emission of the shuffles.
12459 /// \param Action the action (if any) to be performed before the final
12460 /// application of the \p ExtMask mask.
12461 Value *
12462 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12463 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12464 IsFinalized = true;
12465 SmallVector<int> NewExtMask(ExtMask);
12466 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
12467 assert(SLPReVec && "FixedVectorType is not expected.");
12468 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
12469 CommonMask);
12470 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
12471 NewExtMask);
12472 ExtMask = NewExtMask;
12473 }
12474 if (Action) {
12475 Value *Vec = InVectors.front();
12476 if (InVectors.size() == 2) {
12477 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12478 InVectors.pop_back();
12479 } else {
12480 Vec = createShuffle(Vec, nullptr, CommonMask);
12481 }
12482 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12483 if (CommonMask[Idx] != PoisonMaskElem)
12484 CommonMask[Idx] = Idx;
12485 assert(VF > 0 &&
12486 "Expected vector length for the final value before action.");
12487 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12488 if (VecVF < VF) {
12489 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12490 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
12491 Vec = createShuffle(Vec, nullptr, ResizeMask);
12492 }
12493 Action(Vec, CommonMask);
12494 InVectors.front() = Vec;
12495 }
12496 if (!ExtMask.empty()) {
12497 if (CommonMask.empty()) {
12498 CommonMask.assign(ExtMask.begin(), ExtMask.end());
12499 } else {
12500 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12501 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12502 if (ExtMask[I] == PoisonMaskElem)
12503 continue;
12504 NewMask[I] = CommonMask[ExtMask[I]];
12505 }
12506 CommonMask.swap(NewMask);
12507 }
12508 }
12509 if (CommonMask.empty()) {
12510 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12511 return InVectors.front();
12512 }
12513 if (InVectors.size() == 2)
12514 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12515 return createShuffle(InVectors.front(), nullptr, CommonMask);
12516 }
12517
12518 ~ShuffleInstructionBuilder() {
12519 assert((IsFinalized || CommonMask.empty()) &&
12520 "Shuffle construction must be finalized.");
12521 }
12522};
12523
12524Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12525 bool PostponedPHIs) {
12526 ValueList &VL = E->getOperand(NodeIdx);
12527 const unsigned VF = VL.size();
12528 InstructionsState S = getSameOpcode(VL, *TLI);
12529 // Special processing for GEPs bundle, which may include non-gep values.
12530 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12531 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12532 if (It != VL.end())
12533 S = getSameOpcode(*It, *TLI);
12534 }
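// If the operand bundle matches an already vectorized tree entry (directly or
// via MultiNodeScalars), reuse its vectorized value, reshuffled to the width
// this user expects, instead of emitting a fresh gather.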
12535 if (S.getOpcode()) {
12536 auto CheckSameVE = [&](const TreeEntry *VE) {
12537 return VE->isSame(VL) &&
12538 (any_of(VE->UserTreeIndices,
12539 [E, NodeIdx](const EdgeInfo &EI) {
12540 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12541 }) ||
12542 any_of(VectorizableTree,
12543 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12544 return TE->isOperandGatherNode({E, NodeIdx}) &&
12545 VE->isSame(TE->Scalars);
12546 }));
12547 };
12548 TreeEntry *VE = getTreeEntry(S.OpValue);
12549 bool IsSameVE = VE && CheckSameVE(VE);
12550 if (!IsSameVE) {
12551 auto It = MultiNodeScalars.find(S.OpValue);
12552 if (It != MultiNodeScalars.end()) {
12553 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12554 return TE != VE && CheckSameVE(TE);
12555 });
12556 if (I != It->getSecond().end()) {
12557 VE = *I;
12558 IsSameVE = true;
12559 }
12560 }
12561 }
12562 if (IsSameVE) {
12563 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12564 // V may be affected by MinBWs.
12565 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
12566 // factor is the number of elements, not their type.
12567 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
12568 unsigned NumElements = getNumElements(VL.front()->getType());
12569 ShuffleInstructionBuilder ShuffleBuilder(
12570 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
12571 : ScalarTy,
12572 Builder, *this);
12573 ShuffleBuilder.add(V, Mask);
12574 return ShuffleBuilder.finalize(std::nullopt);
12575 };
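// The reused entry may have been vectorized with a different vector factor;
// reshuffle its value so it matches the number of elements this operand needs.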
12576 Value *V = vectorizeTree(VE, PostponedPHIs);
12577 if (VF * getNumElements(VL[0]->getType()) !=
12578 cast<FixedVectorType>(V->getType())->getNumElements()) {
12579 if (!VE->ReuseShuffleIndices.empty()) {
12580 // Reshuffle to get only unique values.
12581 // If some of the scalars are duplicated in the vectorization
12582 // tree entry, we do not vectorize them but instead generate a
12583 // mask for the reuses. But if there are several users of the
12584 // same entry, they may have different vectorization factors.
12585 // This is especially important for PHI nodes. In this case, we
12586 // need to adapt the resulting instruction for the user
12587 // vectorization factor and have to reshuffle it again to take
12588 // only unique elements of the vector. Without this code the
12589 // function incorrectly returns reduced vector instruction with
12590 // the same elements, not with the unique ones.
12591
12592 // block:
12593 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12594 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12595 // ... (use %2)
12596 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12597 // br %block
12598 SmallVector<int> Mask(VF, PoisonMaskElem);
12599 for (auto [I, V] : enumerate(VL)) {
12600 if (isa<PoisonValue>(V))
12601 continue;
12602 Mask[I] = VE->findLaneForValue(V);
12603 }
12604 V = FinalShuffle(V, Mask);
12605 } else {
12606 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12607 "Expected vectorization factor less "
12608 "than original vector size.");
12609 SmallVector<int> UniformMask(VF, 0);
12610 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12611 V = FinalShuffle(V, UniformMask);
12612 }
12613 }
12614 // Need to update the operand gather node, if the operand is actually not a
12615 // vectorized node but a buildvector/gather node which matches one of
12616 // the vectorized nodes.
12617 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12618 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12619 }) == VE->UserTreeIndices.end()) {
12620 auto *It = find_if(
12621 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12622 return TE->isGather() &&
12623 TE->UserTreeIndices.front().UserTE == E &&
12624 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12625 });
12626 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12627 (*It)->VectorizedValue = V;
12628 }
12629 return V;
12630 }
12631 }
12632
12633 // Find the corresponding gather entry and vectorize it.
12634 // This allows us to be more accurate with tree/graph transformations and checks
12635 // the correctness of the transformations in many cases.
12636 auto *I = find_if(VectorizableTree,
12637 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12638 return TE->isOperandGatherNode({E, NodeIdx});
12639 });
12640 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12641 assert(I->get()->UserTreeIndices.size() == 1 &&
12642 "Expected only single user for the gather node.");
12643 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12644 return vectorizeTree(I->get(), PostponedPHIs);
12645}
12646
12647template <typename BVTy, typename ResTy, typename... Args>
12648ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12649 Args &...Params) {
12650 assert(E->isGather() && "Expected gather node.");
12651 unsigned VF = E->getVectorFactor();
12652
12653 bool NeedFreeze = false;
12654 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12655 E->ReuseShuffleIndices.end());
12656 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12657 // Build a mask out of the reorder indices and reorder scalars per this
12658 // mask.
12659 SmallVector<int> ReorderMask;
12660 inversePermutation(E->ReorderIndices, ReorderMask);
12661 if (!ReorderMask.empty())
12662 reorderScalars(GatheredScalars, ReorderMask);
12663 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12664 unsigned I, unsigned SliceSize) {
12665 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12666 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12667 }))
12668 return false;
12669 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12670 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12671 if (UserTE->getNumOperands() != 2)
12672 return false;
12673 auto *It =
12674 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12675 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12676 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12677 }) != TE->UserTreeIndices.end();
12678 });
12679 if (It == VectorizableTree.end())
12680 return false;
12681 int Idx;
12682 if ((Mask.size() < InputVF &&
12683 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12684 Idx == 0) ||
12685 (Mask.size() == InputVF &&
12686 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12687 std::iota(
12688 std::next(Mask.begin(), I * SliceSize),
12689 std::next(Mask.begin(),
12690 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12691 0);
12692 } else {
12693 unsigned IVal =
12694 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12695 std::fill(
12696 std::next(Mask.begin(), I * SliceSize),
12697 std::next(Mask.begin(),
12698 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12699 IVal);
12700 }
12701 return true;
12702 };
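// Materialization strategy: prefer reusing the vectors the scalars were
// extracted from, then shuffles of already vectorized tree entries, and only
// gather the scalars that remain.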
12703 BVTy ShuffleBuilder(ScalarTy, Params...);
12704 ResTy Res = ResTy();
12705 SmallVector<int> Mask;
12706 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12707 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12708 Value *ExtractVecBase = nullptr;
12709 bool UseVecBaseAsInput = false;
12710 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
12711 SmallVector<SmallVector<const TreeEntry *>> Entries;
12712 Type *OrigScalarTy = GatheredScalars.front()->getType();
12713 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12714 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12715 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12716 NumParts = 1;
12717 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12718 // Check for gathered extracts.
12719 bool Resized = false;
12720 ExtractShuffles =
12721 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12722 if (!ExtractShuffles.empty()) {
12723 SmallVector<const TreeEntry *> ExtractEntries;
12724 for (auto [Idx, I] : enumerate(ExtractMask)) {
12725 if (I == PoisonMaskElem)
12726 continue;
12727 if (const auto *TE = getTreeEntry(
12728 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12729 ExtractEntries.push_back(TE);
12730 }
12731 if (std::optional<ResTy> Delayed =
12732 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12733 // Delay emission of gathers which are not ready yet.
12734 PostponedGathers.insert(E);
12735 // Postpone gather emission, will be emitted after the end of the
12736 // process to keep correct order.
12737 return *Delayed;
12738 }
12739 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12740 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12741 ExtractVecBase = VecBase;
12742 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12743 if (VF == VecBaseTy->getNumElements() &&
12744 GatheredScalars.size() != VF) {
12745 Resized = true;
12746 GatheredScalars.append(VF - GatheredScalars.size(),
12747 PoisonValue::get(OrigScalarTy));
12748 }
12749 }
12750 }
12751 // Gather extracts only after we check for fully matched gathers.
12752 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12753 E->isAltShuffle() ||
12754 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12755 isSplat(E->Scalars) ||
12756 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12757 GatherShuffles =
12758 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12759 }
12760 if (!GatherShuffles.empty()) {
12761 if (std::optional<ResTy> Delayed =
12762 ShuffleBuilder.needToDelay(E, Entries)) {
12763 // Delay emission of gathers which are not ready yet.
12764 PostponedGathers.insert(E);
12765 // Postpone gather emission, will be emitted after the end of the
12766 // process to keep correct order.
12767 return *Delayed;
12768 }
12769 if (GatherShuffles.size() == 1 &&
12770 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12771 Entries.front().front()->isSame(E->Scalars)) {
12772 // Perfect match in the graph, will reuse the previously vectorized
12773 // node. Cost is 0.
12774 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
12775 << shortBundleName(E->Scalars, E->Idx) << ".\n");
12776 // Restore the mask for previous partially matched values.
12777 Mask.resize(E->Scalars.size());
12778 const TreeEntry *FrontTE = Entries.front().front();
12779 if (FrontTE->ReorderIndices.empty() &&
12780 ((FrontTE->ReuseShuffleIndices.empty() &&
12781 E->Scalars.size() == FrontTE->Scalars.size()) ||
12782 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12783 std::iota(Mask.begin(), Mask.end(), 0);
12784 } else {
12785 for (auto [I, V] : enumerate(E->Scalars)) {
12786 if (isa<PoisonValue>(V)) {
12787 Mask[I] = PoisonMaskElem;
12788 continue;
12789 }
12790 Mask[I] = FrontTE->findLaneForValue(V);
12791 }
12792 }
12793 ShuffleBuilder.add(*FrontTE, Mask);
12794 Res = ShuffleBuilder.finalize(E->getCommonMask());
12795 return Res;
12796 }
12797 if (!Resized) {
12798 if (GatheredScalars.size() != VF &&
12799 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12800 return any_of(TEs, [&](const TreeEntry *TE) {
12801 return TE->getVectorFactor() == VF;
12802 });
12803 }))
12804 GatheredScalars.append(VF - GatheredScalars.size(),
12805 PoisonValue::get(OrigScalarTy));
12806 }
12807 // Remove shuffled elements from list of gathers.
12808 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12809 if (Mask[I] != PoisonMaskElem)
12810 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12811 }
12812 }
12813 }
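// Packs the remaining scalars into unique lanes and builds the matching reuse
// mask; splats are collapsed onto lane 0 so a broadcast can be emitted, and
// undefs that cannot be replaced by a non-poisoned scalar force a freeze of
// the final result.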
12814 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12815 SmallVectorImpl<int> &ReuseMask,
12816 bool IsRootPoison) {
12817 // For splats we can emit broadcasts instead of gathers, so try to find
12818 // such sequences.
12819 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12820 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12821 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12822 SmallVector<int> UndefPos;
12823 DenseMap<Value *, unsigned> UniquePositions;
12824 // Gather unique non-const values and all constant values.
12825 // For repeated values, just shuffle them.
12826 int NumNonConsts = 0;
12827 int SinglePos = 0;
12828 for (auto [I, V] : enumerate(Scalars)) {
12829 if (isa<UndefValue>(V)) {
12830 if (!isa<PoisonValue>(V)) {
12831 ReuseMask[I] = I;
12832 UndefPos.push_back(I);
12833 }
12834 continue;
12835 }
12836 if (isConstant(V)) {
12837 ReuseMask[I] = I;
12838 continue;
12839 }
12840 ++NumNonConsts;
12841 SinglePos = I;
12842 Value *OrigV = V;
12843 Scalars[I] = PoisonValue::get(OrigScalarTy);
12844 if (IsSplat) {
12845 Scalars.front() = OrigV;
12846 ReuseMask[I] = 0;
12847 } else {
12848 const auto Res = UniquePositions.try_emplace(OrigV, I);
12849 Scalars[Res.first->second] = OrigV;
12850 ReuseMask[I] = Res.first->second;
12851 }
12852 }
12853 if (NumNonConsts == 1) {
12854 // Restore single insert element.
12855 if (IsSplat) {
12856 ReuseMask.assign(VF, PoisonMaskElem);
12857 std::swap(Scalars.front(), Scalars[SinglePos]);
12858 if (!UndefPos.empty() && UndefPos.front() == 0)
12859 Scalars.front() = UndefValue::get(OrigScalarTy);
12860 }
12861 ReuseMask[SinglePos] = SinglePos;
12862 } else if (!UndefPos.empty() && IsSplat) {
12863 // For undef values, try to replace them with the simple broadcast.
12864 // We can do it if the broadcasted value is guaranteed to be
12865 // non-poisonous, or by freezing the incoming scalar value first.
12866 auto *It = find_if(Scalars, [this, E](Value *V) {
12867 return !isa<UndefValue>(V) &&
12868 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12869 (E->UserTreeIndices.size() == 1 &&
12870 any_of(V->uses(), [E](const Use &U) {
12871 // Check if the value is already used in the same operation in
12872 // one of the nodes.
12873 return E->UserTreeIndices.front().EdgeIdx !=
12874 U.getOperandNo() &&
12875 is_contained(
12876 E->UserTreeIndices.front().UserTE->Scalars,
12877 U.getUser());
12878 })));
12879 });
12880 if (It != Scalars.end()) {
12881 // Replace undefs by the non-poisoned scalars and emit broadcast.
12882 int Pos = std::distance(Scalars.begin(), It);
12883 for (int I : UndefPos) {
12884 // Set the undef position to the non-poisoned scalar.
12885 ReuseMask[I] = Pos;
12886 // Replace the undef by the poison, in the mask it is replaced by
12887 // non-poisoned scalar already.
12888 if (I != Pos)
12889 Scalars[I] = PoisonValue::get(OrigScalarTy);
12890 }
12891 } else {
12892 // Replace undefs by the poisons, emit broadcast and then emit
12893 // freeze.
12894 for (int I : UndefPos) {
12895 ReuseMask[I] = PoisonMaskElem;
12896 if (isa<UndefValue>(Scalars[I]))
12897 Scalars[I] = PoisonValue::get(OrigScalarTy);
12898 }
12899 NeedFreeze = true;
12900 }
12901 }
12902 };
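// Combine the pieces computed above: the extract/gather shuffles, a
// buildvector of the constants, and, if needed, a gather of the remaining
// non-constant scalars added via the finalize action.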
12903 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12904 bool IsNonPoisoned = true;
12905 bool IsUsedInExpr = true;
12906 Value *Vec1 = nullptr;
12907 if (!ExtractShuffles.empty()) {
12908 // A gather of extractelements can be represented as just a shuffle of
12909 // the single/two vectors the scalars are extracted from.
12910 // Find input vectors.
12911 Value *Vec2 = nullptr;
12912 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12913 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12914 ExtractMask[I] = PoisonMaskElem;
12915 }
12916 if (UseVecBaseAsInput) {
12917 Vec1 = ExtractVecBase;
12918 } else {
12919 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12920 if (ExtractMask[I] == PoisonMaskElem)
12921 continue;
12922 if (isa<UndefValue>(E->Scalars[I]))
12923 continue;
12924 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12925 Value *VecOp = EI->getVectorOperand();
12926 if (const auto *TE = getTreeEntry(VecOp))
12927 if (TE->VectorizedValue)
12928 VecOp = TE->VectorizedValue;
12929 if (!Vec1) {
12930 Vec1 = VecOp;
12931 } else if (Vec1 != VecOp) {
12932 assert((!Vec2 || Vec2 == VecOp) &&
12933 "Expected only 1 or 2 vectors shuffle.");
12934 Vec2 = VecOp;
12935 }
12936 }
12937 }
12938 if (Vec2) {
12939 IsUsedInExpr = false;
12940 IsNonPoisoned &=
12941 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12942 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12943 } else if (Vec1) {
12944 IsUsedInExpr &= FindReusedSplat(
12945 ExtractMask,
12946 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12947 ExtractMask.size());
12948 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12949 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12950 } else {
12951 IsUsedInExpr = false;
12952 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12953 /*ForExtracts=*/true);
12954 }
12955 }
12956 if (!GatherShuffles.empty()) {
12957 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12958 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12959 for (const auto [I, TEs] : enumerate(Entries)) {
12960 if (TEs.empty()) {
12961 assert(!GatherShuffles[I] &&
12962 "No shuffles with empty entries list expected.");
12963 continue;
12964 }
12965 assert((TEs.size() == 1 || TEs.size() == 2) &&
12966 "Expected shuffle of 1 or 2 entries.");
12967 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12968 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12969 VecMask.assign(VecMask.size(), PoisonMaskElem);
12970 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12971 if (TEs.size() == 1) {
12972 IsUsedInExpr &= FindReusedSplat(
12973 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12974 ShuffleBuilder.add(*TEs.front(), VecMask);
12975 if (TEs.front()->VectorizedValue)
12976 IsNonPoisoned &=
12977 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12978 } else {
12979 IsUsedInExpr = false;
12980 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12981 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12982 IsNonPoisoned &=
12983 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12984 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12985 }
12986 }
12987 }
12988 // Try to figure out the best way to combine the values: build a shuffle and
12989 // insert elements, or just build several shuffles.
12990 // Insert non-constant scalars.
12991 SmallVector<Value *> NonConstants(GatheredScalars);
12992 int EMSz = ExtractMask.size();
12993 int MSz = Mask.size();
12994 // Try to build a constant vector and shuffle with it only if currently we
12995 // have a single permutation and more than 1 scalar constant.
12996 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12997 bool IsIdentityShuffle =
12998 ((UseVecBaseAsInput ||
12999 all_of(ExtractShuffles,
13000 [](const std::optional<TTI::ShuffleKind> &SK) {
13001 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
13002 TTI::SK_PermuteSingleSrc;
13003 })) &&
13004 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
13005 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
13006 (!GatherShuffles.empty() &&
13007 all_of(GatherShuffles,
13008 [](const std::optional<TTI::ShuffleKind> &SK) {
13009 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
13010 TTI::SK_PermuteSingleSrc;
13011 }) &&
13012 none_of(Mask, [&](int I) { return I >= MSz; }) &&
13013 ShuffleVectorInst::isIdentityMask(Mask, MSz));
13014 bool EnoughConstsForShuffle =
13015 IsSingleShuffle &&
13016 (none_of(GatheredScalars,
13017 [](Value *V) {
13018 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
13019 }) ||
13020 any_of(GatheredScalars,
13021 [](Value *V) {
13022 return isa<Constant>(V) && !isa<UndefValue>(V);
13023 })) &&
13024 (!IsIdentityShuffle ||
13025 (GatheredScalars.size() == 2 &&
13026 any_of(GatheredScalars,
13027 [](Value *V) { return !isa<UndefValue>(V); })) ||
13028 count_if(GatheredScalars, [](Value *V) {
13029 return isa<Constant>(V) && !isa<PoisonValue>(V);
13030 }) > 1);
13031 // The NonConstants array contains just the non-constant values; GatheredScalars
13032 // contains only the constants, used to build the final vector and then shuffle.
13033 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
13034 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
13035 NonConstants[I] = PoisonValue::get(OrigScalarTy);
13036 else
13037 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
13038 }
13039 // Generate constants for final shuffle and build a mask for them.
13040 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
13041 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
13042 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
13043 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
13044 ShuffleBuilder.add(BV, BVMask);
13045 }
13046 if (all_of(NonConstants, [=](Value *V) {
13047 return isa<PoisonValue>(V) ||
13048 (IsSingleShuffle && ((IsIdentityShuffle &&
13049 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
13050 }))
13051 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
13052 else
13053 Res = ShuffleBuilder.finalize(
13054 E->ReuseShuffleIndices, E->Scalars.size(),
13055 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
13056 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
13057 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
13058 });
13059 } else if (!allConstant(GatheredScalars)) {
13060 // Gather unique scalars and all constants.
13061 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
13062 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
13063 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
13064 ShuffleBuilder.add(BV, ReuseMask);
13065 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
13066 } else {
13067 // Gather all constants.
13068 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
13069 for (auto [I, V] : enumerate(GatheredScalars)) {
13070 if (!isa<PoisonValue>(V))
13071 Mask[I] = I;
13072 }
13073 Value *BV = ShuffleBuilder.gather(GatheredScalars);
13074 ShuffleBuilder.add(BV, Mask);
13075 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
13076 }
13077
13078 if (NeedFreeze)
13079 Res = ShuffleBuilder.createFreeze(Res);
13080 return Res;
13081}
13082
13083Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
13084 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
13085 Builder, *this);
13086}
13087
13088Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
13089 IRBuilderBase::InsertPointGuard Guard(Builder);
13090
13091 if (E->VectorizedValue &&
13092 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
13093 E->isAltShuffle())) {
13094 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
13095 return E->VectorizedValue;
13096 }
13097
13098 Value *V = E->Scalars.front();
13099 Type *ScalarTy = V->getType();
13100 if (auto *Store = dyn_cast<StoreInst>(V))
13101 ScalarTy = Store->getValueOperand()->getType();
13102 else if (auto *IE = dyn_cast<InsertElementInst>(V))
13103 ScalarTy = IE->getOperand(1)->getType();
13104 auto It = MinBWs.find(E);
13105 if (It != MinBWs.end()) {
13106 auto VecTy = dyn_cast<FixedVectorType>(ScalarTy);
13107 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
13108 if (VecTy)
13109 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
13110 }
13111 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
13112 if (E->isGather()) {
13113 // Set insert point for non-reduction initial nodes.
13114 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
13115 setInsertPointAfterBundle(E);
13116 Value *Vec = createBuildVector(E, ScalarTy);
13117 E->VectorizedValue = Vec;
13118 return Vec;
13119 }
13120
13121 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
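// FinalShuffle applies the entry's reorder and reuse masks to a freshly
// created vector value before it is cached as VectorizedValue.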
13122 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
13123 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
13124 if (E->getOpcode() == Instruction::Store &&
13125 E->State == TreeEntry::Vectorize) {
13126 ArrayRef<int> Mask =
13127 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
13128 E->ReorderIndices.size());
13129 ShuffleBuilder.add(V, Mask);
13130 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
13131 ShuffleBuilder.addOrdered(V, std::nullopt);
13132 } else {
13133 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
13134 }
13135 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
13136 };
13137
13138 assert(!E->isGather() && "Unhandled state");
13139 unsigned ShuffleOrOp =
13140 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
13141 Instruction *VL0 = E->getMainOp();
13142 auto GetOperandSignedness = [&](unsigned Idx) {
13143 const TreeEntry *OpE = getOperandEntry(E, Idx);
13144 bool IsSigned = false;
13145 auto It = MinBWs.find(OpE);
13146 if (It != MinBWs.end())
13147 IsSigned = It->second.second;
13148 else
13149 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
13150 return !isKnownNonNegative(R, SimplifyQuery(*DL));
13151 });
13152 return IsSigned;
13153 };
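// Emit the vector code for this node; alternate-opcode entries are dispatched
// to the ShuffleVector case.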
13154 switch (ShuffleOrOp) {
13155 case Instruction::PHI: {
13156 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
13157 E != VectorizableTree.front().get() ||
13158 !E->UserTreeIndices.empty()) &&
13159 "PHI reordering is free.");
13160 if (PostponedPHIs && E->VectorizedValue)
13161 return E->VectorizedValue;
13162 auto *PH = cast<PHINode>(VL0);
13163 Builder.SetInsertPoint(PH->getParent(),
13164 PH->getParent()->getFirstNonPHIIt());
13165 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
13166 if (PostponedPHIs || !E->VectorizedValue) {
13167 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
13168 E->PHI = NewPhi;
13169 Value *V = NewPhi;
13170
13171 // Adjust insertion point once all PHI's have been generated.
13172 Builder.SetInsertPoint(PH->getParent(),
13173 PH->getParent()->getFirstInsertionPt());
13174 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
13175
13176 V = FinalShuffle(V, E, VecTy);
13177
13178 E->VectorizedValue = V;
13179 if (PostponedPHIs)
13180 return V;
13181 }
13182 PHINode *NewPhi = cast<PHINode>(E->PHI);
13183 // If phi node is fully emitted - exit.
13184 if (NewPhi->getNumIncomingValues() != 0)
13185 return NewPhi;
13186
13187 // PHINodes may have multiple entries from the same block. We want to
13188 // visit every block once.
13189 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
13190
13191 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13193 BasicBlock *IBB = PH->getIncomingBlock(I);
13194
13195 // Stop emission if all incoming values are generated.
13196 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
13197 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13198 return NewPhi;
13199 }
13200
13201 if (!VisitedBBs.insert(IBB).second) {
13202 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
13203 continue;
13204 }
13205
13206 Builder.SetInsertPoint(IBB->getTerminator());
13207 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
13208 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
13209 if (VecTy != Vec->getType()) {
13210 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
13211 MinBWs.contains(getOperandEntry(E, I))) &&
13212 "Expected item in MinBWs.");
13213 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
13214 }
13215 NewPhi->addIncoming(Vec, IBB);
13216 }
13217
13218 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
13219 "Invalid number of incoming values");
13220 return NewPhi;
13221 }
13222
13223 case Instruction::ExtractElement: {
13224 Value *V = E->getSingleOperand(0);
13225 if (const TreeEntry *TE = getTreeEntry(V))
13226 V = TE->VectorizedValue;
13227 setInsertPointAfterBundle(E);
13228 V = FinalShuffle(V, E, VecTy);
13229 E->VectorizedValue = V;
13230 return V;
13231 }
13232 case Instruction::ExtractValue: {
13233 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
13234 Builder.SetInsertPoint(LI);
13235 Value *Ptr = LI->getPointerOperand();
13236 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
13237 Value *NewV = propagateMetadata(V, E->Scalars);
13238 NewV = FinalShuffle(NewV, E, VecTy);
13239 E->VectorizedValue = NewV;
13240 return NewV;
13241 }
13242 case Instruction::InsertElement: {
13243 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
13244 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
13245 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
13246 ArrayRef<Value *> Op = E->getOperand(1);
13247 Type *ScalarTy = Op.front()->getType();
13248 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
13249 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
13250 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
13251 assert(Res.first > 0 && "Expected item in MinBWs.");
13252 V = Builder.CreateIntCast(
13253 V,
13254 getWidenedType(
13255 ScalarTy,
13256 cast<FixedVectorType>(V->getType())->getNumElements()),
13257 Res.second);
13258 }
13259
13260 // Create InsertVector shuffle if necessary
13261 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
13262 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
13263 }));
13264 const unsigned NumElts =
13265 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
13266 const unsigned NumScalars = E->Scalars.size();
13267
13268 unsigned Offset = *getElementIndex(VL0);
13269 assert(Offset < NumElts && "Failed to find vector index offset");
13270
13271 // Create shuffle to resize vector
13272 SmallVector<int> Mask;
13273 if (!E->ReorderIndices.empty()) {
13274 inversePermutation(E->ReorderIndices, Mask);
13275 Mask.append(NumElts - NumScalars, PoisonMaskElem);
13276 } else {
13277 Mask.assign(NumElts, PoisonMaskElem);
13278 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
13279 }
13280 // Create InsertVector shuffle if necessary
13281 bool IsIdentity = true;
13282 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
13283 Mask.swap(PrevMask);
13284 for (unsigned I = 0; I < NumScalars; ++I) {
13285 Value *Scalar = E->Scalars[PrevMask[I]];
13286 unsigned InsertIdx = *getElementIndex(Scalar);
13287 IsIdentity &= InsertIdx - Offset == I;
13288 Mask[InsertIdx - Offset] = I;
13289 }
13290 if (!IsIdentity || NumElts != NumScalars) {
13291 Value *V2 = nullptr;
13292 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
13293 SmallVector<int> InsertMask(Mask);
13294 if (NumElts != NumScalars && Offset == 0) {
13295 // Follow all insert element instructions from the current buildvector
13296 // sequence.
13297 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
13298 do {
13299 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
13300 if (!InsertIdx)
13301 break;
13302 if (InsertMask[*InsertIdx] == PoisonMaskElem)
13303 InsertMask[*InsertIdx] = *InsertIdx;
13304 if (!Ins->hasOneUse())
13305 break;
13306 Ins = dyn_cast_or_null<InsertElementInst>(
13307 Ins->getUniqueUndroppableUser());
13308 } while (Ins);
13309 SmallBitVector UseMask =
13310 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13311 SmallBitVector IsFirstPoison =
13312 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13313 SmallBitVector IsFirstUndef =
13314 isUndefVector(FirstInsert->getOperand(0), UseMask);
13315 if (!IsFirstPoison.all()) {
13316 unsigned Idx = 0;
13317 for (unsigned I = 0; I < NumElts; I++) {
13318 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
13319 IsFirstUndef.test(I)) {
13320 if (IsVNonPoisonous) {
13321 InsertMask[I] = I < NumScalars ? I : 0;
13322 continue;
13323 }
13324 if (!V2)
13325 V2 = UndefValue::get(V->getType());
13326 if (Idx >= NumScalars)
13327 Idx = NumScalars - 1;
13328 InsertMask[I] = NumScalars + Idx;
13329 ++Idx;
13330 } else if (InsertMask[I] != PoisonMaskElem &&
13331 Mask[I] == PoisonMaskElem) {
13332 InsertMask[I] = PoisonMaskElem;
13333 }
13334 }
13335 } else {
13336 InsertMask = Mask;
13337 }
13338 }
13339 if (!V2)
13340 V2 = PoisonValue::get(V->getType());
13341 V = Builder.CreateShuffleVector(V, V2, InsertMask);
13342 if (auto *I = dyn_cast<Instruction>(V)) {
13343 GatherShuffleExtractSeq.insert(I);
13344 CSEBlocks.insert(I->getParent());
13345 }
13346 }
13347
13348 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13349 for (unsigned I = 0; I < NumElts; I++) {
13350 if (Mask[I] != PoisonMaskElem)
13351 InsertMask[Offset + I] = I;
13352 }
13353 SmallBitVector UseMask =
13354 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13355 SmallBitVector IsFirstUndef =
13356 isUndefVector(FirstInsert->getOperand(0), UseMask);
13357 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13358 NumElts != NumScalars) {
13359 if (IsFirstUndef.all()) {
13360 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
13361 SmallBitVector IsFirstPoison =
13362 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13363 if (!IsFirstPoison.all()) {
13364 for (unsigned I = 0; I < NumElts; I++) {
13365 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
13366 InsertMask[I] = I + NumElts;
13367 }
13368 }
13369 V = Builder.CreateShuffleVector(
13370 V,
13371 IsFirstPoison.all() ? PoisonValue::get(V->getType())
13372 : FirstInsert->getOperand(0),
13373 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
13374 if (auto *I = dyn_cast<Instruction>(V)) {
13375 GatherShuffleExtractSeq.insert(I);
13376 CSEBlocks.insert(I->getParent());
13377 }
13378 }
13379 } else {
13380 SmallBitVector IsFirstPoison =
13381 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13382 for (unsigned I = 0; I < NumElts; I++) {
13383 if (InsertMask[I] == PoisonMaskElem)
13384 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
13385 else
13386 InsertMask[I] += NumElts;
13387 }
13388 V = Builder.CreateShuffleVector(
13389 FirstInsert->getOperand(0), V, InsertMask,
13390 cast<Instruction>(E->Scalars.back())->getName());
13391 if (auto *I = dyn_cast<Instruction>(V)) {
13392 GatherShuffleExtractSeq.insert(I);
13393 CSEBlocks.insert(I->getParent());
13394 }
13395 }
13396 }
13397
13398 ++NumVectorInstructions;
13399 E->VectorizedValue = V;
13400 return V;
13401 }
13402 case Instruction::ZExt:
13403 case Instruction::SExt:
13404 case Instruction::FPToUI:
13405 case Instruction::FPToSI:
13406 case Instruction::FPExt:
13407 case Instruction::PtrToInt:
13408 case Instruction::IntToPtr:
13409 case Instruction::SIToFP:
13410 case Instruction::UIToFP:
13411 case Instruction::Trunc:
13412 case Instruction::FPTrunc:
13413 case Instruction::BitCast: {
13414 setInsertPointAfterBundle(E);
13415
13416 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
13417 if (E->VectorizedValue) {
13418 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13419 return E->VectorizedValue;
13420 }
13421
13422 auto *CI = cast<CastInst>(VL0);
13423 Instruction::CastOps VecOpcode = CI->getOpcode();
13424 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
13425 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
13426 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
13427 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13428 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
13429 // Check if the values are candidates to demote.
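        // Illustrative example: if the scalar cast was zext i8 -> i32 and
        // minimum-bitwidth analysis demoted the result to i16, the vector cast
        // is emitted as a zext (or sext, depending on the recorded signedness)
        // of <N x i8> to <N x i16>; if source and result end up with the same
        // demoted width, the cast degenerates to a bitcast and the operand
        // vector is simply reused below.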
13430 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
13431 if (SrcIt != MinBWs.end())
13432 SrcBWSz = SrcIt->second.first;
13433 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
13434 if (BWSz == SrcBWSz) {
13435 VecOpcode = Instruction::BitCast;
13436 } else if (BWSz < SrcBWSz) {
13437 VecOpcode = Instruction::Trunc;
13438 } else if (It != MinBWs.end()) {
13439 assert(BWSz > SrcBWSz && "Invalid cast!");
13440 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13441 } else if (SrcIt != MinBWs.end()) {
13442 assert(BWSz > SrcBWSz && "Invalid cast!");
13443 VecOpcode =
13444 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13445 }
13446 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13447 !SrcIt->second.second) {
13448 VecOpcode = Instruction::UIToFP;
13449 }
13450 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13451 ? InVec
13452 : Builder.CreateCast(VecOpcode, InVec, VecTy);
13453 V = FinalShuffle(V, E, VecTy);
13454
13455 E->VectorizedValue = V;
13456 ++NumVectorInstructions;
13457 return V;
13458 }
13459 case Instruction::FCmp:
13460 case Instruction::ICmp: {
13461 setInsertPointAfterBundle(E);
13462
13463 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
13464 if (E->VectorizedValue) {
13465 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13466 return E->VectorizedValue;
13467 }
13468 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
13469 if (E->VectorizedValue) {
13470 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13471 return E->VectorizedValue;
13472 }
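      // If minimum-bitwidth analysis demoted the two compare operands to
      // different widths (e.g. <4 x i8> vs. <4 x i16>), extend the narrower
      // operand to the wider type before emitting the compare, using the
      // signedness recorded for that operand.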
13473 if (L->getType() != R->getType()) {
13474 assert((getOperandEntry(E, 0)->isGather() ||
13475 getOperandEntry(E, 1)->isGather() ||
13476 MinBWs.contains(getOperandEntry(E, 0)) ||
13477 MinBWs.contains(getOperandEntry(E, 1))) &&
13478 "Expected item in MinBWs.");
13479 if (cast<VectorType>(L->getType())
13480 ->getElementType()
13481 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
13482 ->getElementType()
13483 ->getIntegerBitWidth()) {
13484 Type *CastTy = R->getType();
13485 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
13486 } else {
13487 Type *CastTy = L->getType();
13488 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
13489 }
13490 }
13491
13492 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13493 Value *V = Builder.CreateCmp(P0, L, R);
13494 propagateIRFlags(V, E->Scalars, VL0);
13495 // Do not cast for cmps.
13496 VecTy = cast<FixedVectorType>(V->getType());
13497 V = FinalShuffle(V, E, VecTy);
13498
13499 E->VectorizedValue = V;
13500 ++NumVectorInstructions;
13501 return V;
13502 }
13503 case Instruction::Select: {
13504 setInsertPointAfterBundle(E);
13505
13506 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
13507 if (E->VectorizedValue) {
13508 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13509 return E->VectorizedValue;
13510 }
13511 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13512 if (E->VectorizedValue) {
13513 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13514 return E->VectorizedValue;
13515 }
13516 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13517 if (E->VectorizedValue) {
13518 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13519 return E->VectorizedValue;
13520 }
13521 if (True->getType() != VecTy || False->getType() != VecTy) {
13522 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13523 getOperandEntry(E, 2)->isGather() ||
13524 MinBWs.contains(getOperandEntry(E, 1)) ||
13525 MinBWs.contains(getOperandEntry(E, 2))) &&
13526 "Expected item in MinBWs.");
13527 if (True->getType() != VecTy)
13528 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13529 if (False->getType() != VecTy)
13530 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13531 }
13532
13533 unsigned CondNumElements = getNumElements(Cond->getType());
13534 unsigned TrueNumElements = getNumElements(True->getType());
13535 assert(TrueNumElements >= CondNumElements &&
13536 TrueNumElements % CondNumElements == 0 &&
13537 "Cannot vectorize Instruction::Select");
13538 assert(TrueNumElements == getNumElements(False->getType()) &&
13539 "Cannot vectorize Instruction::Select");
13540 if (CondNumElements != TrueNumElements) {
13541        // When the condition has fewer elements than the selected values, we
13542        // need to replicate each condition element to cover the matching lanes.
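        // E.g. (illustrative): a <4 x i1> condition selecting between
        // <8 x i32> values is widened with the replicated mask
        // <0, 0, 1, 1, 2, 2, 3, 3>.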
13543 Cond = Builder.CreateShuffleVector(
13544 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
13545 CondNumElements));
13546 }
13547 assert(getNumElements(Cond->getType()) == TrueNumElements &&
13548 "Cannot vectorize Instruction::Select");
13549 Value *V = Builder.CreateSelect(Cond, True, False);
13550 V = FinalShuffle(V, E, VecTy);
13551
13552 E->VectorizedValue = V;
13553 ++NumVectorInstructions;
13554 return V;
13555 }
13556 case Instruction::FNeg: {
13557 setInsertPointAfterBundle(E);
13558
13559 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13560
13561 if (E->VectorizedValue) {
13562 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13563 return E->VectorizedValue;
13564 }
13565
13566 Value *V = Builder.CreateUnOp(
13567 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13568 propagateIRFlags(V, E->Scalars, VL0);
13569 if (auto *I = dyn_cast<Instruction>(V))
13570 V = propagateMetadata(I, E->Scalars);
13571
13572 V = FinalShuffle(V, E, VecTy);
13573
13574 E->VectorizedValue = V;
13575 ++NumVectorInstructions;
13576
13577 return V;
13578 }
13579 case Instruction::Freeze: {
13580 setInsertPointAfterBundle(E);
13581
13582 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13583
13584 if (E->VectorizedValue) {
13585 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13586 return E->VectorizedValue;
13587 }
13588
13589 Value *V = Builder.CreateFreeze(Op);
13590 V = FinalShuffle(V, E, VecTy);
13591
13592 E->VectorizedValue = V;
13593 ++NumVectorInstructions;
13594
13595 return V;
13596 }
13597 case Instruction::Add:
13598 case Instruction::FAdd:
13599 case Instruction::Sub:
13600 case Instruction::FSub:
13601 case Instruction::Mul:
13602 case Instruction::FMul:
13603 case Instruction::UDiv:
13604 case Instruction::SDiv:
13605 case Instruction::FDiv:
13606 case Instruction::URem:
13607 case Instruction::SRem:
13608 case Instruction::FRem:
13609 case Instruction::Shl:
13610 case Instruction::LShr:
13611 case Instruction::AShr:
13612 case Instruction::And:
13613 case Instruction::Or:
13614 case Instruction::Xor: {
13615 setInsertPointAfterBundle(E);
13616
13617 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13618 if (E->VectorizedValue) {
13619 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13620 return E->VectorizedValue;
13621 }
13622 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13623 if (E->VectorizedValue) {
13624 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13625 return E->VectorizedValue;
13626 }
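      // For an 'and' with a demoted bit width: if every scalar in one operand
      // bundle is a constant with at least that many trailing ones, the 'and'
      // does not change the demoted value, so the other (vectorized) operand
      // can be forwarded directly (illustrative case: values demoted to i16
      // and a mask constant of 0xFFFF).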
13627 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13628 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13629 ArrayRef<Value *> Ops = E->getOperand(I);
13630 if (all_of(Ops, [&](Value *Op) {
13631 auto *CI = dyn_cast<ConstantInt>(Op);
13632 return CI && CI->getValue().countr_one() >= It->second.first;
13633 })) {
13634            Value *V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13635 E->VectorizedValue = V;
13636 ++NumVectorInstructions;
13637 return V;
13638 }
13639 }
13640 }
13641 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13642 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13643 getOperandEntry(E, 1)->isGather() ||
13644 MinBWs.contains(getOperandEntry(E, 0)) ||
13645 MinBWs.contains(getOperandEntry(E, 1))) &&
13646 "Expected item in MinBWs.");
13647 if (LHS->getType() != VecTy)
13648 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13649 if (RHS->getType() != VecTy)
13650 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13651 }
13652
13653 Value *V = Builder.CreateBinOp(
13654 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13655 RHS);
13656 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13657 if (auto *I = dyn_cast<Instruction>(V)) {
13658 V = propagateMetadata(I, E->Scalars);
13659 // Drop nuw flags for abs(sub(commutative), true).
13660 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13661 any_of(E->Scalars, [](Value *V) {
13662 return isCommutative(cast<Instruction>(V));
13663 }))
13664 I->setHasNoUnsignedWrap(/*b=*/false);
13665 }
13666
13667 V = FinalShuffle(V, E, VecTy);
13668
13669 E->VectorizedValue = V;
13670 ++NumVectorInstructions;
13671
13672 return V;
13673 }
13674 case Instruction::Load: {
13675 // Loads are inserted at the head of the tree because we don't want to
13676 // sink them all the way down past store instructions.
13677 setInsertPointAfterBundle(E);
13678
13679 LoadInst *LI = cast<LoadInst>(VL0);
13680 Instruction *NewLI;
13681 Value *PO = LI->getPointerOperand();
13682 if (E->State == TreeEntry::Vectorize) {
13683 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13684 } else if (E->State == TreeEntry::StridedVectorize) {
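        // The strided case is lowered to a vp.strided.load whose stride is a
        // byte distance. Illustrative shape of the emitted call for four i32
        // loads in reversed order (byte stride -4):
        //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
        //            ptr align 4 %base, i64 -4, <4 x i1> splat (i1 true), i32 4)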
13685 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13686 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13687 PO = IsReverseOrder ? PtrN : Ptr0;
13688 std::optional<int> Diff = getPointersDiff(
13689 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13690 Type *StrideTy = DL->getIndexType(PO->getType());
13691 Value *StrideVal;
13692 if (Diff) {
13693 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13694 StrideVal =
13695 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13696 DL->getTypeAllocSize(ScalarTy));
13697 } else {
13698 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13699 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13700 return cast<LoadInst>(V)->getPointerOperand();
13701 });
13702 OrdersType Order;
13703 std::optional<Value *> Stride =
13704 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13705 &*Builder.GetInsertPoint());
13706 Value *NewStride =
13707 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13708 StrideVal = Builder.CreateMul(
13709 NewStride,
13710 ConstantInt::get(
13711 StrideTy,
13712 (IsReverseOrder ? -1 : 1) *
13713 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13714 }
13715 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13716 auto *Inst = Builder.CreateIntrinsic(
13717 Intrinsic::experimental_vp_strided_load,
13718 {VecTy, PO->getType(), StrideTy},
13719 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13720 Builder.getInt32(E->Scalars.size())});
13721 Inst->addParamAttr(
13722 /*ArgNo=*/0,
13723 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13724 NewLI = Inst;
13725 } else {
13726 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13727 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13728 if (E->VectorizedValue) {
13729 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13730 return E->VectorizedValue;
13731 }
13732 // Use the minimum alignment of the gathered loads.
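        // Illustrative shape of the emitted gather (assuming <4 x float> loads
        // with a common alignment of 4):
        //   %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(
        //            <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true),
        //            <4 x float> poison)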
13733 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13734 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13735 }
13736 Value *V = propagateMetadata(NewLI, E->Scalars);
13737
13738 V = FinalShuffle(V, E, VecTy);
13739 E->VectorizedValue = V;
13740 ++NumVectorInstructions;
13741 return V;
13742 }
13743 case Instruction::Store: {
13744 auto *SI = cast<StoreInst>(VL0);
13745
13746 setInsertPointAfterBundle(E);
13747
13748 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13749 if (VecValue->getType() != VecTy)
13750 VecValue =
13751 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13752 VecValue = FinalShuffle(VecValue, E, VecTy);
13753
13754 Value *Ptr = SI->getPointerOperand();
13755 Instruction *ST;
13756 if (E->State == TreeEntry::Vectorize) {
13757 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13758 } else {
13759 assert(E->State == TreeEntry::StridedVectorize &&
13760               "Expected either strided or consecutive stores.");
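        // Illustrative shape of the emitted call for four i32 stores; the code
        // below always uses a negative byte stride (here -4), taking the base
        // pointer from the store that comes first in the new order when the
        // scalars were reordered:
        //   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
        //       <4 x i32> %val, ptr align 4 %base, i64 -4,
        //       <4 x i1> splat (i1 true), i32 4)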
13761 if (!E->ReorderIndices.empty()) {
13762 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13763 Ptr = SI->getPointerOperand();
13764 }
13765 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13766 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13767 auto *Inst = Builder.CreateIntrinsic(
13768 Intrinsic::experimental_vp_strided_store,
13769 {VecTy, Ptr->getType(), StrideTy},
13770 {VecValue, Ptr,
13771 ConstantInt::get(
13772 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13773 Builder.getAllOnesMask(VecTy->getElementCount()),
13774 Builder.getInt32(E->Scalars.size())});
13775 Inst->addParamAttr(
13776 /*ArgNo=*/1,
13777 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13778 ST = Inst;
13779 }
13780
13781 Value *V = propagateMetadata(ST, E->Scalars);
13782
13783 E->VectorizedValue = V;
13784 ++NumVectorInstructions;
13785 return V;
13786 }
13787 case Instruction::GetElementPtr: {
13788 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13789 setInsertPointAfterBundle(E);
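      // The pointer and index operands are all vectorized, so the rebuilt GEP
      // produces a vector of pointers, one lane per scalar GEP in the bundle.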
13790
13791 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13792 if (E->VectorizedValue) {
13793 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13794 return E->VectorizedValue;
13795 }
13796
13797 SmallVector<Value *> OpVecs;
13798 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13799 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13800 if (E->VectorizedValue) {
13801 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13802 return E->VectorizedValue;
13803 }
13804 OpVecs.push_back(OpVec);
13805 }
13806
13807 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13808      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13809        SmallVector<Value *> GEPs;
13810 for (Value *V : E->Scalars) {
13811 if (isa<GetElementPtrInst>(V))
13812 GEPs.push_back(V);
13813 }
13814 V = propagateMetadata(I, GEPs);
13815 }
13816
13817 V = FinalShuffle(V, E, VecTy);
13818
13819 E->VectorizedValue = V;
13820 ++NumVectorInstructions;
13821
13822 return V;
13823 }
13824 case Instruction::Call: {
13825 CallInst *CI = cast<CallInst>(VL0);
13826 setInsertPointAfterBundle(E);
13827
13828      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13829
13830      SmallVector<Type *> ArgTys =
13831          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13832 It != MinBWs.end() ? It->second.first : 0);
13833 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13834 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13835 VecCallCosts.first <= VecCallCosts.second;
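      // getVectorCallCosts returns {intrinsic cost, vector-library cost}; the
      // intrinsic form is preferred when the call maps to a known intrinsic
      // and is not more expensive than a library function found through
      // VFDatabase.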
13836
13837 Value *ScalarArg = nullptr;
13838 SmallVector<Value *> OpVecs;
13839 SmallVector<Type *, 2> TysForDecl;
13840 // Add return type if intrinsic is overloaded on it.
13841 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13842 TysForDecl.push_back(VecTy);
13843 auto *CEI = cast<CallInst>(VL0);
13844 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13845 ValueList OpVL;
13846 // Some intrinsics have scalar arguments. This argument should not be
13847 // vectorized.
13848 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13849 ScalarArg = CEI->getArgOperand(I);
13850        // If we decided to reduce the bitwidth of the abs intrinsic, its second
13851        // argument must be set to false (do not return poison if the value is the signed minimum).
13852 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13853 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13854 ScalarArg = Builder.getFalse();
13855        OpVecs.push_back(ScalarArg);
13856        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13857 TysForDecl.push_back(ScalarArg->getType());
13858 continue;
13859 }
13860
13861 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13862 if (E->VectorizedValue) {
13863 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13864 return E->VectorizedValue;
13865 }
13866 ScalarArg = CEI->getArgOperand(I);
13867 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13868 ScalarArg->getType()->getScalarType() &&
13869 It == MinBWs.end()) {
13870 auto *CastTy =
13871 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
13872 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13873 } else if (It != MinBWs.end()) {
13874 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13875 }
13876 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13877 OpVecs.push_back(OpVec);
13878 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13879 TysForDecl.push_back(OpVec->getType());
13880 }
13881
13882 Function *CF;
13883 if (!UseIntrinsic) {
13884        VFShape Shape =
13885            VFShape::get(CI->getFunctionType(),
13886                         ElementCount::getFixed(
13887 static_cast<unsigned>(VecTy->getNumElements())),
13888 false /*HasGlobalPred*/);
13889 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13890 } else {
13891 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13892 }
13893
13894      SmallVector<OperandBundleDef, 1> OpBundles;
13895 CI->getOperandBundlesAsDefs(OpBundles);
13896 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13897
13898 propagateIRFlags(V, E->Scalars, VL0);
13899 V = FinalShuffle(V, E, VecTy);
13900
13901 E->VectorizedValue = V;
13902 ++NumVectorInstructions;
13903 return V;
13904 }
13905 case Instruction::ShuffleVector: {
13906 Value *V;
13907 if (SLPReVec && !E->isAltShuffle()) {
13908 assert(E->ReuseShuffleIndices.empty() &&
13909             "ReuseShuffleIndices is not supported yet.");
13910      assert(E->ReorderIndices.empty() && "ReorderIndices is not supported yet.");
13911 setInsertPointAfterBundle(E);
13912 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
13913 if (E->VectorizedValue) {
13914 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13915 return E->VectorizedValue;
13916 }
13917      // The current shufflevector usage always duplicates the source.
13918 V = Builder.CreateShuffleVector(Src,
13919 calculateShufflevectorMask(E->Scalars));
13920 propagateIRFlags(V, E->Scalars, VL0);
13921 } else {
13922 assert(E->isAltShuffle() &&
13923 ((Instruction::isBinaryOp(E->getOpcode()) &&
13924 Instruction::isBinaryOp(E->getAltOpcode())) ||
13925 (Instruction::isCast(E->getOpcode()) &&
13926 Instruction::isCast(E->getAltOpcode())) ||
13927 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13928 "Invalid Shuffle Vector Operand");
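      // Illustrative: for the scalar bundle {a0+b0, a1-b1, a2+b2, a3-b3} both
      // vector opcodes are emitted and then blended:
      //   %va = add <4 x i32> %L, %R
      //   %vs = sub <4 x i32> %L, %R
      //   %v  = shufflevector <4 x i32> %va, <4 x i32> %vs,
      //                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>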
13929
13930 Value *LHS = nullptr, *RHS = nullptr;
13931 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13932 setInsertPointAfterBundle(E);
13933 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13934 if (E->VectorizedValue) {
13935 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13936 return E->VectorizedValue;
13937 }
13938 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13939 } else {
13940 setInsertPointAfterBundle(E);
13941 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13942 }
13943 if (E->VectorizedValue) {
13944 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13945 return E->VectorizedValue;
13946 }
13947 if (LHS && RHS &&
13948 ((Instruction::isBinaryOp(E->getOpcode()) &&
13949 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13950 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13951 assert((It != MinBWs.end() ||
13952 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13953 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13954 MinBWs.contains(getOperandEntry(E, 0)) ||
13955 MinBWs.contains(getOperandEntry(E, 1))) &&
13956 "Expected item in MinBWs.");
13957 Type *CastTy = VecTy;
13958 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13959 if (cast<VectorType>(LHS->getType())
13960 ->getElementType()
13961 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13962 ->getElementType()
13963 ->getIntegerBitWidth())
13964 CastTy = RHS->getType();
13965 else
13966 CastTy = LHS->getType();
13967 }
13968 if (LHS->getType() != CastTy)
13969 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13970 if (RHS->getType() != CastTy)
13971 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13972 }
13973
13974 Value *V0, *V1;
13975 if (Instruction::isBinaryOp(E->getOpcode())) {
13976 V0 = Builder.CreateBinOp(
13977 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13978 V1 = Builder.CreateBinOp(
13979 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13980 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13981 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13982 auto *AltCI = cast<CmpInst>(E->getAltOp());
13983 CmpInst::Predicate AltPred = AltCI->getPredicate();
13984 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13985 } else {
13986 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13987 unsigned SrcBWSz = DL->getTypeSizeInBits(
13988 cast<VectorType>(LHS->getType())->getElementType());
13989 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13990 if (BWSz <= SrcBWSz) {
13991 if (BWSz < SrcBWSz)
13992 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13993 assert(LHS->getType() == VecTy &&
13994 "Expected same type as operand.");
13995 if (auto *I = dyn_cast<Instruction>(LHS))
13996 LHS = propagateMetadata(I, E->Scalars);
13997 E->VectorizedValue = LHS;
13998 ++NumVectorInstructions;
13999 return LHS;
14000 }
14001 }
14002 V0 = Builder.CreateCast(
14003 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
14004 V1 = Builder.CreateCast(
14005 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
14006 }
14007 // Add V0 and V1 to later analysis to try to find and remove matching
14008 // instruction, if any.
14009 for (Value *V : {V0, V1}) {
14010 if (auto *I = dyn_cast<Instruction>(V)) {
14011 GatherShuffleExtractSeq.insert(I);
14012 CSEBlocks.insert(I->getParent());
14013 }
14014 }
14015
14016 // Create shuffle to take alternate operations from the vector.
14017 // Also, gather up main and alt scalar ops to propagate IR flags to
14018 // each vector operation.
14019      ValueList OpScalars, AltScalars;
14020      SmallVector<int> Mask;
14021 E->buildAltOpShuffleMask(
14022 [E, this](Instruction *I) {
14023 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
14024 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
14025 *TLI);
14026 },
14027 Mask, &OpScalars, &AltScalars);
14028
14029 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
14030 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
14031 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
14032 // Drop nuw flags for abs(sub(commutative), true).
14033 if (auto *I = dyn_cast<Instruction>(Vec);
14034 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
14035 any_of(E->Scalars, [](Value *V) {
14036 auto *IV = cast<Instruction>(V);
14037 return IV->getOpcode() == Instruction::Sub &&
14038 isCommutative(cast<Instruction>(IV));
14039 }))
14040 I->setHasNoUnsignedWrap(/*b=*/false);
14041 };
14042 DropNuwFlag(V0, E->getOpcode());
14043 DropNuwFlag(V1, E->getAltOpcode());
14044
14045 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
14046 assert(SLPReVec && "FixedVectorType is not expected.");
14048 }
14049 V = Builder.CreateShuffleVector(V0, V1, Mask);
14050 }
14051 if (auto *I = dyn_cast<Instruction>(V)) {
14052 V = propagateMetadata(I, E->Scalars);
14053 GatherShuffleExtractSeq.insert(I);
14054 CSEBlocks.insert(I->getParent());
14055 }
14056
14057 E->VectorizedValue = V;
14058 ++NumVectorInstructions;
14059
14060 return V;
14061 }
14062 default:
14063 llvm_unreachable("unknown inst");
14064 }
14065 return nullptr;
14066}
14067
14068Value *BoUpSLP::vectorizeTree() {
14069 ExtraValueToDebugLocsMap ExternallyUsedValues;
14070 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
14071 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
14072}
14073
14074namespace {
14075/// Data type for handling buildvector sequences with the reused scalars from
14076/// other tree entries.
14077struct ShuffledInsertData {
14078 /// List of insertelements to be replaced by shuffles.
14079 SmallVector<InsertElementInst *> InsertElements;
14080  /// The parent vectors and shuffle mask for the given list of inserts.
14081  MapVector<Value *, SmallVector<int>> ValueMasks;
14082};
14083} // namespace
14084
14085Value *BoUpSLP::vectorizeTree(
14086 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
14087 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
14088 Instruction *ReductionRoot) {
14089 // All blocks must be scheduled before any instructions are inserted.
14090 for (auto &BSIter : BlocksSchedules) {
14091 scheduleBlock(BSIter.second.get());
14092 }
14093 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
14094 // need to rebuild it.
14095 EntryToLastInstruction.clear();
14096
14097 if (ReductionRoot)
14098 Builder.SetInsertPoint(ReductionRoot->getParent(),
14099 ReductionRoot->getIterator());
14100 else
14101 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14102
14103  // Postpone emission of PHI operands to avoid cyclic dependency issues.
14104 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
14105 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
14106 if (TE->State == TreeEntry::Vectorize &&
14107 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
14108 TE->VectorizedValue)
14109 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
14110 // Run through the list of postponed gathers and emit them, replacing the temp
14111 // emitted allocas with actual vector instructions.
14112  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
14113  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
14114 for (const TreeEntry *E : PostponedNodes) {
14115 auto *TE = const_cast<TreeEntry *>(E);
14116 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
14117 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
14118 TE->UserTreeIndices.front().EdgeIdx)) &&
14119 VecTE->isSame(TE->Scalars))
14120 // Found gather node which is absolutely the same as one of the
14121 // vectorized nodes. It may happen after reordering.
14122 continue;
14123 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
14124 TE->VectorizedValue = nullptr;
14125 auto *UserI =
14126 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
14127    // If the user is a PHI node, its vector code has to be inserted right before
14128    // the block terminator. Since the node was delayed, there were some unresolved
14129    // dependencies at the moment when the stub instruction was emitted. If any of
14130    // these dependencies turn out to be an operand of another PHI coming from this
14131    // same block, the position of the stub instruction becomes invalid. This is
14132    // because the source vector that is supposed to feed this gather node was
14133    // inserted at the end of the block [after the stub instruction]. So we need
14134    // to adjust the insertion point again to the end of the block.
14135 if (isa<PHINode>(UserI)) {
14136 // Insert before all users.
14137 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
14138 for (User *U : PrevVec->users()) {
14139 if (U == UserI)
14140 continue;
14141 auto *UI = dyn_cast<Instruction>(U);
14142 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
14143 continue;
14144 if (UI->comesBefore(InsertPt))
14145 InsertPt = UI;
14146 }
14147 Builder.SetInsertPoint(InsertPt);
14148 } else {
14149 Builder.SetInsertPoint(PrevVec);
14150 }
14151 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
14152 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
14153 if (Vec->getType() != PrevVec->getType()) {
14154 assert(Vec->getType()->isIntOrIntVectorTy() &&
14155 PrevVec->getType()->isIntOrIntVectorTy() &&
14156 "Expected integer vector types only.");
14157 std::optional<bool> IsSigned;
14158 for (Value *V : TE->Scalars) {
14159 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
14160 auto It = MinBWs.find(BaseTE);
14161 if (It != MinBWs.end()) {
14162 IsSigned = IsSigned.value_or(false) || It->second.second;
14163 if (*IsSigned)
14164 break;
14165 }
14166 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
14167 auto It = MinBWs.find(MNTE);
14168 if (It != MinBWs.end()) {
14169 IsSigned = IsSigned.value_or(false) || It->second.second;
14170 if (*IsSigned)
14171 break;
14172 }
14173 }
14174 if (IsSigned.value_or(false))
14175 break;
14176 // Scan through gather nodes.
14177 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
14178 auto It = MinBWs.find(BVE);
14179 if (It != MinBWs.end()) {
14180 IsSigned = IsSigned.value_or(false) || It->second.second;
14181 if (*IsSigned)
14182 break;
14183 }
14184 }
14185 if (IsSigned.value_or(false))
14186 break;
14187 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
14188 IsSigned =
14189 IsSigned.value_or(false) ||
14190 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
14191 continue;
14192 }
14193 if (IsSigned.value_or(false))
14194 break;
14195 }
14196 }
14197 if (IsSigned.value_or(false)) {
14198 // Final attempt - check user node.
14199 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
14200 if (It != MinBWs.end())
14201 IsSigned = It->second.second;
14202 }
14203 assert(IsSigned &&
14204 "Expected user node or perfect diamond match in MinBWs.");
14205 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
14206 }
14207 PrevVec->replaceAllUsesWith(Vec);
14208 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
14209 // Replace the stub vector node, if it was used before for one of the
14210 // buildvector nodes already.
14211 auto It = PostponedValues.find(PrevVec);
14212 if (It != PostponedValues.end()) {
14213 for (TreeEntry *VTE : It->getSecond())
14214 VTE->VectorizedValue = Vec;
14215 }
14216 eraseInstruction(PrevVec);
14217 }
14218
14219 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
14220 << " values .\n");
14221
14222 SmallVector<ShuffledInsertData> ShuffledInserts;
14223 // Maps vector instruction to original insertelement instruction
14224 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
14225 // Maps extract Scalar to the corresponding extractelement instruction in the
14226  // basic block. Only one extractelement per block should be emitted.
14227  DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
14228 ScalarToEEs;
14229  SmallDenseSet<Value *, 4> UsedInserts;
14230  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
14231 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
14232 // Extract all of the elements with the external uses.
14233 for (const auto &ExternalUse : ExternalUses) {
14234 Value *Scalar = ExternalUse.Scalar;
14235 llvm::User *User = ExternalUse.User;
14236
14237    // Skip users that we already RAUW'ed. This happens when one instruction
14238 // has multiple uses of the same value.
14239 if (User && !is_contained(Scalar->users(), User))
14240 continue;
14241 TreeEntry *E = getTreeEntry(Scalar);
14242 assert(E && "Invalid scalar");
14243 assert(!E->isGather() && "Extracting from a gather list");
14244 // Non-instruction pointers are not deleted, just skip them.
14245 if (E->getOpcode() == Instruction::GetElementPtr &&
14246 !isa<GetElementPtrInst>(Scalar))
14247 continue;
14248
14249 Value *Vec = E->VectorizedValue;
14250 assert(Vec && "Can't find vectorizable value");
14251
14252 Value *Lane = Builder.getInt32(ExternalUse.Lane);
14253 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
14254 if (Scalar->getType() != Vec->getType()) {
14255 Value *Ex = nullptr;
14256 Value *ExV = nullptr;
14257 auto *Inst = dyn_cast<Instruction>(Scalar);
14258 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
14259 auto It = ScalarToEEs.find(Scalar);
14260 if (It != ScalarToEEs.end()) {
14261 // No need to emit many extracts, just move the only one in the
14262 // current block.
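          // I.e. repeated external uses of the same scalar within one basic
          // block share a single extractelement (and at most one extending
          // cast); the existing one is just moved up if a later use appears
          // above it.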
14263 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
14264 : Builder.GetInsertBlock());
14265 if (EEIt != It->second.end()) {
14266 Value *PrevV = EEIt->second.first;
14267 if (auto *I = dyn_cast<Instruction>(PrevV);
14268 I && !ReplaceInst &&
14269 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
14270 Builder.GetInsertPoint()->comesBefore(I)) {
14271 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
14272 Builder.GetInsertPoint());
14273 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
14274 CI->moveAfter(I);
14275 }
14276 Ex = PrevV;
14277 ExV = EEIt->second.second ? EEIt->second.second : Ex;
14278 }
14279 }
14280 if (!Ex) {
14281 // "Reuse" the existing extract to improve final codegen.
14282 if (ReplaceInst) {
14283            // Leave the instruction as is, if it is cheaper to keep the
14284            // original extract and all its operands are scalar.
14285 auto *CloneInst = Inst->clone();
14286 CloneInst->insertBefore(Inst);
14287 if (Inst->hasName())
14288 CloneInst->takeName(Inst);
14289 Ex = CloneInst;
14290 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
14291 ES && isa<Instruction>(Vec)) {
14292 Value *V = ES->getVectorOperand();
14293 auto *IVec = cast<Instruction>(Vec);
14294 if (const TreeEntry *ETE = getTreeEntry(V))
14295 V = ETE->VectorizedValue;
14296 if (auto *IV = dyn_cast<Instruction>(V);
14297 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
14298 IV->comesBefore(IVec))
14299 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
14300 else
14301 Ex = Builder.CreateExtractElement(Vec, Lane);
14302 } else if (auto *VecTy =
14303 dyn_cast<FixedVectorType>(Scalar->getType())) {
14304 assert(SLPReVec && "FixedVectorType is not expected.");
14305 unsigned VecTyNumElements = VecTy->getNumElements();
14306 // When REVEC is enabled, we need to extract a vector.
14307 // Note: The element size of Scalar may be different from the
14308 // element size of Vec.
14309 Ex = Builder.CreateExtractVector(
14311 VecTyNumElements),
14312 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
14313 } else {
14314 Ex = Builder.CreateExtractElement(Vec, Lane);
14315 }
14316 // If necessary, sign-extend or zero-extend ScalarRoot
14317 // to the larger type.
14318 ExV = Ex;
14319 if (Scalar->getType() != Ex->getType())
14320 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
14321 MinBWs.find(E)->second.second);
14322 auto *I = dyn_cast<Instruction>(Ex);
14323 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
14324 : &F->getEntryBlock(),
14325 std::make_pair(Ex, ExV));
14326 }
14327          // The then branch of the previous if may produce constants, since
14328          // operand 0 might be a constant.
14329 if (auto *ExI = dyn_cast<Instruction>(Ex);
14330 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
14331 GatherShuffleExtractSeq.insert(ExI);
14332 CSEBlocks.insert(ExI->getParent());
14333 }
14334 return ExV;
14335 }
14336 assert(isa<FixedVectorType>(Scalar->getType()) &&
14337 isa<InsertElementInst>(Scalar) &&
14338 "In-tree scalar of vector type is not insertelement?");
14339 auto *IE = cast<InsertElementInst>(Scalar);
14340 VectorToInsertElement.try_emplace(Vec, IE);
14341 return Vec;
14342 };
14343 // If User == nullptr, the Scalar remains as scalar in vectorized
14344 // instructions or is used as extra arg. Generate ExtractElement instruction
14345 // and update the record for this scalar in ExternallyUsedValues.
14346 if (!User) {
14347 if (!ScalarsWithNullptrUser.insert(Scalar).second)
14348 continue;
14349 assert((ExternallyUsedValues.count(Scalar) ||
14350 Scalar->hasNUsesOrMore(UsesLimit) ||
14351 ExternalUsesAsOriginalScalar.contains(Scalar) ||
14352 any_of(Scalar->users(),
14353 [&](llvm::User *U) {
14354 if (ExternalUsesAsOriginalScalar.contains(U))
14355 return true;
14356 TreeEntry *UseEntry = getTreeEntry(U);
14357 return UseEntry &&
14358 (UseEntry->State == TreeEntry::Vectorize ||
14359 UseEntry->State ==
14360 TreeEntry::StridedVectorize) &&
14361 (E->State == TreeEntry::Vectorize ||
14362 E->State == TreeEntry::StridedVectorize) &&
14363 doesInTreeUserNeedToExtract(
14364 Scalar,
14365 cast<Instruction>(UseEntry->Scalars.front()),
14366 TLI);
14367 })) &&
14368 "Scalar with nullptr User must be registered in "
14369 "ExternallyUsedValues map or remain as scalar in vectorized "
14370 "instructions");
14371 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14372 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
14373 if (PHI->getParent()->isLandingPad())
14374 Builder.SetInsertPoint(
14375 PHI->getParent(),
14376 std::next(
14377 PHI->getParent()->getLandingPadInst()->getIterator()));
14378 else
14379 Builder.SetInsertPoint(PHI->getParent(),
14380 PHI->getParent()->getFirstNonPHIIt());
14381 } else {
14382 Builder.SetInsertPoint(VecI->getParent(),
14383 std::next(VecI->getIterator()));
14384 }
14385 } else {
14386 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14387 }
14388 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14389 // Required to update internally referenced instructions.
14390 Scalar->replaceAllUsesWith(NewInst);
14391 ReplacedExternals.emplace_back(Scalar, NewInst);
14392 continue;
14393 }
14394
14395 if (auto *VU = dyn_cast<InsertElementInst>(User);
14396 VU && VU->getOperand(1) == Scalar) {
14397 // Skip if the scalar is another vector op or Vec is not an instruction.
14398 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
14399 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
14400 if (!UsedInserts.insert(VU).second)
14401 continue;
14402 // Need to use original vector, if the root is truncated.
14403 auto BWIt = MinBWs.find(E);
14404 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
14405 auto *ScalarTy = FTy->getElementType();
14406 auto Key = std::make_pair(Vec, ScalarTy);
14407 auto VecIt = VectorCasts.find(Key);
14408 if (VecIt == VectorCasts.end()) {
14409 IRBuilderBase::InsertPointGuard Guard(Builder);
14410 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
14411 if (IVec->getParent()->isLandingPad())
14412 Builder.SetInsertPoint(IVec->getParent(),
14413 std::next(IVec->getParent()
14414 ->getLandingPadInst()
14415 ->getIterator()));
14416 else
14417 Builder.SetInsertPoint(
14418 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14419 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
14420 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14421 }
14422 Vec = Builder.CreateIntCast(
14423                Vec,
14424                getWidenedType(
14425 ScalarTy,
14426 cast<FixedVectorType>(Vec->getType())->getNumElements()),
14427 BWIt->second.second);
14428 VectorCasts.try_emplace(Key, Vec);
14429 } else {
14430 Vec = VecIt->second;
14431 }
14432 }
14433
14434 std::optional<unsigned> InsertIdx = getElementIndex(VU);
14435 if (InsertIdx) {
14436 auto *It =
14437 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14438 // Checks if 2 insertelements are from the same buildvector.
14439              InsertElementInst *VecInsert = Data.InsertElements.front();
14440              return areTwoInsertFromSameBuildVector(
14441 VU, VecInsert,
14442 [](InsertElementInst *II) { return II->getOperand(0); });
14443 });
14444 unsigned Idx = *InsertIdx;
14445 if (It == ShuffledInserts.end()) {
14446 (void)ShuffledInserts.emplace_back();
14447 It = std::next(ShuffledInserts.begin(),
14448 ShuffledInserts.size() - 1);
14449 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14450 if (Mask.empty())
14451 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14452            // Find the insert vector that was vectorized in the tree, if any.
14453 Value *Base = VU;
14454 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14455 if (IEBase != User &&
14456 (!IEBase->hasOneUse() ||
14457 getElementIndex(IEBase).value_or(Idx) == Idx))
14458 break;
14459 // Build the mask for the vectorized insertelement instructions.
14460 if (const TreeEntry *E = getTreeEntry(IEBase)) {
14461 do {
14462 IEBase = cast<InsertElementInst>(Base);
14463 int IEIdx = *getElementIndex(IEBase);
14464 assert(Mask[IEIdx] == PoisonMaskElem &&
14465 "InsertElementInstruction used already.");
14466 Mask[IEIdx] = IEIdx;
14467 Base = IEBase->getOperand(0);
14468 } while (E == getTreeEntry(Base));
14469 break;
14470 }
14471 Base = cast<InsertElementInst>(Base)->getOperand(0);
14472                // After the vectorization the def-use chain has changed, so we
14473                // need to look through the original insertelement instructions,
14474                // if they got replaced by vector instructions.
14475 auto It = VectorToInsertElement.find(Base);
14476 if (It != VectorToInsertElement.end())
14477 Base = It->second;
14478 }
14479 }
14480 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14481 if (Mask.empty())
14482 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14483 Mask[Idx] = ExternalUse.Lane;
14484 It->InsertElements.push_back(cast<InsertElementInst>(User));
14485 continue;
14486 }
14487 }
14488 }
14489 }
14490
14491 // Generate extracts for out-of-tree users.
14492 // Find the insertion point for the extractelement lane.
14493 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14494 if (PHINode *PH = dyn_cast<PHINode>(User)) {
14495 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14496 if (PH->getIncomingValue(I) == Scalar) {
14497 Instruction *IncomingTerminator =
14498 PH->getIncomingBlock(I)->getTerminator();
14499 if (isa<CatchSwitchInst>(IncomingTerminator)) {
14500 Builder.SetInsertPoint(VecI->getParent(),
14501 std::next(VecI->getIterator()));
14502 } else {
14503 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
14504 }
14505 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14506 PH->setOperand(I, NewInst);
14507 }
14508 }
14509 } else {
14510 Builder.SetInsertPoint(cast<Instruction>(User));
14511 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14512 User->replaceUsesOfWith(Scalar, NewInst);
14513 }
14514 } else {
14515 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14516 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14517 User->replaceUsesOfWith(Scalar, NewInst);
14518 }
14519
14520 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14521 }
14522
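  // Helper to emit a two-source shuffle from a combined mask: indices below VF
  // select from V1, indices >= VF select (rebased by -VF) from V2. E.g. with
  // VF = 4, the mask <0, 5, 2, 7> splits into <0, _, 2, _> for V1 and
  // <_, 1, _, 3> for V2 before ShuffleInstructionBuilder recombines them.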
14523 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14524 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14525 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14526 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14527 for (int I = 0, E = Mask.size(); I < E; ++I) {
14528 if (Mask[I] < VF)
14529 CombinedMask1[I] = Mask[I];
14530 else
14531 CombinedMask2[I] = Mask[I] - VF;
14532 }
14533 ShuffleInstructionBuilder ShuffleBuilder(
14534 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
14535 ShuffleBuilder.add(V1, CombinedMask1);
14536 if (V2)
14537 ShuffleBuilder.add(V2, CombinedMask2);
14538 return ShuffleBuilder.finalize(std::nullopt);
14539 };
14540
14541 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14542 bool ForSingleMask) {
14543 unsigned VF = Mask.size();
14544 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14545 if (VF != VecVF) {
14546 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14547 Vec = CreateShuffle(Vec, nullptr, Mask);
14548 return std::make_pair(Vec, true);
14549 }
14550 if (!ForSingleMask) {
14551 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14552 for (unsigned I = 0; I < VF; ++I) {
14553 if (Mask[I] != PoisonMaskElem)
14554 ResizeMask[Mask[I]] = Mask[I];
14555 }
14556 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14557 }
14558 }
14559
14560 return std::make_pair(Vec, false);
14561 };
14562  // Perform shuffling of the vectorized tree entries for better handling of
14563 // external extracts.
14564 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14565 // Find the first and the last instruction in the list of insertelements.
14566 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
14567 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14568 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14569 Builder.SetInsertPoint(LastInsert);
14570 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14571 Value *NewInst = performExtractsShuffleAction<Value>(
14572 MutableArrayRef(Vector.data(), Vector.size()),
14573 FirstInsert->getOperand(0),
14574 [](Value *Vec) {
14575 return cast<VectorType>(Vec->getType())
14576 ->getElementCount()
14577 .getKnownMinValue();
14578 },
14579 ResizeToVF,
14580 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14581 ArrayRef<Value *> Vals) {
14582 assert((Vals.size() == 1 || Vals.size() == 2) &&
14583 "Expected exactly 1 or 2 input values.");
14584 if (Vals.size() == 1) {
14585 // Do not create shuffle if the mask is a simple identity
14586 // non-resizing mask.
14587 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
14588 ->getNumElements() ||
14589 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14590 return CreateShuffle(Vals.front(), nullptr, Mask);
14591 return Vals.front();
14592 }
14593 return CreateShuffle(Vals.front() ? Vals.front()
14594 : FirstInsert->getOperand(0),
14595 Vals.back(), Mask);
14596 });
14597 auto It = ShuffledInserts[I].InsertElements.rbegin();
14598 // Rebuild buildvector chain.
14599 InsertElementInst *II = nullptr;
14600 if (It != ShuffledInserts[I].InsertElements.rend())
14601      II = *It;
14602    SmallVector<Instruction *> Inserts;
14603 while (It != ShuffledInserts[I].InsertElements.rend()) {
14604 assert(II && "Must be an insertelement instruction.");
14605 if (*It == II)
14606 ++It;
14607 else
14608 Inserts.push_back(cast<Instruction>(II));
14609 II = dyn_cast<InsertElementInst>(II->getOperand(0));
14610 }
14611 for (Instruction *II : reverse(Inserts)) {
14612 II->replaceUsesOfWith(II->getOperand(0), NewInst);
14613 if (auto *NewI = dyn_cast<Instruction>(NewInst))
14614 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14615 II->moveAfter(NewI);
14616 NewInst = II;
14617 }
14618 LastInsert->replaceAllUsesWith(NewInst);
14619 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14620 IE->replaceUsesOfWith(IE->getOperand(0),
14621 PoisonValue::get(IE->getOperand(0)->getType()));
14622 IE->replaceUsesOfWith(IE->getOperand(1),
14623 PoisonValue::get(IE->getOperand(1)->getType()));
14624 eraseInstruction(IE);
14625 }
14626 CSEBlocks.insert(LastInsert->getParent());
14627 }
14628
14629 SmallVector<Instruction *> RemovedInsts;
14630 // For each vectorized value:
14631 for (auto &TEPtr : VectorizableTree) {
14632 TreeEntry *Entry = TEPtr.get();
14633
14634 // No need to handle users of gathered values.
14635 if (Entry->isGather())
14636 continue;
14637
14638 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14639
14640 // For each lane:
14641 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14642 Value *Scalar = Entry->Scalars[Lane];
14643
14644 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14645 !isa<GetElementPtrInst>(Scalar))
14646 continue;
14647#ifndef NDEBUG
14648 Type *Ty = Scalar->getType();
14649 if (!Ty->isVoidTy()) {
14650 for (User *U : Scalar->users()) {
14651 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14652
14653 // It is legal to delete users in the ignorelist.
14654 assert((getTreeEntry(U) ||
14655 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14656 (isa_and_nonnull<Instruction>(U) &&
14657 isDeleted(cast<Instruction>(U)))) &&
14658 "Deleting out-of-tree value");
14659 }
14660 }
14661#endif
14662 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14663 auto *I = cast<Instruction>(Scalar);
14664 RemovedInsts.push_back(I);
14665 }
14666 }
14667
14668 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14669 // new vector instruction.
14670 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14671 V->mergeDIAssignID(RemovedInsts);
14672
14673 // Clear up reduction references, if any.
14674 if (UserIgnoreList) {
14675 for (Instruction *I : RemovedInsts) {
14676 if (getTreeEntry(I)->Idx != 0)
14677 continue;
14678 SmallVector<SelectInst *> LogicalOpSelects;
14679 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14680      // Do not replace the condition of a select-based logical op (and/or).
14681 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
14682 (match(U.getUser(), m_LogicalAnd()) ||
14683 match(U.getUser(), m_LogicalOr())) &&
14684 U.getOperandNo() == 0;
14685 if (IsPoisoningLogicalOp) {
14686 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
14687 return false;
14688 }
14689 return UserIgnoreList->contains(U.getUser());
14690 });
14691 // Replace conditions of the poisoning logical ops with the non-poison
14692 // constant value.
14693 for (SelectInst *SI : LogicalOpSelects)
14694 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
14695 }
14696 }
14697 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14698 // cache correctness.
14699  // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
14700 // - instructions are not deleted until later.
14701 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14702
14703 Builder.ClearInsertionPoint();
14704 InstrElementSize.clear();
14705
14706 const TreeEntry &RootTE = *VectorizableTree.front();
14707 Value *Vec = RootTE.VectorizedValue;
14708 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14709 It != MinBWs.end() &&
14710 ReductionBitWidth != It->second.first) {
14711 IRBuilder<>::InsertPointGuard Guard(Builder);
14712 Builder.SetInsertPoint(ReductionRoot->getParent(),
14713 ReductionRoot->getIterator());
14714 Vec = Builder.CreateIntCast(
14715 Vec,
14716 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14717 cast<VectorType>(Vec->getType())->getElementCount()),
14718 It->second.second);
14719 }
14720 return Vec;
14721}
14722
14724 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14725                    << " gather sequence instructions.\n");
14726 // LICM InsertElementInst sequences.
14727 for (Instruction *I : GatherShuffleExtractSeq) {
14728 if (isDeleted(I))
14729 continue;
14730
14731 // Check if this block is inside a loop.
14732 Loop *L = LI->getLoopFor(I->getParent());
14733 if (!L)
14734 continue;
14735
14736 // Check if it has a preheader.
14737 BasicBlock *PreHeader = L->getLoopPreheader();
14738 if (!PreHeader)
14739 continue;
14740
14741 // If the vector or the element that we insert into it are
14742    // instructions that are defined inside the loop, then we can't
14743 // hoist this instruction.
14744 if (any_of(I->operands(), [L](Value *V) {
14745 auto *OpI = dyn_cast<Instruction>(V);
14746 return OpI && L->contains(OpI);
14747 }))
14748 continue;
14749
14750 // We can hoist this instruction. Move it to the pre-header.
14751 I->moveBefore(PreHeader->getTerminator());
14752 CSEBlocks.insert(PreHeader);
14753 }
14754
14755  // Make a list of all reachable blocks in our CSE queue.
14756  SmallVector<const DomTreeNode *, 8> CSEWorkList;
14757 CSEWorkList.reserve(CSEBlocks.size());
14758 for (BasicBlock *BB : CSEBlocks)
14759    if (DomTreeNode *N = DT->getNode(BB)) {
14760      assert(DT->isReachableFromEntry(N));
14761 CSEWorkList.push_back(N);
14762 }
14763
14764 // Sort blocks by domination. This ensures we visit a block after all blocks
14765 // dominating it are visited.
14766 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14767 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14768 "Different nodes should have different DFS numbers");
14769 return A->getDFSNumIn() < B->getDFSNumIn();
14770 });
14771
14772 // Less defined shuffles can be replaced by the more defined copies.
14773 // Between two shuffles one is less defined if it has the same vector operands
14774  // and its mask indices are the same as in the first one or undefs. E.g.
14775 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
14776 // poison, <0, 0, 0, 0>.
14777 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14778 SmallVectorImpl<int> &NewMask) {
14779 if (I1->getType() != I2->getType())
14780 return false;
14781 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14782 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14783 if (!SI1 || !SI2)
14784 return I1->isIdenticalTo(I2);
14785 if (SI1->isIdenticalTo(SI2))
14786 return true;
14787 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14788 if (SI1->getOperand(I) != SI2->getOperand(I))
14789 return false;
14790 // Check if the second instruction is more defined than the first one.
14791 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14792 ArrayRef<int> SM1 = SI1->getShuffleMask();
14793 // Count trailing undefs in the mask to check the final number of used
14794 // registers.
14795 unsigned LastUndefsCnt = 0;
14796 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14797 if (SM1[I] == PoisonMaskElem)
14798 ++LastUndefsCnt;
14799 else
14800 LastUndefsCnt = 0;
14801 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14802 NewMask[I] != SM1[I])
14803 return false;
14804 if (NewMask[I] == PoisonMaskElem)
14805 NewMask[I] = SM1[I];
14806 }
14807 // Check if the last undefs actually change the final number of used vector
14808 // registers.
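  // E.g. (illustrative): with 4-element-wide registers, the <8 x i32> masks
  // <0, 1, 2, 3, u, u, u, u> and <0, 1, 2, 3, 4, 5, 6, 7> occupy a different
  // number of registers, so the narrower shuffle is not merged away.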
14809 return SM1.size() - LastUndefsCnt > 1 &&
14810 TTI->getNumberOfParts(SI1->getType()) ==
14812 getWidenedType(SI1->getType()->getElementType(),
14813 SM1.size() - LastUndefsCnt));
14814 };
14815 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14816 // instructions. TODO: We can further optimize this scan if we split the
14817  // instructions into different buckets based on the insert lane.
14818  SmallVector<Instruction *, 16> Visited;
14819 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14820 assert(*I &&
14821 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14822 "Worklist not sorted properly!");
14823 BasicBlock *BB = (*I)->getBlock();
14824 // For all instructions in blocks containing gather sequences:
14825 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14826 if (isDeleted(&In))
14827 continue;
14828 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14829 !GatherShuffleExtractSeq.contains(&In))
14830 continue;
14831
14832 // Check if we can replace this instruction with any of the
14833 // visited instructions.
14834 bool Replaced = false;
14835 for (Instruction *&V : Visited) {
14836 SmallVector<int> NewMask;
14837 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14838 DT->dominates(V->getParent(), In.getParent())) {
14839 In.replaceAllUsesWith(V);
14840 eraseInstruction(&In);
14841 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14842 if (!NewMask.empty())
14843 SI->setShuffleMask(NewMask);
14844 Replaced = true;
14845 break;
14846 }
14847 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14848 GatherShuffleExtractSeq.contains(V) &&
14849 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14850 DT->dominates(In.getParent(), V->getParent())) {
14851 In.moveAfter(V);
14852          V->replaceAllUsesWith(&In);
14853          eraseInstruction(V);
14854 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14855 if (!NewMask.empty())
14856 SI->setShuffleMask(NewMask);
14857 V = &In;
14858 Replaced = true;
14859 break;
14860 }
14861 }
14862 if (!Replaced) {
14863 assert(!is_contained(Visited, &In));
14864 Visited.push_back(&In);
14865 }
14866 }
14867 }
14868 CSEBlocks.clear();
14869 GatherShuffleExtractSeq.clear();
14870}
14871
14872BoUpSLP::ScheduleData *
14873BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14874 ScheduleData *Bundle = nullptr;
14875 ScheduleData *PrevInBundle = nullptr;
14876  for (Value *V : VL) {
14877    if (doesNotNeedToBeScheduled(V))
14878 continue;
14879 ScheduleData *BundleMember = getScheduleData(V);
14880 assert(BundleMember &&
14881 "no ScheduleData for bundle member "
14882 "(maybe not in same basic block)");
14883 assert(BundleMember->isSchedulingEntity() &&
14884 "bundle member already part of other bundle");
14885 if (PrevInBundle) {
14886 PrevInBundle->NextInBundle = BundleMember;
14887 } else {
14888 Bundle = BundleMember;
14889 }
14890
14891    // Group the instructions into a bundle.
14892 BundleMember->FirstInBundle = Bundle;
14893 PrevInBundle = BundleMember;
14894 }
14895 assert(Bundle && "Failed to find schedule bundle");
14896 return Bundle;
14897}
14898
14899// Groups the instructions into a bundle (which is then a single scheduling entity)
14900// and schedules instructions until the bundle gets ready.
14901std::optional<BoUpSLP::ScheduleData *>
14902BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14903 const InstructionsState &S) {
14904 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14905 // instructions.
14906 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14907 doesNotNeedToSchedule(VL))
14908 return nullptr;
14909
14910 // Initialize the instruction bundle.
14911 Instruction *OldScheduleEnd = ScheduleEnd;
14912 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14913
14914 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14915 ScheduleData *Bundle) {
14916 // The scheduling region got new instructions at the lower end (or it is a
14917 // new region for the first bundle). This makes it necessary to
14918 // recalculate all dependencies.
14919 // It is seldom that this needs to be done a second time after adding the
14920 // initial bundle to the region.
14921 if (ScheduleEnd != OldScheduleEnd) {
14922 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14923 if (ScheduleData *SD = getScheduleData(I))
14924 SD->clearDependencies();
14925 ReSchedule = true;
14926 }
14927 if (Bundle) {
14928 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14929 << " in block " << BB->getName() << "\n");
14930 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14931 }
14932
14933 if (ReSchedule) {
14934 resetSchedule();
14935 initialFillReadyList(ReadyInsts);
14936 }
14937
14938 // Now try to schedule the new bundle or (if no bundle) just calculate
14939 // dependencies. As soon as the bundle is "ready" it means that there are no
14940 // cyclic dependencies and we can schedule it. Note that it's important that we
14941 // don't "schedule" the bundle yet (see cancelScheduling).
14942 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14943 !ReadyInsts.empty()) {
14944 ScheduleData *Picked = ReadyInsts.pop_back_val();
14945 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14946 "must be ready to schedule");
14947 schedule(Picked, ReadyInsts);
14948 }
14949 };
14950
14951 // Make sure that the scheduling region contains all
14952 // instructions of the bundle.
14953 for (Value *V : VL) {
14954 if (doesNotNeedToBeScheduled(V))
14955 continue;
14956 if (!extendSchedulingRegion(V, S)) {
14957 // If the scheduling region got new instructions at the lower end (or it
14958 // is a new region for the first bundle), it becomes necessary to
14959 // recalculate all dependencies.
14960 // Otherwise the compiler may crash trying to incorrectly calculate
14961 // dependencies and emit instructions in the wrong order at the actual
14962 // scheduling.
14963 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14964 return std::nullopt;
14965 }
14966 }
14967
14968 bool ReSchedule = false;
14969 for (Value *V : VL) {
14970 if (doesNotNeedToBeScheduled(V))
14971 continue;
14972 ScheduleData *BundleMember = getScheduleData(V);
14973 assert(BundleMember &&
14974 "no ScheduleData for bundle member (maybe not in same basic block)");
14975
14976 // Make sure we don't leave the pieces of the bundle in the ready list when
14977 // the whole bundle might not be ready.
14978 ReadyInsts.remove(BundleMember);
14979
14980 if (!BundleMember->IsScheduled)
14981 continue;
14982 // A bundle member was scheduled as a single instruction before and now
14983 // needs to be scheduled as part of the bundle. We just get rid of the
14984 // existing schedule.
14985 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14986 << " was already scheduled\n");
14987 ReSchedule = true;
14988 }
14989
14990 auto *Bundle = buildBundle(VL);
14991 TryScheduleBundleImpl(ReSchedule, Bundle);
14992 if (!Bundle->isReady()) {
14993 cancelScheduling(VL, S.OpValue);
14994 return std::nullopt;
14995 }
14996 return Bundle;
14997}
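// In short, the path above is: grow the scheduling region until it covers all
// bundle members, link them via buildBundle(), recompute dependencies, and
// keep scheduling ready instructions until either the new bundle itself
// becomes ready (success) or the ready list runs dry while the bundle still
// has unscheduled dependencies, in which case the bundle is cancelled again.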
14998
14999void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
15000 Value *OpValue) {
15001 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
15002 doesNotNeedToSchedule(VL))
15003 return;
15004
15005 if (doesNotNeedToBeScheduled(OpValue))
15006 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
15007 ScheduleData *Bundle = getScheduleData(OpValue);
15008 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
15009 assert(!Bundle->IsScheduled &&
15010 "Can't cancel bundle which is already scheduled");
15011 assert(Bundle->isSchedulingEntity() &&
15012 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
15013 "tried to unbundle something which is not a bundle");
15014
15015 // Remove the bundle from the ready list.
15016 if (Bundle->isReady())
15017 ReadyInsts.remove(Bundle);
15018
15019 // Un-bundle: make single instructions out of the bundle.
15020 ScheduleData *BundleMember = Bundle;
15021 while (BundleMember) {
15022 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
15023 BundleMember->FirstInBundle = BundleMember;
15024 ScheduleData *Next = BundleMember->NextInBundle;
15025 BundleMember->NextInBundle = nullptr;
15026 BundleMember->TE = nullptr;
15027 if (BundleMember->unscheduledDepsInBundle() == 0) {
15028 ReadyInsts.insert(BundleMember);
15029 }
15030 BundleMember = Next;
15031 }
15032}
15033
15034BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
15035 // Allocate a new ScheduleData for the instruction.
15036 if (ChunkPos >= ChunkSize) {
15037 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
15038 ChunkPos = 0;
15039 }
15040 return &(ScheduleDataChunks.back()[ChunkPos++]);
15041}
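// The chunked allocation above is what keeps ScheduleData pointers stable:
// the nodes are referenced by raw pointers from ScheduleDataMap and from the
// bundle/dependency links, so they must never move. The heap-allocated chunks
// themselves stay put when the vector of chunks grows, and handing out
// ChunkSize nodes per allocation amortizes the allocation cost.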
15042
15043bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
15044 Value *V, const InstructionsState &S) {
15045 Instruction *I = dyn_cast<Instruction>(V);
15046 assert(I && "bundle member must be an instruction");
15047 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
15048 !doesNotNeedToBeScheduled(I) &&
15049 "phi nodes/insertelements/extractelements/extractvalues don't need to "
15050 "be scheduled");
15051 if (getScheduleData(I))
15052 return true;
15053 if (!ScheduleStart) {
15054 // It's the first instruction in the new region.
15055 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
15056 ScheduleStart = I;
15057 ScheduleEnd = I->getNextNode();
15058 assert(ScheduleEnd && "tried to vectorize a terminator?");
15059 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
15060 return true;
15061 }
15062 // Search up and down at the same time, because we don't know if the new
15063 // instruction is above or below the existing scheduling region.
15064 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not counted
15065 // against the budget. Otherwise debug info could affect codegen.
15066 BasicBlock::reverse_iterator UpIter =
15067 ++ScheduleStart->getIterator().getReverse();
15068 BasicBlock::reverse_iterator UpperEnd = BB->rend();
15069 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
15070 BasicBlock::iterator LowerEnd = BB->end();
15071 auto IsAssumeLikeIntr = [](const Instruction &I) {
15072 if (auto *II = dyn_cast<IntrinsicInst>(&I))
15073 return II->isAssumeLikeIntrinsic();
15074 return false;
15075 };
15076 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
15077 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
15078 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
15079 &*DownIter != I) {
15080 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
15081 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
15082 return false;
15083 }
15084
15085 ++UpIter;
15086 ++DownIter;
15087
15088 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
15089 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
15090 }
15091 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
15092 assert(I->getParent() == ScheduleStart->getParent() &&
15093 "Instruction is in wrong basic block.");
15094 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
15095 ScheduleStart = I;
15096 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
15097 << "\n");
15098 return true;
15099 }
15100 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
15101 "Expected to reach top of the basic block or instruction down the "
15102 "lower end.");
15103 assert(I->getParent() == ScheduleEnd->getParent() &&
15104 "Instruction is in wrong basic block.");
15105 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
15106 nullptr);
15107 ScheduleEnd = I->getNextNode();
15108 assert(ScheduleEnd && "tried to vectorize a terminator?");
15109 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
15110 return true;
15111}
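// Sketch of the search above: UpIter walks upwards from just above
// ScheduleStart while DownIter walks downwards from ScheduleEnd, one step each
// per iteration and skipping assume-like intrinsics. If the upward walk
// reaches I (or the downward walk falls off the end of the block, meaning I
// must lie above the region), the region start is pulled up to I; otherwise
// the region end is pushed down to just past I. ScheduleRegionSizeLimit caps
// how many steps the walk may take.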
15112
15113void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
15114 Instruction *ToI,
15115 ScheduleData *PrevLoadStore,
15116 ScheduleData *NextLoadStore) {
15117 ScheduleData *CurrentLoadStore = PrevLoadStore;
15118 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
15119 // No need to allocate data for non-schedulable instructions.
15120 if (doesNotNeedToBeScheduled(I))
15121 continue;
15122 ScheduleData *SD = ScheduleDataMap.lookup(I);
15123 if (!SD) {
15124 SD = allocateScheduleDataChunks();
15125 ScheduleDataMap[I] = SD;
15126 }
15127 assert(!isInSchedulingRegion(SD) &&
15128 "new ScheduleData already in scheduling region");
15129 SD->init(SchedulingRegionID, I);
15130
15131 if (I->mayReadOrWriteMemory() &&
15132 (!isa<IntrinsicInst>(I) ||
15133 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
15134 cast<IntrinsicInst>(I)->getIntrinsicID() !=
15135 Intrinsic::pseudoprobe))) {
15136 // Update the linked list of memory accessing instructions.
15137 if (CurrentLoadStore) {
15138 CurrentLoadStore->NextLoadStore = SD;
15139 } else {
15140 FirstLoadStoreInRegion = SD;
15141 }
15142 CurrentLoadStore = SD;
15143 }
15144
15145 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
15146 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
15147 RegionHasStackSave = true;
15148 }
15149 if (NextLoadStore) {
15150 if (CurrentLoadStore)
15151 CurrentLoadStore->NextLoadStore = NextLoadStore;
15152 } else {
15153 LastLoadStoreInRegion = CurrentLoadStore;
15154 }
15155}
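// Besides creating the ScheduleData nodes, the loop above also threads every
// memory-accessing instruction of the new sub-range into the region's singly
// linked list of loads/stores (FirstLoadStoreInRegion -> ... -> NextLoadStore
// -> ... -> LastLoadStoreInRegion). calculateDependencies() walks this list
// instead of scanning every instruction when it looks for memory dependencies.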
15156
15157void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
15158 bool InsertInReadyList,
15159 BoUpSLP *SLP) {
15160 assert(SD->isSchedulingEntity());
15161
15162 SmallVector<ScheduleData *, 10> WorkList;
15163 WorkList.push_back(SD);
15164
15165 while (!WorkList.empty()) {
15166 ScheduleData *SD = WorkList.pop_back_val();
15167 for (ScheduleData *BundleMember = SD; BundleMember;
15168 BundleMember = BundleMember->NextInBundle) {
15169 assert(isInSchedulingRegion(BundleMember));
15170 if (BundleMember->hasValidDependencies())
15171 continue;
15172
15173 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
15174 << "\n");
15175 BundleMember->Dependencies = 0;
15176 BundleMember->resetUnscheduledDeps();
15177
15178 // Handle def-use chain dependencies.
15179 for (User *U : BundleMember->Inst->users()) {
15180 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
15181 BundleMember->Dependencies++;
15182 ScheduleData *DestBundle = UseSD->FirstInBundle;
15183 if (!DestBundle->IsScheduled)
15184 BundleMember->incrementUnscheduledDeps(1);
15185 if (!DestBundle->hasValidDependencies())
15186 WorkList.push_back(DestBundle);
15187 }
15188 }
15189
15190 auto MakeControlDependent = [&](Instruction *I) {
15191 auto *DepDest = getScheduleData(I);
15192 assert(DepDest && "must be in schedule window");
15193 DepDest->ControlDependencies.push_back(BundleMember);
15194 BundleMember->Dependencies++;
15195 ScheduleData *DestBundle = DepDest->FirstInBundle;
15196 if (!DestBundle->IsScheduled)
15197 BundleMember->incrementUnscheduledDeps(1);
15198 if (!DestBundle->hasValidDependencies())
15199 WorkList.push_back(DestBundle);
15200 };
15201
15202 // Any instruction which isn't safe to speculate at the beginning of the
15203 // block is control dependent on any early exit or non-willreturn call
15204 // which precedes it.
15205 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
15206 for (Instruction *I = BundleMember->Inst->getNextNode();
15207 I != ScheduleEnd; I = I->getNextNode()) {
15208 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
15209 continue;
15210
15211 // Add the dependency
15212 MakeControlDependent(I);
15213
15214 if (!isGuaranteedToTransferExecutionToSuccessor(I))
15215 // Everything past here must be control dependent on I.
15216 break;
15217 }
15218 }
15219
15220 if (RegionHasStackSave) {
15221 // If we have an inalloca alloca instruction, it needs to be scheduled
15222 // after any preceding stacksave. We also need to prevent any alloca
15223 // from reordering above a preceding stackrestore.
15224 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
15225 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
15226 for (Instruction *I = BundleMember->Inst->getNextNode();
15227 I != ScheduleEnd; I = I->getNextNode()) {
15228 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
15229 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
15230 // Any allocas past here must be control dependent on I, and I
15231 // must be memory dependent on BundleMember->Inst.
15232 break;
15233
15234 if (!isa<AllocaInst>(I))
15235 continue;
15236
15237 // Add the dependency
15238 MakeControlDependent(I);
15239 }
15240 }
15241
15242 // In addition to the cases handled just above, we need to prevent
15243 // allocas and loads/stores from moving below a stacksave or a
15244 // stackrestore. Avoiding moving allocas below a stackrestore is currently
15245 // thought to be conservative. Moving loads/stores below a stackrestore
15246 // can lead to incorrect code.
15247 if (isa<AllocaInst>(BundleMember->Inst) ||
15248 BundleMember->Inst->mayReadOrWriteMemory()) {
15249 for (Instruction *I = BundleMember->Inst->getNextNode();
15250 I != ScheduleEnd; I = I->getNextNode()) {
15251 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
15252 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
15253 continue;
15254
15255 // Add the dependency
15256 MakeControlDependent(I);
15257 break;
15258 }
15259 }
15260 }
15261
15262 // Handle the memory dependencies (if any).
15263 ScheduleData *DepDest = BundleMember->NextLoadStore;
15264 if (!DepDest)
15265 continue;
15266 Instruction *SrcInst = BundleMember->Inst;
15267 assert(SrcInst->mayReadOrWriteMemory() &&
15268 "NextLoadStore list for non memory effecting bundle?");
15269 MemoryLocation SrcLoc = getLocation(SrcInst);
15270 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
15271 unsigned NumAliased = 0;
15272 unsigned DistToSrc = 1;
15273
15274 for (; DepDest; DepDest = DepDest->NextLoadStore) {
15275 assert(isInSchedulingRegion(DepDest));
15276
15277 // We have two limits to reduce the complexity:
15278 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
15279 // SLP->isAliased (which is the expensive part in this loop).
15280 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
15281 // the whole loop (even if the loop is fast, it's quadratic).
15282 // It's important for the loop break condition (see below) to
15283 // check this limit even between two read-only instructions.
15284 if (DistToSrc >= MaxMemDepDistance ||
15285 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
15286 (NumAliased >= AliasedCheckLimit ||
15287 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
15288
15289 // We increment the counter only if the locations are aliased
15290 // (instead of counting all alias checks). This gives a better
15291 // balance between reduced runtime and accurate dependencies.
15292 NumAliased++;
15293
15294 DepDest->MemoryDependencies.push_back(BundleMember);
15295 BundleMember->Dependencies++;
15296 ScheduleData *DestBundle = DepDest->FirstInBundle;
15297 if (!DestBundle->IsScheduled) {
15298 BundleMember->incrementUnscheduledDeps(1);
15299 }
15300 if (!DestBundle->hasValidDependencies()) {
15301 WorkList.push_back(DestBundle);
15302 }
15303 }
15304
15305 // Example, explaining the loop break condition: Let's assume our
15306 // starting instruction is i0 and MaxMemDepDistance = 3.
15307 //
15308 // +--------v--v--v
15309 // i0,i1,i2,i3,i4,i5,i6,i7,i8
15310 // +--------^--^--^
15311 //
15312 // MaxMemDepDistance let us stop alias-checking at i3 and we add
15313 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
15314 // Previously we already added dependencies from i3 to i6,i7,i8
15315 // (because of MaxMemDepDistance). As we added a dependency from
15316 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
15317 // and we can abort this loop at i6.
15318 if (DistToSrc >= 2 * MaxMemDepDistance)
15319 break;
15320 DistToSrc++;
15321 }
15322 }
15323 if (InsertInReadyList && SD->isReady()) {
15324 ReadyInsts.insert(SD);
15325 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
15326 << "\n");
15327 }
15328 }
15329}
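// A short summary of the counters maintained above: for each bundle member,
// Dependencies counts the in-region instructions that must end up below it
// (its users plus the control and memory edges added by this function), and
// UnscheduledDeps counts how many of those have not been placed yet. Because
// the block is re-emitted bottom-up, a bundle becomes ready (see isReady())
// once that unscheduled count reaches zero for all of its members.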
15330
15331void BoUpSLP::BlockScheduling::resetSchedule() {
15332 assert(ScheduleStart &&
15333 "tried to reset schedule on block which has not been scheduled");
15334 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
15335 if (ScheduleData *SD = getScheduleData(I)) {
15336 assert(isInSchedulingRegion(SD) &&
15337 "ScheduleData not in scheduling region");
15338 SD->IsScheduled = false;
15339 SD->resetUnscheduledDeps();
15340 }
15341 }
15342 ReadyInsts.clear();
15343}
15344
15345void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
15346 if (!BS->ScheduleStart)
15347 return;
15348
15349 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
15350
15351 // A key point - if we got here, pre-scheduling was able to find a valid
15352 // scheduling of the sub-graph of the scheduling window which consists
15353 // of all vector bundles and their transitive users. As such, we do not
15354 // need to reschedule anything *outside of* that subgraph.
15355
15356 BS->resetSchedule();
15357
15358 // For the real scheduling we use a more sophisticated ready-list: it is
15359 // sorted by the original instruction location. This lets the final schedule
15360 // be as close as possible to the original instruction order.
15361 // WARNING: If changing this order causes a correctness issue, that means
15362 // there is some missing dependence edge in the schedule data graph.
15363 struct ScheduleDataCompare {
15364 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
15365 return SD2->SchedulingPriority < SD1->SchedulingPriority;
15366 }
15367 };
15368 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
15369
15370 // Ensure that all dependency data is updated (for nodes in the sub-graph)
15371 // and fill the ready-list with initial instructions.
15372 int Idx = 0;
15373 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
15374 I = I->getNextNode()) {
15375 if (ScheduleData *SD = BS->getScheduleData(I)) {
15376 TreeEntry *SDTE = getTreeEntry(SD->Inst);
15377 (void)SDTE;
15378 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
15379 SD->isPartOfBundle() ==
15380 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
15381 "scheduler and vectorizer bundle mismatch");
15382 SD->FirstInBundle->SchedulingPriority = Idx++;
15383
15384 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
15385 BS->calculateDependencies(SD, false, this);
15386 }
15387 }
15388 BS->initialFillReadyList(ReadyInsts);
15389
15390 Instruction *LastScheduledInst = BS->ScheduleEnd;
15391
15392 // Do the "real" scheduling.
15393 while (!ReadyInsts.empty()) {
15394 ScheduleData *Picked = *ReadyInsts.begin();
15395 ReadyInsts.erase(ReadyInsts.begin());
15396
15397 // Move the scheduled instruction(s) to their dedicated places, if not
15398 // there yet.
15399 for (ScheduleData *BundleMember = Picked; BundleMember;
15400 BundleMember = BundleMember->NextInBundle) {
15401 Instruction *PickedInst = BundleMember->Inst;
15402 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15403 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
15404 LastScheduledInst = PickedInst;
15405 }
15406
15407 BS->schedule(Picked, ReadyInsts);
15408 }
15409
15410 // Check that we didn't break any of our invariants.
15411#ifdef EXPENSIVE_CHECKS
15412 BS->verify();
15413#endif
15414
15415#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15416 // Check that all schedulable entities got scheduled
15417 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
15418 ScheduleData *SD = BS->getScheduleData(I);
15419 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
15420 assert(SD->IsScheduled && "must be scheduled at this point");
15421 }
15422#endif
15423
15424 // Avoid duplicate scheduling of the block.
15425 BS->ScheduleStart = nullptr;
15426}
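// Note on the ordering above: SchedulingPriority is simply the position of the
// bundle head in the original instruction order, and the comparator makes the
// ready set hand out the bundle that appears latest in the block first. Each
// picked bundle is then moved directly in front of the instructions placed so
// far, so the region is re-emitted bottom-up while staying as close as
// possible to the original order.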
15427
15428 unsigned BoUpSLP::getVectorElementSize(Value *V) {
15429 // If V is a store, just return the width of the stored value (or value
15430 // truncated just before storing) without traversing the expression tree.
15431 // This is the common case.
15432 if (auto *Store = dyn_cast<StoreInst>(V))
15433 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
15434
15435 if (auto *IEI = dyn_cast<InsertElementInst>(V))
15436 return getVectorElementSize(IEI->getOperand(1));
15437
15438 auto E = InstrElementSize.find(V);
15439 if (E != InstrElementSize.end())
15440 return E->second;
15441
15442 // If V is not a store, we can traverse the expression tree to find loads
15443 // that feed it. The type of the loaded value may indicate a more suitable
15444 // width than V's type. We want to base the vector element size on the width
15445 // of memory operations where possible.
15446 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
15447 SmallPtrSet<Instruction *, 16> Visited;
15448 if (auto *I = dyn_cast<Instruction>(V)) {
15449 Worklist.emplace_back(I, I->getParent(), 0);
15450 Visited.insert(I);
15451 }
15452
15453 // Traverse the expression tree in bottom-up order looking for loads. If we
15454 // encounter an instruction we don't yet handle, we give up.
15455 auto Width = 0u;
15456 Value *FirstNonBool = nullptr;
15457 while (!Worklist.empty()) {
15458 auto [I, Parent, Level] = Worklist.pop_back_val();
15459
15460 // We should only be looking at scalar instructions here. If the current
15461 // instruction has a vector type, skip.
15462 auto *Ty = I->getType();
15463 if (isa<VectorType>(Ty))
15464 continue;
15465 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15466 FirstNonBool = I;
15467 if (Level > RecursionMaxDepth)
15468 continue;
15469
15470 // If the current instruction is a load, update Width to reflect the
15471 // width of the loaded value.
15472 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
15473 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
15474
15475 // Otherwise, we need to visit the operands of the instruction. We only
15476 // handle the interesting cases from buildTree here. If an operand is an
15477 // instruction we haven't yet visited and from the same basic block as the
15478 // user or the use is a PHI node, we add it to the worklist.
15479 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15480 BinaryOperator, UnaryOperator>(I)) {
15481 for (Use &U : I->operands()) {
15482 if (auto *J = dyn_cast<Instruction>(U.get()))
15483 if (Visited.insert(J).second &&
15484 (isa<PHINode>(I) || J->getParent() == Parent)) {
15485 Worklist.emplace_back(J, J->getParent(), Level + 1);
15486 continue;
15487 }
15488 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15489 FirstNonBool = U.get();
15490 }
15491 } else {
15492 break;
15493 }
15494 }
15495
15496 // If we didn't encounter a memory access in the expression tree, or if we
15497 // gave up for some reason, just return the width of V. Otherwise, return the
15498 // maximum width we found.
15499 if (!Width) {
15500 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15501 V = FirstNonBool;
15502 Width = DL->getTypeSizeInBits(V->getType());
15503 }
15504
15505 for (Instruction *I : Visited)
15506 InstrElementSize[I] = Width;
15507
15508 return Width;
15509}
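// Worked example (the IR names below are only for illustration): given
//
//   %a = load i8, ptr %p
//   %b = zext i8 %a to i32
//   %c = add i32 %b, 1
//   %t = trunc i32 %c to i8
//   store i8 %t, ptr %q
//
// the store is answered immediately from its value operand type (8 bits), and
// for a root such as %c the bottom-up walk reaches the i8 load and also
// returns 8, even though the intermediate arithmetic is performed in i32.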
15510
15511bool BoUpSLP::collectValuesToDemote(
15512 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15513 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15514 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15515 bool IsTruncRoot) const {
15516 // We can always demote constants.
15517 if (all_of(E.Scalars, IsaPred<Constant>))
15518 return true;
15519
15520 unsigned OrigBitWidth =
15521 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
15522 if (OrigBitWidth == BitWidth) {
15523 MaxDepthLevel = 1;
15524 return true;
15525 }
15526
15527 // If the value is not a vectorized instruction in the expression, is not used
15528 // by an insertelement instruction and is not used in multiple vector nodes, it
15529 // cannot be demoted.
15530 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
15531 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15532 });
15533 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15534 if (MultiNodeScalars.contains(V))
15535 return false;
15536 // For lat shuffle of sext/zext with many uses, we need to check the extra bit
15537 // for unsigned values; otherwise we may have incorrect casting for reused
15538 // scalars.
15539 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
15540 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15541 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15542 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15543 return true;
15544 }
15545 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15546 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15547 if (IsSignedNode)
15548 ++BitWidth1;
15549 if (auto *I = dyn_cast<Instruction>(V)) {
15550 APInt Mask = DB->getDemandedBits(I);
15551 unsigned BitWidth2 =
15552 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
15553 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15554 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
15555 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15556 break;
15557 BitWidth2 *= 2;
15558 }
15559 BitWidth1 = std::min(BitWidth1, BitWidth2);
15560 }
15561 BitWidth = std::max(BitWidth, BitWidth1);
15562 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15563 };
15564 using namespace std::placeholders;
15565 auto FinalAnalysis = [&]() {
15566 if (!IsProfitableToDemote)
15567 return false;
15568 bool Res = all_of(
15569 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
15570 // Demote gathers.
15571 if (Res && E.isGather()) {
15572 // Check the possible extractelement instruction bases and the final
15573 // vector length.
15574 SmallPtrSet<Value *, 4> UniqueBases;
15575 for (Value *V : E.Scalars) {
15576 auto *EE = dyn_cast<ExtractElementInst>(V);
15577 if (!EE)
15578 continue;
15579 UniqueBases.insert(EE->getVectorOperand());
15580 }
15581 const unsigned VF = E.Scalars.size();
15582 Type *OrigScalarTy = E.Scalars.front()->getType();
15583 if (UniqueBases.size() <= 2 ||
15584 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
15585 TTI->getNumberOfParts(getWidenedType(
15586 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
15587 ToDemote.push_back(E.Idx);
15588 }
15589 return Res;
15590 };
15591 if (E.isGather() || !Visited.insert(&E).second ||
15592 any_of(E.Scalars, [&](Value *V) {
15593 return all_of(V->users(), [&](User *U) {
15594 return isa<InsertElementInst>(U) && !getTreeEntry(U);
15595 });
15596 }))
15597 return FinalAnalysis();
15598
15599 if (any_of(E.Scalars, [&](Value *V) {
15600 return !all_of(V->users(), [=](User *U) {
15601 return getTreeEntry(U) ||
15602 (E.Idx == 0 && UserIgnoreList &&
15603 UserIgnoreList->contains(U)) ||
15604 (!isa<CmpInst>(U) && U->getType()->isSized() &&
15605 !U->getType()->isScalableTy() &&
15606 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15607 }) && !IsPotentiallyTruncated(V, BitWidth);
15608 }))
15609 return false;
15610
15611 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15612 bool &NeedToExit) {
15613 NeedToExit = false;
15614 unsigned InitLevel = MaxDepthLevel;
15615 for (const TreeEntry *Op : Operands) {
15616 unsigned Level = InitLevel;
15617 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
15618 ToDemote, Visited, Level, IsProfitableToDemote,
15619 IsTruncRoot)) {
15620 if (!IsProfitableToDemote)
15621 return false;
15622 NeedToExit = true;
15623 if (!FinalAnalysis())
15624 return false;
15625 continue;
15626 }
15627 MaxDepthLevel = std::max(MaxDepthLevel, Level);
15628 }
15629 return true;
15630 };
15631 auto AttemptCheckBitwidth =
15632 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15633 // Try all bitwidth < OrigBitWidth.
15634 NeedToExit = false;
15635 unsigned BestFailBitwidth = 0;
15636 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15637 if (Checker(BitWidth, OrigBitWidth))
15638 return true;
15639 if (BestFailBitwidth == 0 && FinalAnalysis())
15640 BestFailBitwidth = BitWidth;
15641 }
15642 if (BitWidth >= OrigBitWidth) {
15643 if (BestFailBitwidth == 0) {
15644 BitWidth = OrigBitWidth;
15645 return false;
15646 }
15647 MaxDepthLevel = 1;
15648 BitWidth = BestFailBitwidth;
15649 NeedToExit = true;
15650 return true;
15651 }
15652 return false;
15653 };
15654 auto TryProcessInstruction =
15655 [&](unsigned &BitWidth,
15656 ArrayRef<const TreeEntry *> Operands = {},
15657 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15658 if (Operands.empty()) {
15659 if (!IsTruncRoot)
15660 MaxDepthLevel = 1;
15661 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15662 std::ref(BitWidth)));
15663 } else {
15664 // Several vectorized uses? Check if we can truncate it, otherwise -
15665 // exit.
15666 if (E.UserTreeIndices.size() > 1 &&
15667 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15668 std::ref(BitWidth))))
15669 return false;
15670 bool NeedToExit = false;
15671 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15672 return false;
15673 if (NeedToExit)
15674 return true;
15675 if (!ProcessOperands(Operands, NeedToExit))
15676 return false;
15677 if (NeedToExit)
15678 return true;
15679 }
15680
15681 ++MaxDepthLevel;
15682 // Record the entry that we can demote.
15683 ToDemote.push_back(E.Idx);
15684 return IsProfitableToDemote;
15685 };
15686 switch (E.getOpcode()) {
15687
15688 // We can always demote truncations and extensions. Since truncations can
15689 // seed additional demotion, we save the truncated value.
15690 case Instruction::Trunc:
15691 if (IsProfitableToDemoteRoot)
15692 IsProfitableToDemote = true;
15693 return TryProcessInstruction(BitWidth);
15694 case Instruction::ZExt:
15695 case Instruction::SExt:
15696 IsProfitableToDemote = true;
15697 return TryProcessInstruction(BitWidth);
15698
15699 // We can demote certain binary operations if we can demote both of their
15700 // operands.
15701 case Instruction::Add:
15702 case Instruction::Sub:
15703 case Instruction::Mul:
15704 case Instruction::And:
15705 case Instruction::Or:
15706 case Instruction::Xor: {
15707 return TryProcessInstruction(
15708 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15709 }
15710 case Instruction::Shl: {
15711 // If we are truncating the result of this SHL, and if it's a shift of an
15712 // in-range amount, we can always perform a SHL in a smaller type.
15713 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15714 return all_of(E.Scalars, [&](Value *V) {
15715 auto *I = cast<Instruction>(V);
15716 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15717 return AmtKnownBits.getMaxValue().ult(BitWidth);
15718 });
15719 };
15720 return TryProcessInstruction(
15721 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15722 }
15723 case Instruction::LShr: {
15724 // If this is a truncate of a logical shr, we can truncate it to a smaller
15725 // lshr iff we know that the bits we would otherwise be shifting in are
15726 // already zeros.
15727 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15728 return all_of(E.Scalars, [&](Value *V) {
15729 auto *I = cast<Instruction>(V);
15730 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15731 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15732 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15733 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15734 SimplifyQuery(*DL));
15735 });
15736 };
15737 return TryProcessInstruction(
15738 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15739 LShrChecker);
15740 }
15741 case Instruction::AShr: {
15742 // If this is a truncate of an arithmetic shr, we can truncate it to a
15743 // smaller ashr iff we know that all the bits between the sign bit of the
15744 // original type and the sign bit of the truncated type equal the sign bit.
15745 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15746 return all_of(E.Scalars, [&](Value *V) {
15747 auto *I = cast<Instruction>(V);
15748 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15749 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15750 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15751 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15752 nullptr, DT);
15753 });
15754 };
15755 return TryProcessInstruction(
15756 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15757 AShrChecker);
15758 }
15759 case Instruction::UDiv:
15760 case Instruction::URem: {
15761 // UDiv and URem can be truncated if all the truncated bits are zero.
15762 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15763 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15764 return all_of(E.Scalars, [&](Value *V) {
15765 auto *I = cast<Instruction>(V);
15766 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15767 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15768 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15769 });
15770 };
15771 return TryProcessInstruction(
15772 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15773 }
15774
15775 // We can demote selects if we can demote their true and false values.
15776 case Instruction::Select: {
15777 return TryProcessInstruction(
15778 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15779 }
15780
15781 // We can demote phis if we can demote all their incoming operands. Note that
15782 // we don't need to worry about cycles since we ensure single use above.
15783 case Instruction::PHI: {
15784 const unsigned NumOps = E.getNumOperands();
15785 SmallVector<const TreeEntry *> Ops(NumOps);
15786 transform(seq<unsigned>(0, NumOps), Ops.begin(),
15787 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15788
15789 return TryProcessInstruction(BitWidth, Ops);
15790 }
15791
15792 case Instruction::Call: {
15793 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15794 if (!IC)
15795 break;
15796 Intrinsic::ID ID = IC->getIntrinsicID();
15797 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15798 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15799 break;
15800 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15801 function_ref<bool(unsigned, unsigned)> CallChecker;
15802 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15803 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15804 return all_of(E.Scalars, [&](Value *V) {
15805 auto *I = cast<Instruction>(V);
15806 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15807 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15808 return MaskedValueIsZero(I->getOperand(0), Mask,
15809 SimplifyQuery(*DL)) &&
15810 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15811 }
15812 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15813 "Expected min/max intrinsics only.");
15814 unsigned SignBits = OrigBitWidth - BitWidth;
15815 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15816 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15817 nullptr, DT);
15818 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15819 nullptr, DT);
15820 return SignBits <= Op0SignBits &&
15821 ((SignBits != Op0SignBits &&
15822 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15823 MaskedValueIsZero(I->getOperand(0), Mask,
15824 SimplifyQuery(*DL))) &&
15825 SignBits <= Op1SignBits &&
15826 ((SignBits != Op1SignBits &&
15827 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15828 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15829 });
15830 };
15831 if (ID != Intrinsic::abs) {
15832 Operands.push_back(getOperandEntry(&E, 1));
15833 CallChecker = CompChecker;
15834 }
15835 InstructionCost BestCost =
15836 std::numeric_limits<InstructionCost::CostType>::max();
15837 unsigned BestBitWidth = BitWidth;
15838 unsigned VF = E.Scalars.size();
15839 // Choose the best bitwidth based on cost estimations.
15840 auto Checker = [&](unsigned BitWidth, unsigned) {
15841 unsigned MinBW = PowerOf2Ceil(BitWidth);
15842 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15843 auto VecCallCosts = getVectorCallCosts(
15844 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
15845 TTI, TLI, ArgTys);
15846 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15847 if (Cost < BestCost) {
15848 BestCost = Cost;
15849 BestBitWidth = BitWidth;
15850 }
15851 return false;
15852 };
15853 [[maybe_unused]] bool NeedToExit;
15854 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15855 BitWidth = BestBitWidth;
15856 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15857 }
15858
15859 // Otherwise, conservatively give up.
15860 default:
15861 break;
15862 }
15863 MaxDepthLevel = 1;
15864 return FinalAnalysis();
15865}
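// Worked example (the IR names below are only for illustration): in
//
//   %x = zext i8 %a to i32
//   %y = zext i8 %b to i32
//   %s = add i32 %x, %y
//   %t = trunc i32 %s to i16
//
// the Trunc/ZExt cases mark the chain as profitable to demote and the Add is
// accepted because both of its operands can be demoted, so the whole
// expression can be narrowed to 16 bits; the caller then records the new width
// for the affected tree entries in MinBWs.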
15866
15867static RecurKind getRdxKind(Value *V);
15868
15869 void BoUpSLP::computeMinimumValueSizes() {
15870 // We only attempt to truncate integer expressions.
15871 bool IsStoreOrInsertElt =
15872 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15873 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15874 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15875 ExtraBitWidthNodes.size() <= 1 &&
15876 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15877 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15878 return;
15879
15880 unsigned NodeIdx = 0;
15881 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15882 NodeIdx = 1;
15883
15884 // Ensure the roots of the vectorizable tree don't form a cycle.
15885 if (VectorizableTree[NodeIdx]->isGather() ||
15886 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15887 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15888 [NodeIdx](const EdgeInfo &EI) {
15889 return EI.UserTE->Idx >
15890 static_cast<int>(NodeIdx);
15891 })))
15892 return;
15893
15894 // If the first value node for a store/insertelement is sext/zext/trunc, skip
15895 // it and resize to the final type.
15896 bool IsTruncRoot = false;
15897 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15898 SmallVector<unsigned> RootDemotes;
15899 if (NodeIdx != 0 &&
15900 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15901 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15902 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15903 IsTruncRoot = true;
15904 RootDemotes.push_back(NodeIdx);
15905 IsProfitableToDemoteRoot = true;
15906 ++NodeIdx;
15907 }
15908
15909 // The reduction was already analyzed and found not profitable - exit.
15910 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15911 return;
15912
15913 SmallVector<unsigned> ToDemote;
15914 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15915 bool IsProfitableToDemoteRoot, unsigned Opcode,
15916 unsigned Limit, bool IsTruncRoot,
15917 bool IsSignedCmp) -> unsigned {
15918 ToDemote.clear();
15919 // If the root is a trunc and the next node is a gather/buildvector, keep the
15920 // trunc in scalars, which is free in most cases.
15921 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15922 E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15923 all_of(E.Scalars, [&](Value *V) {
15924 return V->hasOneUse() || isa<Constant>(V) ||
15925 (!V->hasNUsesOrMore(UsesLimit) &&
15926 none_of(V->users(), [&](User *U) {
15927 const TreeEntry *TE = getTreeEntry(U);
15928 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15929 if (TE == UserTE || !TE)
15930 return false;
15931 unsigned UserTESz = DL->getTypeSizeInBits(
15932 UserTE->Scalars.front()->getType());
15933 auto It = MinBWs.find(TE);
15934 if (It != MinBWs.end() && It->second.first > UserTESz)
15935 return true;
15936 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
15937 }));
15938 })) {
15939 ToDemote.push_back(E.Idx);
15940 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15941 auto It = MinBWs.find(UserTE);
15942 if (It != MinBWs.end())
15943 return It->second.first;
15944 unsigned MaxBitWidth =
15945 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
15946 MaxBitWidth = bit_ceil(MaxBitWidth);
15947 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15948 MaxBitWidth = 8;
15949 return MaxBitWidth;
15950 }
15951
15952 unsigned VF = E.getVectorFactor();
15953 Type *ScalarTy = E.Scalars.front()->getType();
15954 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
15955 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
15956 if (!TreeRootIT || !Opcode)
15957 return 0u;
15958
15959 if (any_of(E.Scalars,
15960 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15961 return 0u;
15962
15963 unsigned NumParts = TTI->getNumberOfParts(
15964 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
15965
15966 // The maximum bit width required to represent all the values that can be
15967 // demoted without loss of precision. It would be safe to truncate the roots
15968 // of the expression to this width.
15969 unsigned MaxBitWidth = 1u;
15970
15971 // True if the roots can be zero-extended back to their original type,
15972 // rather than sign-extended. We know that if the leading bits are not
15973 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15974 // True.
15975 // Determine if the sign bit of all the roots is known to be zero. If not,
15976 // IsKnownPositive is set to False.
15977 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15978 KnownBits Known = computeKnownBits(R, *DL);
15979 return Known.isNonNegative();
15980 });
15981
15982 // We first check if all the bits of the roots are demanded. If they're not,
15983 // we can truncate the roots to this narrower type.
15984 for (Value *Root : E.Scalars) {
15985 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15986 TypeSize NumTypeBits =
15987 DL->getTypeSizeInBits(Root->getType()->getScalarType());
15988 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15989 // If we can't prove that the sign bit is zero, we must add one to the
15990 // maximum bit width to account for the unknown sign bit. This preserves
15991 // the existing sign bit so we can safely sign-extend the root back to the
15992 // original type. Otherwise, if we know the sign bit is zero, we will
15993 // zero-extend the root instead.
15994 //
15995 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15996 // one to the maximum bit width will yield a larger-than-necessary
15997 // type. In general, we need to add an extra bit only if we can't
15998 // prove that the upper bit of the original type is equal to the
15999 // upper bit of the proposed smaller type. If these two bits are
16000 // the same (either zero or one) we know that sign-extending from
16001 // the smaller type will result in the same value. Here, since we
16002 // can't yet prove this, we are just making the proposed smaller
16003 // type larger to ensure correctness.
16004 if (!IsKnownPositive)
16005 ++BitWidth1;
16006
16007 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
16008 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
16009 MaxBitWidth =
16010 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
16011 }
16012
16013 if (MaxBitWidth < 8 && MaxBitWidth > 1)
16014 MaxBitWidth = 8;
16015
16016 // If the original type is large, but the reduced type does not improve
16017 // register usage - ignore it.
16018 if (NumParts > 1 &&
16019 NumParts ==
16020 TTI->getNumberOfParts(getWidenedType(
16021 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
16022 return 0u;
16023
16024 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
16025 Opcode == Instruction::SExt ||
16026 Opcode == Instruction::ZExt || NumParts > 1;
16027 // Conservatively determine if we can actually truncate the roots of the
16028 // expression. Collect the values that can be demoted in ToDemote and
16029 // additional roots that require investigating in Roots.
16030 DenseSet<const TreeEntry *> Visited;
16031 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
16032 bool NeedToDemote = IsProfitableToDemote;
16033
16034 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
16035 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
16036 IsTruncRoot) ||
16037 (MaxDepthLevel <= Limit &&
16038 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
16039 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
16040 DL->getTypeSizeInBits(TreeRootIT) /
16041 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
16042 ->getOperand(0)
16043 ->getType()) >
16044 2)))))
16045 return 0u;
16046 // Round MaxBitWidth up to the next power-of-two.
16047 MaxBitWidth = bit_ceil(MaxBitWidth);
16048
16049 return MaxBitWidth;
16050 };
16051
16052 // If we can truncate the root, we must collect additional values that might
16053 // be demoted as a result. That is, those seeded by truncations we will
16054 // modify.
16055 // Add reduction ops sizes, if any.
16056 if (UserIgnoreList &&
16057 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
16058 for (Value *V : *UserIgnoreList) {
16059 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
16060 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
16061 unsigned BitWidth1 = NumTypeBits - NumSignBits;
16062 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
16063 ++BitWidth1;
16064 unsigned BitWidth2 = BitWidth1;
16065 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
16066 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
16067 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
16068 }
16069 ReductionBitWidth =
16070 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
16071 }
16072 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
16073 ReductionBitWidth = 8;
16074
16075 ReductionBitWidth = bit_ceil(ReductionBitWidth);
16076 }
16077 bool IsTopRoot = NodeIdx == 0;
16078 while (NodeIdx < VectorizableTree.size() &&
16079 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
16080 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
16081 RootDemotes.push_back(NodeIdx);
16082 ++NodeIdx;
16083 IsTruncRoot = true;
16084 }
16085 bool IsSignedCmp = false;
16086 while (NodeIdx < VectorizableTree.size()) {
16087 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
16088 unsigned Limit = 2;
16089 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
16090 if (IsTopRoot &&
16091 ReductionBitWidth ==
16092 DL->getTypeSizeInBits(
16093 VectorizableTree.front()->Scalars.front()->getType()))
16094 Limit = 3;
16095 unsigned MaxBitWidth = ComputeMaxBitWidth(
16096 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
16097 Limit, IsTruncRoot, IsSignedCmp);
16098 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
16099 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
16100 ReductionBitWidth = bit_ceil(MaxBitWidth);
16101 else if (MaxBitWidth == 0)
16102 ReductionBitWidth = 0;
16103 }
16104
16105 for (unsigned Idx : RootDemotes) {
16106 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
16107 uint32_t OrigBitWidth =
16108 DL->getTypeSizeInBits(V->getType()->getScalarType());
16109 if (OrigBitWidth > MaxBitWidth) {
16110 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
16111 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
16112 }
16113 return false;
16114 }))
16115 ToDemote.push_back(Idx);
16116 }
16117 RootDemotes.clear();
16118 IsTopRoot = false;
16119 IsProfitableToDemoteRoot = true;
16120
16121 if (ExtraBitWidthNodes.empty()) {
16122 NodeIdx = VectorizableTree.size();
16123 } else {
16124 unsigned NewIdx = 0;
16125 do {
16126 NewIdx = *ExtraBitWidthNodes.begin();
16127 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
16128 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
16129 NodeIdx = NewIdx;
16130 IsTruncRoot =
16131 NodeIdx < VectorizableTree.size() &&
16132 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
16133 [](const EdgeInfo &EI) {
16134 return EI.EdgeIdx == 0 &&
16135 EI.UserTE->getOpcode() == Instruction::Trunc &&
16136 !EI.UserTE->isAltShuffle();
16137 });
16138 IsSignedCmp =
16139 NodeIdx < VectorizableTree.size() &&
16140 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
16141 [&](const EdgeInfo &EI) {
16142 return EI.UserTE->getOpcode() == Instruction::ICmp &&
16143 any_of(EI.UserTE->Scalars, [&](Value *V) {
16144 auto *IC = dyn_cast<ICmpInst>(V);
16145 return IC &&
16146 (IC->isSigned() ||
16147 !isKnownNonNegative(IC->getOperand(0),
16148 SimplifyQuery(*DL)) ||
16149 !isKnownNonNegative(IC->getOperand(1),
16150 SimplifyQuery(*DL)));
16151 });
16152 });
16153 }
16154
16155 // If the maximum bit width we compute is less than the width of the roots'
16156 // type, we can proceed with the narrowing. Otherwise, do nothing.
16157 if (MaxBitWidth == 0 ||
16158 MaxBitWidth >=
16159 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
16160 ->getBitWidth()) {
16161 if (UserIgnoreList)
16162 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
16163 continue;
16164 }
16165
16166 // Finally, map the values we can demote to the maximum bit width we
16167 // computed.
16168 for (unsigned Idx : ToDemote) {
16169 TreeEntry *TE = VectorizableTree[Idx].get();
16170 if (MinBWs.contains(TE))
16171 continue;
16172 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
16173 return !isKnownNonNegative(R, SimplifyQuery(*DL));
16174 });
16175 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
16176 }
16177 }
16178}
16179
16180 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
16181 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
16182 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
16183 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
16184 auto *AA = &AM.getResult<AAManager>(F);
16185 auto *LI = &AM.getResult<LoopAnalysis>(F);
16186 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
16187 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
16188 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
16189 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
16190
16191 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
16192 if (!Changed)
16193 return PreservedAnalyses::all();
16194
16195 PreservedAnalyses PA;
16196 PA.preserveSet<CFGAnalyses>();
16197 return PA;
16198}
16199
16200 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
16201 TargetTransformInfo *TTI_,
16202 TargetLibraryInfo *TLI_, AAResults *AA_,
16203 LoopInfo *LI_, DominatorTree *DT_,
16204 AssumptionCache *AC_, DemandedBits *DB_,
16205 OptimizationRemarkEmitter *ORE_) {
16206 if (!RunSLPVectorization)
16207 return false;
16208 SE = SE_;
16209 TTI = TTI_;
16210 TLI = TLI_;
16211 AA = AA_;
16212 LI = LI_;
16213 DT = DT_;
16214 AC = AC_;
16215 DB = DB_;
16216 DL = &F.getDataLayout();
16217
16218 Stores.clear();
16219 GEPs.clear();
16220 bool Changed = false;
16221
16222 // If the target claims to have no vector registers don't attempt
16223 // vectorization.
16224 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
16225 LLVM_DEBUG(
16226 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
16227 return false;
16228 }
16229
16230 // Don't vectorize when the attribute NoImplicitFloat is used.
16231 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
16232 return false;
16233
16234 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
16235
16236 // Use the bottom up slp vectorizer to construct chains that start with
16237 // store instructions.
16238 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
16239
16240 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
16241 // delete instructions.
16242
16243 // Update DFS numbers now so that we can use them for ordering.
16244 DT->updateDFSNumbers();
16245
16246 // Scan the blocks in the function in post order.
16247 for (auto *BB : post_order(&F.getEntryBlock())) {
16248 // Start new block - clear the list of reduction roots.
16249 R.clearReductionData();
16250 collectSeedInstructions(BB);
16251
16252 // Vectorize trees that end at stores.
16253 if (!Stores.empty()) {
16254 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
16255 << " underlying objects.\n");
16256 Changed |= vectorizeStoreChains(R);
16257 }
16258
16259 // Vectorize trees that end at reductions.
16260 Changed |= vectorizeChainsInBlock(BB, R);
16261
16262 // Vectorize the index computations of getelementptr instructions. This
16263 // is primarily intended to catch gather-like idioms ending at
16264 // non-consecutive loads.
16265 if (!GEPs.empty()) {
16266 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
16267 << " underlying objects.\n");
16268 Changed |= vectorizeGEPIndices(BB, R);
16269 }
16270 }
16271
16272 if (Changed) {
16273 R.optimizeGatherSequence();
16274 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
16275 }
16276 return Changed;
16277}
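// Overall flow: each block is processed independently - store chains first,
// then in-block vectorization including horizontal reductions, then GEP index
// computations - and optimizeGatherSequence() runs once at the end to clean up
// the gather/shuffle sequences emitted while vectorizing.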
16278
16279std::optional<bool>
16280SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
16281 unsigned Idx, unsigned MinVF,
16282 unsigned &Size) {
16283 Size = 0;
16284 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
16285 << "\n");
16286 const unsigned Sz = R.getVectorElementSize(Chain[0]);
16287 unsigned VF = Chain.size();
16288
16289 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
16290 // Check if vectorizing with a non-power-of-2 VF should be considered. At
16291 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
16292 // all vector lanes are used.
16293 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
16294 return false;
16295 }
16296
16297 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
16298 << "\n");
16299
16300 SetVector<Value *> ValOps;
16301 for (Value *V : Chain)
16302 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
16303 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
16304 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
16305 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
16306 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
16307 bool IsPowerOf2 =
16308 isPowerOf2_32(ValOps.size()) ||
16309 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
16310 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
16311 (!S.MainOp->isSafeToRemove() ||
16312 any_of(ValOps.getArrayRef(),
16313 [&](Value *V) {
16314 return !isa<ExtractElementInst>(V) &&
16315 (V->getNumUses() > Chain.size() ||
16316 any_of(V->users(), [&](User *U) {
16317 return !Stores.contains(U);
16318 }));
16319 }))) ||
16320 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
16321 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
16322 return false;
16323 }
16324 }
16325 if (R.isLoadCombineCandidate(Chain))
16326 return true;
16327 R.buildTree(Chain);
16328 // Check if the tree is tiny and the store itself or its value is not vectorized.
16329 if (R.isTreeTinyAndNotFullyVectorizable()) {
16330 if (R.isGathered(Chain.front()) ||
16331 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
16332 return std::nullopt;
16333 Size = R.getTreeSize();
16334 return false;
16335 }
16336 R.reorderTopToBottom();
16337 R.reorderBottomToTop();
16338 R.transformNodes();
16339 R.buildExternalUses();
16340
16341 R.computeMinimumValueSizes();
16342
16343 Size = R.getTreeSize();
16344 if (S.getOpcode() == Instruction::Load)
16345 Size = 2; // cut off masked gather small trees
16346 InstructionCost Cost = R.getTreeCost();
16347
16348 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
16349 if (Cost < -SLPCostThreshold) {
16350 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
16351
16352 using namespace ore;
16353
16354 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
16355 cast<StoreInst>(Chain[0]))
16356 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
16357 << " and with tree size "
16358 << NV("TreeSize", R.getTreeSize()));
16359
16360 R.vectorizeTree();
16361 return true;
16362 }
16363
16364 return false;
16365}
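// Note on the cost check above: getTreeCost() returns the estimated
// vector-minus-scalar cost, so a negative value means the vectorized form is
// cheaper. Comparing against -SLPCostThreshold therefore requires the
// estimated saving to exceed the slp-threshold option before the chain is
// actually vectorized.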
16366
16367/// Checks if the quadratic mean (root-mean-square) deviation of the tree sizes is less than one ninth of the mean size.
16368static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
16369 bool First) {
16370 unsigned Num = 0;
16371 uint64_t Sum = std::accumulate(
16372 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16373 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16374 unsigned Size = First ? Val.first : Val.second;
16375 if (Size == 1)
16376 return V;
16377 ++Num;
16378 return V + Size;
16379 });
16380 if (Num == 0)
16381 return true;
16382 uint64_t Mean = Sum / Num;
16383 if (Mean == 0)
16384 return true;
16385 uint64_t Dev = std::accumulate(
16386 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16387 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16388 unsigned P = First ? Val.first : Val.second;
16389 if (P == 1)
16390 return V;
16391 return V + (P - Mean) * (P - Mean);
16392 }) /
16393 Num;
16394 return Dev * 81 / (Mean * Mean) == 0;
16395}
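// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): a standalone rendition of the size-uniformity filter
// above. It flattens the (first, second) pair selection into a plain vector of
// sizes but keeps the same rule: entries equal to 1 are ignored and the slice
// is accepted only when the variance is below Mean*Mean/81, i.e. the RMS
// deviation is under roughly one ninth of the mean.
#include <cstdint>
#include <vector>

static bool treeSizesAreUniform(const std::vector<unsigned> &Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      Sum += S;
      ++Num;
    }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      int64_t D = static_cast<int64_t>(S) - static_cast<int64_t>(Mean);
      Dev += static_cast<uint64_t>(D * D);
    }
  Dev /= Num;
  // Example: {4, 4, 4, 4} passes (Dev == 0); {2, 8, 2, 8} fails
  // (Mean == 5, Dev == 9, and 9 * 81 >= 5 * 5).
  return Dev * 81 < Mean * Mean;
}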
16396
16397bool SLPVectorizerPass::vectorizeStores(
16398 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16399 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16400 &Visited) {
16401 // We may run into multiple chains that merge into a single chain. We mark the
16402 // stores that we vectorized so that we don't visit the same store twice.
16403 BoUpSLP::ValueSet VectorizedStores;
16404 bool Changed = false;
16405
16406 struct StoreDistCompare {
16407 bool operator()(const std::pair<unsigned, int> &Op1,
16408 const std::pair<unsigned, int> &Op2) const {
16409 return Op1.second < Op2.second;
16410 }
16411 };
16412 // A set of pairs (index of store in Stores array ref, Distance of the store
16413 // address relative to base store address in units).
16414 using StoreIndexToDistSet =
16415 std::set<std::pair<unsigned, int>, StoreDistCompare>;
16416 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16417 int PrevDist = -1;
16418 BoUpSLP::ValueList Operands;
16419 // Collect the chain into a list.
16420 for (auto [Idx, Data] : enumerate(Set)) {
16421 if (Operands.empty() || Data.second - PrevDist == 1) {
16422 Operands.push_back(Stores[Data.first]);
16423 PrevDist = Data.second;
16424 if (Idx != Set.size() - 1)
16425 continue;
16426 }
16427 auto E = make_scope_exit([&, &DataVar = Data]() {
16428 Operands.clear();
16429 Operands.push_back(Stores[DataVar.first]);
16430 PrevDist = DataVar.second;
16431 });
16432
16433 if (Operands.size() <= 1 ||
16434 !Visited
16435 .insert({Operands.front(),
16436 cast<StoreInst>(Operands.front())->getValueOperand(),
16437 Operands.back(),
16438 cast<StoreInst>(Operands.back())->getValueOperand(),
16439 Operands.size()})
16440 .second)
16441 continue;
16442
16443 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16444 unsigned EltSize = R.getVectorElementSize(Operands[0]);
16445 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
16446
16447 unsigned MaxVF =
16448 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
16449 unsigned MaxRegVF = MaxVF;
16450 auto *Store = cast<StoreInst>(Operands[0]);
16451 Type *StoreTy = Store->getValueOperand()->getType();
16452 Type *ValueTy = StoreTy;
16453 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
16454 ValueTy = Trunc->getSrcTy();
16455 if (ValueTy == StoreTy &&
16456 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
16457 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
16458 unsigned MinVF = std::max<unsigned>(
16459 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
16460 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
16461 ValueTy)));
16462
16463 if (MaxVF < MinVF) {
16464 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16465 << ") < "
16466 << "MinVF (" << MinVF << ")\n");
16467 continue;
16468 }
16469
16470 unsigned NonPowerOf2VF = 0;
16471 if (VectorizeNonPowerOf2) {
16472 // First try vectorizing with a non-power-of-2 VF. At the moment, only
16473 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16474 // lanes are used.
16475 unsigned CandVF = Operands.size();
16476 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
16477 NonPowerOf2VF = CandVF;
16478 }
16479
16480 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
16481 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16482 unsigned Size = MinVF;
16483 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
16484 VF = Size > MaxVF ? NonPowerOf2VF : Size;
16485 Size *= 2;
16486 });
16487 unsigned End = Operands.size();
16488 unsigned Repeat = 0;
16489 constexpr unsigned MaxAttempts = 4;
16490 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
16491 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
16492 P.first = P.second = 1;
16493 });
16494 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
16495 auto IsNotVectorized = [](bool First,
16496 const std::pair<unsigned, unsigned> &P) {
16497 return First ? P.first > 0 : P.second > 0;
16498 };
16499 auto IsVectorized = [](bool First,
16500 const std::pair<unsigned, unsigned> &P) {
16501 return First ? P.first == 0 : P.second == 0;
16502 };
16503 auto VFIsProfitable = [](bool First, unsigned Size,
16504 const std::pair<unsigned, unsigned> &P) {
16505 return First ? Size >= P.first : Size >= P.second;
16506 };
16507 auto FirstSizeSame = [](unsigned Size,
16508 const std::pair<unsigned, unsigned> &P) {
16509 return Size == P.first;
16510 };
16511 while (true) {
16512 ++Repeat;
16513 bool RepeatChanged = false;
16514 bool AnyProfitableGraph = false;
16515 for (unsigned Size : CandidateVFs) {
16516 AnyProfitableGraph = false;
16517 unsigned StartIdx = std::distance(
16518 RangeSizes.begin(),
16519 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
16520 std::placeholders::_1)));
16521 while (StartIdx < End) {
16522 unsigned EndIdx =
16523 std::distance(RangeSizes.begin(),
16524 find_if(RangeSizes.drop_front(StartIdx),
16525 std::bind(IsVectorized, Size >= MaxRegVF,
16526 std::placeholders::_1)));
16527 unsigned Sz = EndIdx >= End ? End : EndIdx;
16528 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16529 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
16530 Size >= MaxRegVF)) {
16531 ++Cnt;
16532 continue;
16533 }
16534 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
16535 assert(all_of(Slice,
16536 [&](Value *V) {
16537 return cast<StoreInst>(V)
16538 ->getValueOperand()
16539 ->getType() ==
16540 cast<StoreInst>(Slice.front())
16541 ->getValueOperand()
16542 ->getType();
16543 }) &&
16544 "Expected all operands of same type.");
16545 if (!NonSchedulable.empty()) {
16546 auto [NonSchedSizeMax, NonSchedSizeMin] =
16547 NonSchedulable.lookup(Slice.front());
16548 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16549 Cnt += NonSchedSizeMax;
16550 continue;
16551 }
16552 }
16553 unsigned TreeSize;
16554 std::optional<bool> Res =
16555 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
16556 if (!Res) {
16557 NonSchedulable
16558 .try_emplace(Slice.front(), std::make_pair(Size, Size))
16559 .first->getSecond()
16560 .second = Size;
16561 } else if (*Res) {
16562 // Mark the vectorized stores so that we don't vectorize them
16563 // again.
16564 VectorizedStores.insert(Slice.begin(), Slice.end());
16567 AnyProfitableGraph = RepeatChanged = Changed = true;
16568 // If we vectorized initial block, no need to try to vectorize
16569 // it again.
16570 for_each(RangeSizes.slice(Cnt, Size),
16571 [](std::pair<unsigned, unsigned> &P) {
16572 P.first = P.second = 0;
16573 });
16574 if (Cnt < StartIdx + MinVF) {
16575 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
16576 [](std::pair<unsigned, unsigned> &P) {
16577 P.first = P.second = 0;
16578 });
16579 StartIdx = Cnt + Size;
16580 }
16581 if (Cnt > Sz - Size - MinVF) {
16582 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
16583 [](std::pair<unsigned, unsigned> &P) {
16584 P.first = P.second = 0;
16585 });
16586 if (Sz == End)
16587 End = Cnt;
16588 Sz = Cnt;
16589 }
16590 Cnt += Size;
16591 continue;
16592 }
16593 if (Size > 2 && Res &&
16594 !all_of(RangeSizes.slice(Cnt, Size),
16595 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
16596 std::placeholders::_1))) {
16597 Cnt += Size;
16598 continue;
16599 }
16600 // For very big VFs, check that we are not rebuilding the same
16601 // trees, just with a larger number of elements.
16602 if (Size > MaxRegVF && TreeSize > 1 &&
16603 all_of(RangeSizes.slice(Cnt, Size),
16604 std::bind(FirstSizeSame, TreeSize,
16605 std::placeholders::_1))) {
16606 Cnt += Size;
16607 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16608 ++Cnt;
16609 continue;
16610 }
16611 if (TreeSize > 1)
16612 for_each(RangeSizes.slice(Cnt, Size),
16613 [&](std::pair<unsigned, unsigned> &P) {
16614 if (Size >= MaxRegVF)
16615 P.second = std::max(P.second, TreeSize);
16616 else
16617 P.first = std::max(P.first, TreeSize);
16618 });
16619 ++Cnt;
16620 AnyProfitableGraph = true;
16621 }
16622 if (StartIdx >= End)
16623 break;
16624 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16625 AnyProfitableGraph = true;
16626 StartIdx = std::distance(
16627 RangeSizes.begin(),
16628 find_if(RangeSizes.drop_front(Sz),
16629 std::bind(IsNotVectorized, Size >= MaxRegVF,
16630 std::placeholders::_1)));
16631 }
16632 if (!AnyProfitableGraph && Size >= MaxRegVF)
16633 break;
16634 }
16635 // All values vectorized - exit.
16636 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16637 return P.first == 0 && P.second == 0;
16638 }))
16639 break;
16640 // Check if we have tried all attempts or the remaining attempts are not needed at all.
16641 if (Repeat >= MaxAttempts ||
16642 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16643 break;
16644 constexpr unsigned StoresLimit = 64;
16645 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16646 Operands.size(),
16647 static_cast<unsigned>(
16648 End -
16649 std::distance(
16650 RangeSizes.begin(),
16651 find_if(RangeSizes, std::bind(IsNotVectorized, true,
16652 std::placeholders::_1))) +
16653 1)));
16654 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16655 if (VF > MaxTotalNum || VF >= StoresLimit)
16656 break;
16657 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16658 if (P.first != 0)
16659 P.first = std::max(P.second, P.first);
16660 });
16661 // Last attempt to vectorize the maximum number of elements, if all
16662 // previous attempts were unsuccessful because of cost issues.
16663 CandidateVFs.clear();
16664 CandidateVFs.push_back(VF);
16665 }
16666 }
16667 };
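// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): how the CandidateVFs ladder inside TryToVectorize is
// formed, assuming MinVF and MaxVF are powers of two. The vector is filled
// back-to-front with MinVF, 2*MinVF, ..., MaxVF, and an optional
// non-power-of-2 VF is left in front so it is attempted first. The helper
// name is hypothetical.
#include <algorithm>
#include <cmath>
#include <vector>

static std::vector<unsigned> buildCandidateVFs(unsigned MinVF, unsigned MaxVF,
                                               unsigned NonPowerOf2VF) {
  unsigned Sz = 1 + static_cast<unsigned>(std::log2(MaxVF)) -
                static_cast<unsigned>(std::log2(MinVF));
  std::vector<unsigned> VFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
  unsigned Size = MinVF;
  std::for_each(VFs.rbegin(), VFs.rend(), [&](unsigned &VF) {
    VF = Size > MaxVF ? NonPowerOf2VF : Size;
    Size *= 2;
  });
  return VFs; // e.g. buildCandidateVFs(2, 8, 7) == {7, 8, 4, 2}
}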
16668
16669 // Stores a pair (first: index of the store in the Stores array ref whose
16670 // address is taken as the base; second: sorted set of pairs {index, dist},
16671 // i.e. indices of stores in the set and their store location distances
16672 // relative to the base address).
16673
16674 // Need to store the index of the very first store separately, since the set
16675 // may be reordered after the insertion and the first store may be moved. This
16676 // container allows us to reduce the number of calls to getPointersDiff().
16677 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16678 // Inserts the specified store SI with the given index Idx into the set of
16679 // stores. If a store with the same distance is already present, stop the
16680 // insertion and try to vectorize the stores found so far. If some stores from
16681 // this sequence were not vectorized, try to vectorize them with the new store
16682 // later. But this logic is applied only to the stores that come before the
16683 // previous store with the same distance.
16684 // Example:
16685 // 1. store x, %p
16686 // 2. store y, %p+1
16687 // 3. store z, %p+2
16688 // 4. store a, %p
16689 // 5. store b, %p+3
16690 // - Scan this from the last to first store. The very first bunch of stores is
16691 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16692 // vector).
16693 // - The next store in the list - #1 - has the same distance from store #5 as
16694 // the store #4.
16695 // - Try to vectorize sequence of stores 4,2,3,5.
16696 // - If all these stores are vectorized - just drop them.
16697 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16698 // - Start new stores sequence.
16699 // The new bunch of stores is {1, {1, 0}}.
16700 // - Add the stores from previous sequence, that were not vectorized.
16701 // Here we consider the stores in reversed order relative to how they appear
16702 // in the IR (Stores are reversed already, see vectorizeStoreChains()).
16703 // Store #3 can be added -> comes after store #4 with the same distance as
16704 // store #1.
16705 // Store #5 cannot be added - comes before store #4.
16706 // This logic improves compile time: we assume that the stores coming after a
16707 // previous store with the same distance most likely have memory dependencies,
16708 // so there is no need to waste compile time trying to vectorize them.
16709 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16710 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16711 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16712 std::optional<int> Diff = getPointersDiff(
16713 Stores[Set.first]->getValueOperand()->getType(),
16714 Stores[Set.first]->getPointerOperand(),
16715 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16716 /*StrictCheck=*/true);
16717 if (!Diff)
16718 continue;
16719 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16720 if (It == Set.second.end()) {
16721 Set.second.emplace(Idx, *Diff);
16722 return;
16723 }
16724 // Try to vectorize the first found set to avoid duplicate analysis.
16725 TryToVectorize(Set.second);
16726 StoreIndexToDistSet PrevSet;
16727 PrevSet.swap(Set.second);
16728 Set.first = Idx;
16729 Set.second.emplace(Idx, 0);
16730 // Insert stores that followed previous match to try to vectorize them
16731 // with this store.
16732 unsigned StartIdx = It->first + 1;
16733 SmallBitVector UsedStores(Idx - StartIdx);
16734 // Distances to previously found dup store (or this store, since they
16735 // store to the same addresses).
16736 SmallVector<int> Dists(Idx - StartIdx, 0);
16737 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16738 // Do not try to vectorize sequences we have already tried.
16739 if (Pair.first <= It->first ||
16740 VectorizedStores.contains(Stores[Pair.first]))
16741 break;
16742 unsigned BI = Pair.first - StartIdx;
16743 UsedStores.set(BI);
16744 Dists[BI] = Pair.second - It->second;
16745 }
16746 for (unsigned I = StartIdx; I < Idx; ++I) {
16747 unsigned BI = I - StartIdx;
16748 if (UsedStores.test(BI))
16749 Set.second.emplace(I, Dists[BI]);
16750 }
16751 return;
16752 }
16753 auto &Res = SortedStores.emplace_back();
16754 Res.first = Idx;
16755 Res.second.emplace(Idx, 0);
16756 };
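// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): the core of the FillStoresSet bookkeeping above. Stores
// are kept as (index, distance-from-base) pairs ordered purely by distance, so
// inserting a store whose distance is already present fails - that is the
// signal that a duplicate address was reached and the collected sequence
// should be handed to TryToVectorize. Names are hypothetical, and the real
// code additionally re-seeds the set with the not-yet-vectorized tail.
#include <set>
#include <utility>

struct DistOnlyCompare {
  bool operator()(const std::pair<unsigned, int> &A,
                  const std::pair<unsigned, int> &B) const {
    return A.second < B.second; // order (and deduplicate) by distance only
  }
};
using StoreDistSet = std::set<std::pair<unsigned, int>, DistOnlyCompare>;

// Returns true if the store was added, false if a store with the same
// distance from the base already exists.
static bool addStore(StoreDistSet &Set, unsigned Idx, int Dist) {
  return Set.emplace(Idx, Dist).second;
}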
16757 Type *PrevValTy = nullptr;
16758 for (auto [I, SI] : enumerate(Stores)) {
16759 if (R.isDeleted(SI))
16760 continue;
16761 if (!PrevValTy)
16762 PrevValTy = SI->getValueOperand()->getType();
16763 // Check that we do not try to vectorize stores of different types.
16764 if (PrevValTy != SI->getValueOperand()->getType()) {
16765 for (auto &Set : SortedStores)
16766 TryToVectorize(Set.second);
16767 SortedStores.clear();
16768 PrevValTy = SI->getValueOperand()->getType();
16769 }
16770 FillStoresSet(I, SI);
16771 }
16772
16773 // Final vectorization attempt.
16774 for (auto &Set : SortedStores)
16775 TryToVectorize(Set.second);
16776
16777 return Changed;
16778}
16779
16780void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16781 // Initialize the collections. We will make a single pass over the block.
16782 Stores.clear();
16783 GEPs.clear();
16784
16785 // Visit the store and getelementptr instructions in BB and organize them in
16786 // Stores and GEPs according to the underlying objects of their pointer
16787 // operands.
16788 for (Instruction &I : *BB) {
16789 // Ignore store instructions that are volatile or have a pointer operand
16790 // that doesn't point to a scalar type.
16791 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16792 if (!SI->isSimple())
16793 continue;
16794 if (!isValidElementType(SI->getValueOperand()->getType()))
16795 continue;
16796 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16797 }
16798
16799 // Ignore getelementptr instructions that have more than one index, a
16800 // constant index, or a pointer operand that doesn't point to a scalar
16801 // type.
16802 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16803 if (GEP->getNumIndices() != 1)
16804 continue;
16805 Value *Idx = GEP->idx_begin()->get();
16806 if (isa<Constant>(Idx))
16807 continue;
16808 if (!isValidElementType(Idx->getType()))
16809 continue;
16810 if (GEP->getType()->isVectorTy())
16811 continue;
16812 GEPs[GEP->getPointerOperand()].push_back(GEP);
16813 }
16814 }
16815}
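// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): the seed collection above buckets candidate stores by
// the underlying object of their pointer operand, so only stores that could
// form a consecutive chain are analyzed together. A minimal standalone model
// with a string key standing in for the result of getUnderlyingObject():
#include <map>
#include <string>
#include <vector>

struct SeedStores {
  // Underlying object -> offsets of the collected stores, in program order.
  std::map<std::string, std::vector<long>> Buckets;
  void add(const std::string &UnderlyingObject, long Offset) {
    Buckets[UnderlyingObject].push_back(Offset);
  }
};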
16816
16817bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16818 bool MaxVFOnly) {
16819 if (VL.size() < 2)
16820 return false;
16821
16822 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16823 << VL.size() << ".\n");
16824
16825 // Check that all of the parts are instructions of the same type,
16826 // we permit an alternate opcode via InstructionsState.
16827 InstructionsState S = getSameOpcode(VL, *TLI);
16828 if (!S.getOpcode())
16829 return false;
16830
16831 Instruction *I0 = cast<Instruction>(S.OpValue);
16832 // Make sure invalid types (including vector type) are rejected before
16833 // determining vectorization factor for scalar instructions.
16834 for (Value *V : VL) {
16835 Type *Ty = V->getType();
16836 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16837 // NOTE: the following will give the user an internal LLVM type name, which
16838 // may not be useful.
16839 R.getORE()->emit([&]() {
16840 std::string TypeStr;
16841 llvm::raw_string_ostream rso(TypeStr);
16842 Ty->print(rso);
16843 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16844 << "Cannot SLP vectorize list: type "
16845 << TypeStr + " is unsupported by vectorizer";
16846 });
16847 return false;
16848 }
16849 }
16850
16851 unsigned Sz = R.getVectorElementSize(I0);
16852 unsigned MinVF = R.getMinVF(Sz);
16853 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16854 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16855 if (MaxVF < 2) {
16856 R.getORE()->emit([&]() {
16857 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16858 << "Cannot SLP vectorize list: vectorization factor "
16859 << "less than 2 is not supported";
16860 });
16861 return false;
16862 }
16863
16864 bool Changed = false;
16865 bool CandidateFound = false;
16866 InstructionCost MinCost = SLPCostThreshold.getValue();
16867 Type *ScalarTy = VL[0]->getType();
16868 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16869 ScalarTy = IE->getOperand(1)->getType();
16870
16871 unsigned NextInst = 0, MaxInst = VL.size();
16872 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16873 // No actual vectorization should happen if the number of parts is the same
16874 // as the provided vectorization factor (i.e. the scalar type is used for
16875 // vector code during codegen).
16876 auto *VecTy = getWidenedType(ScalarTy, VF);
16877 if (TTI->getNumberOfParts(VecTy) == VF)
16878 continue;
16879 for (unsigned I = NextInst; I < MaxInst; ++I) {
16880 unsigned ActualVF = std::min(MaxInst - I, VF);
16881
16882 if (!isPowerOf2_32(ActualVF))
16883 continue;
16884
16885 if (MaxVFOnly && ActualVF < MaxVF)
16886 break;
16887 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16888 break;
16889
16890 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16891 // Check that a previous iteration of this loop did not delete the Value.
16892 if (llvm::any_of(Ops, [&R](Value *V) {
16893 auto *I = dyn_cast<Instruction>(V);
16894 return I && R.isDeleted(I);
16895 }))
16896 continue;
16897
16898 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16899 << "\n");
16900
16901 R.buildTree(Ops);
16902 if (R.isTreeTinyAndNotFullyVectorizable())
16903 continue;
16904 R.reorderTopToBottom();
16905 R.reorderBottomToTop(
16906 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16907 !R.doesRootHaveInTreeUses());
16908 R.transformNodes();
16909 R.buildExternalUses();
16910
16911 R.computeMinimumValueSizes();
16912 InstructionCost Cost = R.getTreeCost();
16913 CandidateFound = true;
16914 MinCost = std::min(MinCost, Cost);
16915
16916 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16917 << " for VF=" << ActualVF << "\n");
16918 if (Cost < -SLPCostThreshold) {
16919 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16920 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16921 cast<Instruction>(Ops[0]))
16922 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16923 << " and with tree size "
16924 << ore::NV("TreeSize", R.getTreeSize()));
16925
16926 R.vectorizeTree();
16927 // Move to the next bundle.
16928 I += VF - 1;
16929 NextInst = I + 1;
16930 Changed = true;
16931 }
16932 }
16933 }
16934
16935 if (!Changed && CandidateFound) {
16936 R.getORE()->emit([&]() {
16937 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16938 << "List vectorization was possible but not beneficial with cost "
16939 << ore::NV("Cost", MinCost) << " >= "
16940 << ore::NV("Treshold", -SLPCostThreshold);
16941 });
16942 } else if (!Changed) {
16943 R.getORE()->emit([&]() {
16944 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16945 << "Cannot SLP vectorize list: vectorization was impossible"
16946 << " with available vectorization factors";
16947 });
16948 }
16949 return Changed;
16950}
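// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): a deliberately simplified model of the VF-descent loop
// above. Starting from the largest power-of-2 VF that fits the list, it carves
// power-of-2 slices and halves the VF for whatever is left; the real loop also
// clamps the slice to the remaining values, honors MaxVFOnly, and skips
// deleted instructions. The helper name is hypothetical.
#include <utility>
#include <vector>

static std::vector<std::pair<unsigned, unsigned>>
planSlices(unsigned NumValues, unsigned MinVF, unsigned MaxVF) {
  std::vector<std::pair<unsigned, unsigned>> Slices; // (start index, VF)
  unsigned Next = 0;
  for (unsigned VF = MaxVF; Next + 1 < NumValues && VF >= MinVF; VF /= 2)
    for (unsigned I = Next; I + VF <= NumValues; I += VF) {
      Slices.emplace_back(I, VF);
      Next = I + VF;
    }
  return Slices; // e.g. planSlices(7, 2, 4) == {{0, 4}, {4, 2}}
}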
16951
16952bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16953 if (!I)
16954 return false;
16955
16956 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16957 return false;
16958
16959 Value *P = I->getParent();
16960
16961 // Vectorize in current basic block only.
16962 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16963 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16964 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16965 return false;
16966
16967 // First collect all possible candidates
16968 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16969 Candidates.emplace_back(Op0, Op1);
16970
16971 auto *A = dyn_cast<BinaryOperator>(Op0);
16972 auto *B = dyn_cast<BinaryOperator>(Op1);
16973 // Try to skip B.
16974 if (A && B && B->hasOneUse()) {
16975 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16976 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16977 if (B0 && B0->getParent() == P)
16978 Candidates.emplace_back(A, B0);
16979 if (B1 && B1->getParent() == P)
16980 Candidates.emplace_back(A, B1);
16981 }
16982 // Try to skip A.
16983 if (B && A && A->hasOneUse()) {
16984 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16985 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16986 if (A0 && A0->getParent() == P)
16987 Candidates.emplace_back(A0, B);
16988 if (A1 && A1->getParent() == P)
16989 Candidates.emplace_back(A1, B);
16990 }
16991
16992 if (Candidates.size() == 1)
16993 return tryToVectorizeList({Op0, Op1}, R);
16994
16995 // We have multiple options. Try to pick the single best.
16996 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16997 if (!BestCandidate)
16998 return false;
16999 return tryToVectorizeList(
17000 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
17001}
17002
17003namespace {
17004
17005/// Model horizontal reductions.
17006///
17007/// A horizontal reduction is a tree of reduction instructions that has values
17008/// that can be put into a vector as its leaves. For example:
17009///
17010/// mul mul mul mul
17011/// \ / \ /
17012/// + +
17013/// \ /
17014/// +
17015/// This tree has "mul" as its leaf values and "+" as its reduction
17016/// instructions. A reduction can feed into a store or a binary operation
17017/// feeding a phi.
17018/// ...
17019/// \ /
17020/// +
17021/// |
17022/// phi +=
17023///
17024/// Or:
17025/// ...
17026/// \ /
17027/// +
17028/// |
17029/// *p =
17030///
17031class HorizontalReduction {
17032 using ReductionOpsType = SmallVector<Value *, 16>;
17033 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
17034 ReductionOpsListType ReductionOps;
17035 /// List of possibly reduced values.
17036 SmallVector<SmallVector<Value *>> ReducedVals;
17037 /// Maps reduced value to the corresponding reduction operation.
17038 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
17039 WeakTrackingVH ReductionRoot;
17040 /// The type of reduction operation.
17041 RecurKind RdxKind;
17042 /// Checks if the optimization of original scalar identity operations on
17043 /// matched horizontal reductions is enabled and allowed.
17044 bool IsSupportedHorRdxIdentityOp = false;
17045
17046 static bool isCmpSelMinMax(Instruction *I) {
17047 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
17048 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
17049 }
17050
17051 // And/or are potentially poison-safe logical patterns like:
17052 // select x, y, false
17053 // select x, true, y
17054 static bool isBoolLogicOp(Instruction *I) {
17055 return isa<SelectInst>(I) &&
17056 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
17057 }
17058
17059 /// Checks if instruction is associative and can be vectorized.
17060 static bool isVectorizable(RecurKind Kind, Instruction *I) {
17061 if (Kind == RecurKind::None)
17062 return false;
17063
17064 // Integer ops that map to select instructions or intrinsics are fine.
17065 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
17066 isBoolLogicOp(I))
17067 return true;
17068
17069 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
17070 // FP min/max are associative except for NaN and -0.0. We do not
17071 // have to rule out -0.0 here because the intrinsic semantics do not
17072 // specify a fixed result for it.
17073 return I->getFastMathFlags().noNaNs();
17074 }
17075
17076 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
17077 return true;
17078
17079 return I->isAssociative();
17080 }
17081
17082 static Value *getRdxOperand(Instruction *I, unsigned Index) {
17083 // Poison-safe 'or' takes the form: select X, true, Y
17084 // To make that work with the normal operand processing, we skip the
17085 // true value operand.
17086 // TODO: Change the code and data structures to handle this without a hack.
17087 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
17088 return I->getOperand(2);
17089 return I->getOperand(Index);
17090 }
17091
17092 /// Creates reduction operation with the current opcode.
17093 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
17094 Value *RHS, const Twine &Name, bool UseSelect) {
17095 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
17096 switch (Kind) {
17097 case RecurKind::Or:
17098 if (UseSelect &&
17099 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
17100 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
17101 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
17102 Name);
17103 case RecurKind::And:
17104 if (UseSelect &&
17105 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
17106 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
17107 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
17108 Name);
17109 case RecurKind::Add:
17110 case RecurKind::Mul:
17111 case RecurKind::Xor:
17112 case RecurKind::FAdd:
17113 case RecurKind::FMul:
17114 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
17115 Name);
17116 case RecurKind::FMax:
17117 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
17118 case RecurKind::FMin:
17119 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
17120 case RecurKind::FMaximum:
17121 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
17122 case RecurKind::FMinimum:
17123 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
17124 case RecurKind::SMax:
17125 if (UseSelect) {
17126 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
17127 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
17128 }
17129 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
17130 case RecurKind::SMin:
17131 if (UseSelect) {
17132 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
17133 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
17134 }
17135 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
17136 case RecurKind::UMax:
17137 if (UseSelect) {
17138 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
17139 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
17140 }
17141 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
17142 case RecurKind::UMin:
17143 if (UseSelect) {
17144 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
17145 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
17146 }
17147 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
17148 default:
17149 llvm_unreachable("Unknown reduction operation.");
17150 }
17151 }
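// Editor's note (illustrative, not part of the upstream SLPVectorizer.cpp):
// why the And/Or cases above prefer the select form for i1 values. For
// booleans, "select x, true, y" computes x | y and "select x, y, false"
// computes x & y, but unlike the plain binary operators they do not propagate
// poison from y once x alone decides the result - the same poison-safe
// patterns recognized by isBoolLogicOp().
static inline bool logicalOr(bool X, bool Y) { return X ? true : Y; }
static inline bool logicalAnd(bool X, bool Y) { return X ? Y : false; }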
17152
17153 /// Creates reduction operation with the current opcode with the IR flags
17154 /// from \p ReductionOps, dropping nuw/nsw flags.
17155 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
17156 Value *RHS, const Twine &Name,
17157 const ReductionOpsListType &ReductionOps) {
17158 bool UseSelect = ReductionOps.size() == 2 ||
17159 // Logical or/and.
17160 (ReductionOps.size() == 1 &&
17161 any_of(ReductionOps.front(), IsaPred<SelectInst>));
17162 assert((!UseSelect || ReductionOps.size() != 2 ||
17163 isa<SelectInst>(ReductionOps[1][0])) &&
17164 "Expected cmp + select pairs for reduction");
17165 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
17166 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
17167 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
17168 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
17169 /*IncludeWrapFlags=*/false);
17170 propagateIRFlags(Op, ReductionOps[1], nullptr,
17171 /*IncludeWrapFlags=*/false);
17172 return Op;
17173 }
17174 }
17175 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
17176 return Op;
17177 }
17178
17179public:
17180 static RecurKind getRdxKind(Value *V) {
17181 auto *I = dyn_cast<Instruction>(V);
17182 if (!I)
17183 return RecurKind::None;
17184 if (match(I, m_Add(m_Value(), m_Value())))
17185 return RecurKind::Add;
17186 if (match(I, m_Mul(m_Value(), m_Value())))
17187 return RecurKind::Mul;
17188 if (match(I, m_And(m_Value(), m_Value())) ||
17189 match(I, m_LogicalAnd(m_Value(), m_Value())))
17190 return RecurKind::And;
17191 if (match(I, m_Or(m_Value(), m_Value())) ||
17192 match(I, m_LogicalOr(m_Value(), m_Value())))
17193 return RecurKind::Or;
17194 if (match(I, m_Xor(m_Value(), m_Value())))
17195 return RecurKind::Xor;
17196 if (match(I, m_FAdd(m_Value(), m_Value())))
17197 return RecurKind::FAdd;
17198 if (match(I, m_FMul(m_Value(), m_Value())))
17199 return RecurKind::FMul;
17200
17201 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
17202 return RecurKind::FMax;
17203 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
17204 return RecurKind::FMin;
17205
17206 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
17207 return RecurKind::FMaximum;
17208 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
17209 return RecurKind::FMinimum;
17210 // This matches either cmp+select or intrinsics. SLP is expected to handle
17211 // either form.
17212 // TODO: If we are canonicalizing to intrinsics, we can remove several
17213 // special-case paths that deal with selects.
17214 if (match(I, m_SMax(m_Value(), m_Value())))
17215 return RecurKind::SMax;
17216 if (match(I, m_SMin(m_Value(), m_Value())))
17217 return RecurKind::SMin;
17218 if (match(I, m_UMax(m_Value(), m_Value())))
17219 return RecurKind::UMax;
17220 if (match(I, m_UMin(m_Value(), m_Value())))
17221 return RecurKind::UMin;
17222
17223 if (auto *Select = dyn_cast<SelectInst>(I)) {
17224 // Try harder: look for min/max pattern based on instructions producing
17225 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
17226 // During the intermediate stages of SLP, it's very common to have
17227 // pattern like this (since optimizeGatherSequence is run only once
17228 // at the end):
17229 // %1 = extractelement <2 x i32> %a, i32 0
17230 // %2 = extractelement <2 x i32> %a, i32 1
17231 // %cond = icmp sgt i32 %1, %2
17232 // %3 = extractelement <2 x i32> %a, i32 0
17233 // %4 = extractelement <2 x i32> %a, i32 1
17234 // %select = select i1 %cond, i32 %3, i32 %4
17235 CmpInst::Predicate Pred;
17236 Instruction *L1;
17237 Instruction *L2;
17238
17239 Value *LHS = Select->getTrueValue();
17240 Value *RHS = Select->getFalseValue();
17241 Value *Cond = Select->getCondition();
17242
17243 // TODO: Support inverse predicates.
17244 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
17245 if (!isa<ExtractElementInst>(RHS) ||
17246 !L2->isIdenticalTo(cast<Instruction>(RHS)))
17247 return RecurKind::None;
17248 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
17249 if (!isa<ExtractElementInst>(LHS) ||
17250 !L1->isIdenticalTo(cast<Instruction>(LHS)))
17251 return RecurKind::None;
17252 } else {
17253 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
17254 return RecurKind::None;
17255 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
17256 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
17257 !L2->isIdenticalTo(cast<Instruction>(RHS)))
17258 return RecurKind::None;
17259 }
17260
17261 switch (Pred) {
17262 default:
17263 return RecurKind::None;
17264 case CmpInst::ICMP_SGT:
17265 case CmpInst::ICMP_SGE:
17266 return RecurKind::SMax;
17267 case CmpInst::ICMP_SLT:
17268 case CmpInst::ICMP_SLE:
17269 return RecurKind::SMin;
17270 case CmpInst::ICMP_UGT:
17271 case CmpInst::ICMP_UGE:
17272 return RecurKind::UMax;
17273 case CmpInst::ICMP_ULT:
17274 case CmpInst::ICMP_ULE:
17275 return RecurKind::UMin;
17276 }
17277 }
17278 return RecurKind::None;
17279 }
17280
17281 /// Get the index of the first operand.
17282 static unsigned getFirstOperandIndex(Instruction *I) {
17283 return isCmpSelMinMax(I) ? 1 : 0;
17284 }
17285
17286private:
17287 /// Total number of operands in the reduction operation.
17288 static unsigned getNumberOfOperands(Instruction *I) {
17289 return isCmpSelMinMax(I) ? 3 : 2;
17290 }
17291
17292 /// Checks if the instruction is in basic block \p BB.
17293 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
17294 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
17295 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
17296 auto *Sel = cast<SelectInst>(I);
17297 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
17298 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
17299 }
17300 return I->getParent() == BB;
17301 }
17302
17303 /// Expected number of uses for reduction operations/reduced values.
17304 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
17305 if (IsCmpSelMinMax) {
17306 // SelectInst must be used twice while the condition op must have a single
17307 // use only.
17308 if (auto *Sel = dyn_cast<SelectInst>(I))
17309 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
17310 return I->hasNUses(2);
17311 }
17312
17313 // Arithmetic reduction operation must be used once only.
17314 return I->hasOneUse();
17315 }
17316
17317 /// Initializes the list of reduction operations.
17318 void initReductionOps(Instruction *I) {
17319 if (isCmpSelMinMax(I))
17320 ReductionOps.assign(2, ReductionOpsType());
17321 else
17322 ReductionOps.assign(1, ReductionOpsType());
17323 }
17324
17325 /// Add all reduction operations for the reduction instruction \p I.
17326 void addReductionOps(Instruction *I) {
17327 if (isCmpSelMinMax(I)) {
17328 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
17329 ReductionOps[1].emplace_back(I);
17330 } else {
17331 ReductionOps[0].emplace_back(I);
17332 }
17333 }
17334
17335 static bool isGoodForReduction(ArrayRef<Value *> Data) {
17336 int Sz = Data.size();
17337 auto *I = dyn_cast<Instruction>(Data.front());
17338 return Sz > 1 || isConstant(Data.front()) ||
17339 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
17340 }
17341
17342public:
17343 HorizontalReduction() = default;
17344
17345 /// Try to find a reduction tree.
17346 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
17347 ScalarEvolution &SE, const DataLayout &DL,
17348 const TargetLibraryInfo &TLI) {
17349 RdxKind = HorizontalReduction::getRdxKind(Root);
17350 if (!isVectorizable(RdxKind, Root))
17351 return false;
17352
17353 // Analyze "regular" integer/FP types for reductions - no target-specific
17354 // types or pointers.
17355 Type *Ty = Root->getType();
17356 if (!isValidElementType(Ty) || Ty->isPointerTy())
17357 return false;
17358
17359 // Though the ultimate reduction may have multiple uses, its condition must
17360 // have only a single use.
17361 if (auto *Sel = dyn_cast<SelectInst>(Root))
17362 if (!Sel->getCondition()->hasOneUse())
17363 return false;
17364
17365 ReductionRoot = Root;
17366
17367 // Iterate through all the operands of the possible reduction tree and
17368 // gather all the reduced values, sorting them by their value id.
17369 BasicBlock *BB = Root->getParent();
17370 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
17371 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
17372 1, std::make_pair(Root, 0));
17373 // Checks if the operands of the \p TreeN instruction are also reduction
17374 // operations or should be treated as reduced values or an extra argument,
17375 // which is not part of the reduction.
17376 auto CheckOperands = [&](Instruction *TreeN,
17377 SmallVectorImpl<Value *> &PossibleReducedVals,
17378 SmallVectorImpl<Instruction *> &ReductionOps,
17379 unsigned Level) {
17380 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
17381 getNumberOfOperands(TreeN)))) {
17382 Value *EdgeVal = getRdxOperand(TreeN, I);
17383 ReducedValsToOps[EdgeVal].push_back(TreeN);
17384 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
17385 // If the edge is not an instruction, or it differs from the main
17386 // reduction opcode or has too many uses, treat it as a possible reduced
17387 // value. Also, do not try to reduce constant values if the operation is
17388 // not foldable.
17389 if (!EdgeInst || Level > RecursionMaxDepth ||
17390 getRdxKind(EdgeInst) != RdxKind ||
17391 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
17392 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
17393 !isVectorizable(RdxKind, EdgeInst) ||
17394 (R.isAnalyzedReductionRoot(EdgeInst) &&
17395 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
17396 PossibleReducedVals.push_back(EdgeVal);
17397 continue;
17398 }
17399 ReductionOps.push_back(EdgeInst);
17400 }
17401 };
17402 // Try to regroup the reduced values so that it becomes more profitable to
17403 // reduce them. Values are grouped by their value ids, instructions by
17404 // instruction op id and/or alternate op id, plus extra analysis is done for
17405 // loads (grouping them by the distance between pointers) and cmp
17406 // instructions (grouping them by the predicate).
17407 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17408 PossibleReducedVals;
17409 initReductionOps(Root);
17410 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17411 SmallSet<size_t, 2> LoadKeyUsed;
17412
17413 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17414 Key = hash_combine(hash_value(LI->getParent()), Key);
17415 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
17416 if (LoadKeyUsed.contains(Key)) {
17417 auto LIt = LoadsMap.find(Ptr);
17418 if (LIt != LoadsMap.end()) {
17419 for (LoadInst *RLI : LIt->second) {
17420 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
17421 LI->getType(), LI->getPointerOperand(), DL, SE,
17422 /*StrictCheck=*/true))
17423 return hash_value(RLI->getPointerOperand());
17424 }
17425 for (LoadInst *RLI : LIt->second) {
17426 if (arePointersCompatible(RLI->getPointerOperand(),
17427 LI->getPointerOperand(), TLI)) {
17428 hash_code SubKey = hash_value(RLI->getPointerOperand());
17429 return SubKey;
17430 }
17431 }
17432 if (LIt->second.size() > 2) {
17433 hash_code SubKey =
17434 hash_value(LIt->second.back()->getPointerOperand());
17435 return SubKey;
17436 }
17437 }
17438 }
17439 LoadKeyUsed.insert(Key);
17440 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
17441 return hash_value(LI->getPointerOperand());
17442 };
17443
17444 while (!Worklist.empty()) {
17445 auto [TreeN, Level] = Worklist.pop_back_val();
17446 SmallVector<Value *> PossibleRedVals;
17447 SmallVector<Instruction *> PossibleReductionOps;
17448 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
17449 addReductionOps(TreeN);
17450 // Add reduction values. The values are sorted for better vectorization
17451 // results.
17452 for (Value *V : PossibleRedVals) {
17453 size_t Key, Idx;
17454 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
17455 /*AllowAlternate=*/false);
17456 ++PossibleReducedVals[Key][Idx]
17457 .insert(std::make_pair(V, 0))
17458 .first->second;
17459 }
17460 for (Instruction *I : reverse(PossibleReductionOps))
17461 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
17462 }
17463 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
17464 // Sort values by the total number of value kinds so that the reduction
17465 // starts from the longest possible sequences of reduced values.
17466 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17467 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17468 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17469 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17470 It != E; ++It) {
17471 PossibleRedValsVect.emplace_back();
17472 auto RedValsVect = It->second.takeVector();
17473 stable_sort(RedValsVect, llvm::less_second());
17474 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17475 PossibleRedValsVect.back().append(Data.second, Data.first);
17476 }
17477 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
17478 return P1.size() > P2.size();
17479 });
17480 int NewIdx = -1;
17481 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17482 if (NewIdx < 0 ||
17483 (!isGoodForReduction(Data) &&
17484 (!isa<LoadInst>(Data.front()) ||
17485 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
17486 getUnderlyingObject(
17487 cast<LoadInst>(Data.front())->getPointerOperand()) !=
17488 getUnderlyingObject(
17489 cast<LoadInst>(ReducedVals[NewIdx].front())
17490 ->getPointerOperand())))) {
17491 NewIdx = ReducedVals.size();
17492 ReducedVals.emplace_back();
17493 }
17494 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
17495 }
17496 }
17497 // Sort the reduced value groups by the number of values with the same or
17498 // alternate opcode and/or pointer operand.
17499 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17500 return P1.size() > P2.size();
17501 });
17502 return true;
17503 }
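// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): the regrouping performed by matchAssociativeReduction
// above. Reduced values are bucketed by a key (value kind, opcode, pointer
// base, ...) and the buckets are then visited from largest to smallest, so the
// reduction is attempted first on the longest run of compatible values. The
// key type below stands in for the (key, subkey) pair from generateKeySubkey().
#include <algorithm>
#include <cstddef>
#include <map>
#include <utility>
#include <vector>

template <typename ValueT>
static std::vector<std::vector<ValueT>>
groupBySizeDescending(const std::vector<std::pair<std::size_t, ValueT>> &Keyed) {
  std::map<std::size_t, std::vector<ValueT>> Buckets;
  for (const auto &KV : Keyed)
    Buckets[KV.first].push_back(KV.second);
  std::vector<std::vector<ValueT>> Groups;
  for (auto &B : Buckets)
    Groups.push_back(std::move(B.second));
  std::stable_sort(
      Groups.begin(), Groups.end(),
      [](const auto &A, const auto &B) { return A.size() > B.size(); });
  return Groups;
}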
17504
17505 /// Attempt to vectorize the tree found by matchAssociativeReduction.
17506 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17507 const TargetLibraryInfo &TLI) {
17508 constexpr int ReductionLimit = 4;
17509 constexpr unsigned RegMaxNumber = 4;
17510 constexpr unsigned RedValsMaxNumber = 128;
17511 // If there are a sufficient number of reduction values, reduce
17512 // to a nearby power-of-2. We can safely generate oversized
17513 // vectors and rely on the backend to split them to legal sizes.
17514 unsigned NumReducedVals =
17515 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
17516 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17517 if (!isGoodForReduction(Vals))
17518 return Num;
17519 return Num + Vals.size();
17520 });
17521 if (NumReducedVals < ReductionLimit &&
17522 (!AllowHorRdxIdenityOptimization ||
17523 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
17524 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
17525 }))) {
17526 for (ReductionOpsType &RdxOps : ReductionOps)
17527 for (Value *RdxOp : RdxOps)
17528 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17529 return nullptr;
17530 }
17531
17532 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17533 TargetFolder(DL));
17534 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
17535
17536 // Track the reduced values in case they are replaced by extractelement
17537 // because of the vectorization.
17538 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
17539 ReducedVals.front().size());
17540 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17541
17542 // The compare instruction of a min/max is the insertion point for new
17543 // instructions and may be replaced with a new compare instruction.
17544 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17545 assert(isa<SelectInst>(RdxRootInst) &&
17546 "Expected min/max reduction to have select root instruction");
17547 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
17548 assert(isa<Instruction>(ScalarCond) &&
17549 "Expected min/max reduction to have compare condition");
17550 return cast<Instruction>(ScalarCond);
17551 };
17552
17553 // Return new VectorizedTree, based on previous value.
17554 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17555 if (VectorizedTree) {
17556 // Update the final value in the reduction.
17557 Builder.SetCurrentDebugLocation(
17558 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
17559 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
17560 (isGuaranteedNotToBePoison(Res) &&
17561 !isGuaranteedNotToBePoison(VectorizedTree))) {
17562 auto It = ReducedValsToOps.find(Res);
17563 if (It != ReducedValsToOps.end() &&
17564 any_of(It->getSecond(),
17565 [](Instruction *I) { return isBoolLogicOp(I); }))
17566 std::swap(VectorizedTree, Res);
17567 }
17568
17569 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
17570 ReductionOps);
17571 }
17572 // Initialize the final value in the reduction.
17573 return Res;
17574 };
17575 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
17576 return isBoolLogicOp(cast<Instruction>(V));
17577 });
17578 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17579 ReductionOps.front().size());
17580 for (ReductionOpsType &RdxOps : ReductionOps)
17581 for (Value *RdxOp : RdxOps) {
17582 if (!RdxOp)
17583 continue;
17584 IgnoreList.insert(RdxOp);
17585 }
17586 // Intersect the fast-math-flags from all reduction operations.
17587 FastMathFlags RdxFMF;
17588 RdxFMF.set();
17589 for (Value *U : IgnoreList)
17590 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17591 RdxFMF &= FPMO->getFastMathFlags();
17592 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17593
17594 // Need to track reduced vals, they may be changed during vectorization of
17595 // subvectors.
17596 for (ArrayRef<Value *> Candidates : ReducedVals)
17597 for (Value *V : Candidates)
17598 TrackedVals.try_emplace(V, V);
17599
17600 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
17601 // List of the values that were reduced in other trees as part of gather
17602 // nodes and thus requiring extract if fully vectorized in other trees.
17603 SmallPtrSet<Value *, 4> RequiredExtract;
17604 Value *VectorizedTree = nullptr;
17605 bool CheckForReusedReductionOps = false;
17606 // Try to vectorize elements based on their type.
17607 SmallVector<InstructionsState> States;
17608 for (ArrayRef<Value *> RV : ReducedVals)
17609 States.push_back(getSameOpcode(RV, TLI));
17610 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17611 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17612 InstructionsState S = States[I];
17613 SmallVector<Value *> Candidates;
17614 Candidates.reserve(2 * OrigReducedVals.size());
17615 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17616 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17617 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17618 // Check whether the reduction value was overridden by the extractelement
17619 // instruction because of the vectorization, and exclude it if it is not
17620 // compatible with other values.
17621 // Also check if the instruction was folded to a constant/other value.
17622 auto *Inst = dyn_cast<Instruction>(RdxVal);
17623 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
17624 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17625 (S.getOpcode() && !Inst))
17626 continue;
17627 Candidates.push_back(RdxVal);
17628 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17629 }
17630 bool ShuffledExtracts = false;
17631 // Try to handle shuffled extractelements.
17632 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17633 I + 1 < E) {
17634 SmallVector<Value *> CommonCandidates(Candidates);
17635 for (Value *RV : ReducedVals[I + 1]) {
17636 Value *RdxVal = TrackedVals.find(RV)->second;
17637 // Check whether the reduction value was overridden by the
17638 // extractelement instruction because of the vectorization, and
17639 // exclude it if it is not compatible with other values.
17640 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
17641 if (!Inst)
17642 continue;
17643 CommonCandidates.push_back(RdxVal);
17644 TrackedToOrig.try_emplace(RdxVal, RV);
17645 }
17646 SmallVector<int> Mask;
17647 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17648 ++I;
17649 Candidates.swap(CommonCandidates);
17650 ShuffledExtracts = true;
17651 }
17652 }
17653
17654 // Emit code for constant values.
17655 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17656 allConstant(Candidates)) {
17657 Value *Res = Candidates.front();
17658 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17659 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17660 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17661 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17662 if (auto *ResI = dyn_cast<Instruction>(Res))
17663 V.analyzedReductionRoot(ResI);
17664 }
17665 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17666 continue;
17667 }
17668
17669 unsigned NumReducedVals = Candidates.size();
17670 if (NumReducedVals < ReductionLimit &&
17671 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17672 !isSplat(Candidates)))
17673 continue;
17674
17675 // Check if we support repeated scalar values processing (optimization of
17676 // original scalar identity operations on matched horizontal reductions).
17677 IsSupportedHorRdxIdentityOp =
17678 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17679 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17680 // Gather same values.
17681 MapVector<Value *, unsigned> SameValuesCounter;
17682 if (IsSupportedHorRdxIdentityOp)
17683 for (Value *V : Candidates)
17684 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17685 // Used to check if the reduced values are used the same number of times. In
17686 // this case the compiler may produce better code. E.g. if reduced values are
17687 // aabbccdd (8 x values), then the first node of the tree will have a node
17688 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17689 // Plus, the final reduction will be performed on <8 x aabbccdd>.
17690 // Instead, the compiler may build a <4 x abcd> tree immediately, plus a
17691 // reduction (4 x abcd) * 2.
17692 // Currently it only handles add/fadd/xor. and/or/min/max do not require
17693 // this analysis, other operations may require an extra estimation of
17694 // the profitability.
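// Editor's note (illustrative, not from the upstream source): e.g. for reduced
// values {3, 3, 5, 5, 7, 7, 9, 9} with an add reduction every value occurs
// twice, so the result can be computed as (3 + 5 + 7 + 9) * 2 from a 4-wide
// tree instead of an 8-wide one.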
17695 bool SameScaleFactor = false;
17696 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17697 SameValuesCounter.size() != Candidates.size();
17698 if (OptReusedScalars) {
17699 SameScaleFactor =
17700 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17701 RdxKind == RecurKind::Xor) &&
17702 all_of(drop_begin(SameValuesCounter),
17703 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17704 return P.second == SameValuesCounter.front().second;
17705 });
17706 Candidates.resize(SameValuesCounter.size());
17707 transform(SameValuesCounter, Candidates.begin(),
17708 [](const auto &P) { return P.first; });
17709 NumReducedVals = Candidates.size();
17710 // Have a reduction of the same element.
17711 if (NumReducedVals == 1) {
17712 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17713 unsigned Cnt = SameValuesCounter.lookup(OrigV);
17714 Value *RedVal =
17715 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17716 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17717 VectorizedVals.try_emplace(OrigV, Cnt);
17718 continue;
17719 }
17720 }
17721
17722 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17723 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17724 unsigned MaxElts =
17725 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17726
17727 unsigned ReduxWidth = std::min<unsigned>(
17728 llvm::bit_floor(NumReducedVals),
17729 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17730 RegMaxNumber * RedValsMaxNumber));
17731 unsigned Start = 0;
17732 unsigned Pos = Start;
17733 // Restarts vectorization attempt with lower vector factor.
17734 unsigned PrevReduxWidth = ReduxWidth;
17735 bool CheckForReusedReductionOpsLocal = false;
17736 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17737 &CheckForReusedReductionOpsLocal,
17738 &PrevReduxWidth, &V,
17739 &IgnoreList](bool IgnoreVL = false) {
17740 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17741 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17742 // Check if any of the reduction ops are gathered. If so, it is worth
17743 // trying again with a smaller number of reduction ops.
17744 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17745 }
17746 ++Pos;
17747 if (Pos < NumReducedVals - ReduxWidth + 1)
17748 return IsAnyRedOpGathered;
17749 Pos = Start;
17750 ReduxWidth /= 2;
17751 return IsAnyRedOpGathered;
17752 };
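// Illustrative sketch (editor's addition, not part of the upstream
// SLPVectorizer.cpp): the Pos/ReduxWidth descent driven by AdjustReducedVals
// above - failed attempts first slide the window over the candidates and, once
// every position has been tried, restart at the front with half the width.
// The helper name is hypothetical.
#include <utility>
#include <vector>

static std::vector<std::pair<unsigned, unsigned>>
enumerateAttempts(unsigned NumReducedVals, unsigned StartWidth,
                  unsigned ReductionLimit) {
  std::vector<std::pair<unsigned, unsigned>> Attempts; // (Pos, ReduxWidth)
  unsigned Pos = 0, Width = StartWidth;
  while (Width >= ReductionLimit && Pos + Width <= NumReducedVals) {
    Attempts.emplace_back(Pos, Width);
    ++Pos;
    if (Pos + Width > NumReducedVals) {
      Pos = 0;
      Width /= 2;
    }
  }
  return Attempts; // e.g. enumerateAttempts(6, 4, 4) == {{0,4}, {1,4}, {2,4}}
}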
17753 bool AnyVectorized = false;
17754 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17755 ReduxWidth >= ReductionLimit) {
17756 // Dependency in tree of the reduction ops - drop this attempt, try
17757 // later.
17758 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17759 Start == 0) {
17760 CheckForReusedReductionOps = true;
17761 break;
17762 }
17763 PrevReduxWidth = ReduxWidth;
17764 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17765 // Being analyzed already - skip.
17766 if (V.areAnalyzedReductionVals(VL)) {
17767 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17768 continue;
17769 }
17770 // Early exit if any of the reduction values were deleted during
17771 // previous vectorization attempts.
17772 if (any_of(VL, [&V](Value *RedVal) {
17773 auto *RedValI = dyn_cast<Instruction>(RedVal);
17774 if (!RedValI)
17775 return false;
17776 return V.isDeleted(RedValI);
17777 }))
17778 break;
17779 V.buildTree(VL, IgnoreList);
17780 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17781 if (!AdjustReducedVals())
17782 V.analyzedReductionVals(VL);
17783 continue;
17784 }
17785 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17786 if (!AdjustReducedVals())
17787 V.analyzedReductionVals(VL);
17788 continue;
17789 }
17790 V.reorderTopToBottom();
17791 // No need to reorder the root node at all.
17792 V.reorderBottomToTop(/*IgnoreReorder=*/true);
17793 // Keep extracted other reduction values, if they are used in the
17794 // vectorization trees.
17795 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
17796 // The reduction root is used as the insertion point for new
17797 // instructions, so set it as externally used to prevent it from being
17798 // deleted.
17799 LocalExternallyUsedValues[ReductionRoot];
17800 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17801 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17802 continue;
17803 for (Value *V : ReducedVals[Cnt])
17804 if (isa<Instruction>(V))
17805 LocalExternallyUsedValues[TrackedVals[V]];
17806 }
17807 if (!IsSupportedHorRdxIdentityOp) {
17808 // Number of uses of the candidates in the vector of values.
17809 assert(SameValuesCounter.empty() &&
17810 "Reused values counter map is not empty");
17811 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17812 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17813 continue;
17814 Value *V = Candidates[Cnt];
17815 Value *OrigV = TrackedToOrig.find(V)->second;
17816 ++SameValuesCounter[OrigV];
17817 }
17818 }
17819 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17820 // Gather externally used values.
17821 SmallPtrSet<Value *, 4> Visited;
17822 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17823 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17824 continue;
17825 Value *RdxVal = Candidates[Cnt];
17826 if (!Visited.insert(RdxVal).second)
17827 continue;
17828 // Check if the scalar was vectorized as part of the vectorization
17829 // tree but not the top node.
17830 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17831 LocalExternallyUsedValues[RdxVal];
17832 continue;
17833 }
17834 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17835 unsigned NumOps =
17836 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17837 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17838 LocalExternallyUsedValues[RdxVal];
17839 }
17840 // Do not need the list of reused scalars in regular mode anymore.
17841 if (!IsSupportedHorRdxIdentityOp)
17842 SameValuesCounter.clear();
17843 for (Value *RdxVal : VL)
17844 if (RequiredExtract.contains(RdxVal))
17845 LocalExternallyUsedValues[RdxVal];
17846 V.transformNodes();
17847 V.buildExternalUses(LocalExternallyUsedValues);
17848
17849 V.computeMinimumValueSizes();
17850
17851 // Estimate cost.
17852 InstructionCost TreeCost = V.getTreeCost(VL);
17853 InstructionCost ReductionCost =
17854 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17855 InstructionCost Cost = TreeCost + ReductionCost;
17856 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17857 << " for reduction\n");
17858 if (!Cost.isValid())
17859 break;
17860 if (Cost >= -SLPCostThreshold) {
17861 V.getORE()->emit([&]() {
17862 return OptimizationRemarkMissed(
17863 SV_NAME, "HorSLPNotBeneficial",
17864 ReducedValsToOps.find(VL[0])->second.front())
17865 << "Vectorizing horizontal reduction is possible "
17866 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17867 << " and threshold "
17868 << ore::NV("Threshold", -SLPCostThreshold);
17869 });
17870 if (!AdjustReducedVals())
17871 V.analyzedReductionVals(VL);
17872 continue;
17873 }
17874
17875 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17876 << Cost << ". (HorRdx)\n");
17877 V.getORE()->emit([&]() {
17878 return OptimizationRemark(
17879 SV_NAME, "VectorizedHorizontalReduction",
17880 ReducedValsToOps.find(VL[0])->second.front())
17881 << "Vectorized horizontal reduction with cost "
17882 << ore::NV("Cost", Cost) << " and with tree size "
17883 << ore::NV("TreeSize", V.getTreeSize());
17884 });
17885
17886 Builder.setFastMathFlags(RdxFMF);
17887
17888 // Emit a reduction. If the root is a select (min/max idiom), the insert
17889 // point is the compare condition of that select.
17890 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17891 Instruction *InsertPt = RdxRootInst;
17892 if (IsCmpSelMinMax)
17893 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17894
17895 // Vectorize a tree.
17896 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17897 ReplacedExternals, InsertPt);
17898
17899 Builder.SetInsertPoint(InsertPt);
17900
17901 // To prevent poison from leaking across what used to be sequential,
17902 // safe, scalar boolean logic operations, the reduction operand must be
17903 // frozen.
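// Illustrative example (not from a specific test): a scalar chain
// 'select i1 %a, i1 %b, i1 false' never propagates poison from %b when %a
// is false, but the vectorized reduction combines all lanes
// unconditionally, so a possibly-poison operand has to be frozen first.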
17904 if ((isBoolLogicOp(RdxRootInst) ||
17905 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17906 !isGuaranteedNotToBePoison(VectorizedRoot))
17907 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17908
17909 // Emit code to correctly handle reused reduced values, if required.
17910 if (OptReusedScalars && !SameScaleFactor) {
17911 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
17912 SameValuesCounter, TrackedToOrig);
17913 }
17914
17915 Value *ReducedSubTree;
17916 Type *ScalarTy = VL.front()->getType();
17917 if (isa<FixedVectorType>(ScalarTy)) {
17918 assert(SLPReVec && "FixedVectorType is not expected.");
17919 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
17920 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
17921 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
17922 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
17923 // Do reduction for each lane.
17924 // e.g., do reduce add for
17925 // VL[0] = <4 x Ty> <a, b, c, d>
17926 // VL[1] = <4 x Ty> <e, f, g, h>
17927 // Lane[0] = <2 x Ty> <a, e>
17928 // Lane[1] = <2 x Ty> <b, f>
17929 // Lane[2] = <2 x Ty> <c, g>
17930 // Lane[3] = <2 x Ty> <d, h>
17931 // result[0] = reduce add Lane[0]
17932 // result[1] = reduce add Lane[1]
17933 // result[2] = reduce add Lane[2]
17934 // result[3] = reduce add Lane[3]
17935 SmallVector<int, 16> Mask =
17936 createStrideMask(I, ScalarTyNumElements, VL.size());
17937 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
17938 ReducedSubTree = Builder.CreateInsertElement(
17939 ReducedSubTree, emitReduction(Lane, Builder, ReduxWidth, TTI),
17940 I);
17941 }
17942 } else {
17943 ReducedSubTree =
17944 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17945 }
17946 if (ReducedSubTree->getType() != VL.front()->getType()) {
17947 assert(ReducedSubTree->getType() != VL.front()->getType() &&
17948 "Expected different reduction type.");
17949 ReducedSubTree =
17950 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17951 V.isSignedMinBitwidthRootNode());
17952 }
17953
17954 // Improved analysis for add/fadd/xor reductions with same scale factor
17955 // for all operands of reductions. We can emit scalar ops for them
17956 // instead.
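// For illustration: if every reduced value is used exactly twice in an add
// reduction, it is enough to emit one vector reduction and multiply its
// scalar result by 2 instead of scaling the individual lanes.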
17957 if (OptReusedScalars && SameScaleFactor)
17958 ReducedSubTree = emitScaleForReusedOps(
17959 ReducedSubTree, Builder, SameValuesCounter.front().second);
17960
17961 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17962 // Count vectorized reduced values to exclude them from final reduction.
17963 for (Value *RdxVal : VL) {
17964 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17965 if (IsSupportedHorRdxIdentityOp) {
17966 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17967 continue;
17968 }
17969 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17970 if (!V.isVectorized(RdxVal))
17971 RequiredExtract.insert(RdxVal);
17972 }
17973 Pos += ReduxWidth;
17974 Start = Pos;
17975 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17976 AnyVectorized = true;
17977 }
17978 if (OptReusedScalars && !AnyVectorized) {
17979 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17980 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17981 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17982 Value *OrigV = TrackedToOrig.find(P.first)->second;
17983 VectorizedVals.try_emplace(OrigV, P.second);
17984 }
17985 continue;
17986 }
17987 }
17988 if (VectorizedTree) {
17989 // Reorder operands of bool logical op in the natural order to avoid
17990 // possible problem with poison propagation. If not possible to reorder
17991 // (both operands are originally RHS), emit an extra freeze instruction
17992 // for the LHS operand.
17993 // I.e., if we have original code like this:
17994 // RedOp1 = select i1 ?, i1 LHS, i1 false
17995 // RedOp2 = select i1 RHS, i1 ?, i1 false
17996
17997 // Then, we swap LHS/RHS to create a new op that matches the poison
17998 // semantics of the original code.
17999
18000 // If we have original code like this and both values could be poison:
18001 // RedOp1 = select i1 ?, i1 LHS, i1 false
18002 // RedOp2 = select i1 ?, i1 RHS, i1 false
18003
18004 // Then, we must freeze LHS in the new op.
18005 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
18006 Instruction *RedOp1,
18007 Instruction *RedOp2,
18008 bool InitStep) {
18009 if (!AnyBoolLogicOp)
18010 return;
18011 if (isBoolLogicOp(RedOp1) &&
18012 ((!InitStep && LHS == VectorizedTree) ||
18013 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
18014 return;
18015 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
18016 getRdxOperand(RedOp2, 0) == RHS ||
18017 isGuaranteedNotToBePoison(RHS))) {
18018 std::swap(LHS, RHS);
18019 return;
18020 }
18021 if (LHS != VectorizedTree)
18022 LHS = Builder.CreateFreeze(LHS);
18023 };
18024 // Finish the reduction.
18025 // Need to add extra arguments and not vectorized possible reduction
18026 // values.
18027 // Try to avoid dependencies between the scalar remainders after
18028 // reductions.
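// For illustration: with remaining scalars r0, r1, r2, r3 the first pass
// emits (r0 op r1) and (r2 op r3), and the next pass combines those two
// results, producing a balanced tree instead of one serial chain.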
18029 auto FinalGen =
18030 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
18031 bool InitStep) {
18032 unsigned Sz = InstVals.size();
18033 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
18034 Sz % 2);
18035 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
18036 Instruction *RedOp = InstVals[I + 1].first;
18037 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
18038 Value *RdxVal1 = InstVals[I].second;
18039 Value *StableRdxVal1 = RdxVal1;
18040 auto It1 = TrackedVals.find(RdxVal1);
18041 if (It1 != TrackedVals.end())
18042 StableRdxVal1 = It1->second;
18043 Value *RdxVal2 = InstVals[I + 1].second;
18044 Value *StableRdxVal2 = RdxVal2;
18045 auto It2 = TrackedVals.find(RdxVal2);
18046 if (It2 != TrackedVals.end())
18047 StableRdxVal2 = It2->second;
18048 // To prevent poison from leaking across what used to be
18049 // sequential, safe, scalar boolean logic operations, the
18050 // reduction operand must be frozen.
18051 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
18052 RedOp, InitStep);
18053 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
18054 StableRdxVal2, "op.rdx", ReductionOps);
18055 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
18056 }
18057 if (Sz % 2 == 1)
18058 ExtraReds[Sz / 2] = InstVals.back();
18059 return ExtraReds;
18060 };
18061 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
18062 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
18063 VectorizedTree);
18064 SmallPtrSet<Value *, 8> Visited;
18065 for (ArrayRef<Value *> Candidates : ReducedVals) {
18066 for (Value *RdxVal : Candidates) {
18067 if (!Visited.insert(RdxVal).second)
18068 continue;
18069 unsigned NumOps = VectorizedVals.lookup(RdxVal);
18070 for (Instruction *RedOp :
18071 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
18072 .drop_back(NumOps))
18073 ExtraReductions.emplace_back(RedOp, RdxVal);
18074 }
18075 }
18076 // Iterate through all not-vectorized reduction values/extra arguments.
18077 bool InitStep = true;
18078 while (ExtraReductions.size() > 1) {
18079 SmallVector<std::pair<Instruction *, Value *>> NewReds =
18080 FinalGen(ExtraReductions, InitStep);
18081 ExtraReductions.swap(NewReds);
18082 InitStep = false;
18083 }
18084 VectorizedTree = ExtraReductions.front().second;
18085
18086 ReductionRoot->replaceAllUsesWith(VectorizedTree);
18087
18088 // The original scalar reduction is expected to have no remaining
18089 // uses outside the reduction tree itself. Assert that we got this
18090 // correct, replace internal uses with poison, and mark for eventual
18091 // deletion.
18092#ifndef NDEBUG
18093 SmallSet<Value *, 4> IgnoreSet;
18094 for (ArrayRef<Value *> RdxOps : ReductionOps)
18095 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
18096#endif
18097 for (ArrayRef<Value *> RdxOps : ReductionOps) {
18098 for (Value *Ignore : RdxOps) {
18099 if (!Ignore)
18100 continue;
18101#ifndef NDEBUG
18102 for (auto *U : Ignore->users()) {
18103 assert(IgnoreSet.count(U) &&
18104 "All users must be either in the reduction ops list.");
18105 }
18106#endif
18107 if (!Ignore->use_empty()) {
18108 Value *P = PoisonValue::get(Ignore->getType());
18109 Ignore->replaceAllUsesWith(P);
18110 }
18111 }
18112 V.removeInstructionsAndOperands(RdxOps);
18113 }
18114 } else if (!CheckForReusedReductionOps) {
18115 for (ReductionOpsType &RdxOps : ReductionOps)
18116 for (Value *RdxOp : RdxOps)
18117 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
18118 }
18119 return VectorizedTree;
18120 }
18121
18122private:
18123 /// Calculate the cost of a reduction.
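/// The result is reported as VectorCost - ScalarCost, so a negative value
/// means the vectorized reduction is expected to be cheaper than the chain
/// of scalar reduction operations it replaces (a sketch of the convention;
/// the caller compares this value against -SLPCostThreshold).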
18124 InstructionCost getReductionCost(TargetTransformInfo *TTI,
18125 ArrayRef<Value *> ReducedVals,
18126 bool IsCmpSelMinMax, unsigned ReduxWidth,
18127 FastMathFlags FMF) {
18128 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18129 Type *ScalarTy = ReducedVals.front()->getType();
18130 FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
18131 InstructionCost VectorCost = 0, ScalarCost;
18132 // If all of the reduced values are constant, the vector cost is 0, since
18133 // the reduction value can be calculated at the compile time.
18134 bool AllConsts = allConstant(ReducedVals);
18135 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
18136 InstructionCost Cost = 0;
18137 // Scalar cost is repeated for N-1 elements.
18138 int Cnt = ReducedVals.size();
18139 for (Value *RdxVal : ReducedVals) {
18140 if (Cnt == 1)
18141 break;
18142 --Cnt;
18143 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
18144 Cost += GenCostFn();
18145 continue;
18146 }
18147 InstructionCost ScalarCost = 0;
18148 for (User *U : RdxVal->users()) {
18149 auto *RdxOp = cast<Instruction>(U);
18150 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
18151 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
18152 continue;
18153 }
18154 ScalarCost = InstructionCost::getInvalid();
18155 break;
18156 }
18157 if (ScalarCost.isValid())
18158 Cost += ScalarCost;
18159 else
18160 Cost += GenCostFn();
18161 }
18162 return Cost;
18163 };
18164 switch (RdxKind) {
18165 case RecurKind::Add:
18166 case RecurKind::Mul:
18167 case RecurKind::Or:
18168 case RecurKind::And:
18169 case RecurKind::Xor:
18170 case RecurKind::FAdd:
18171 case RecurKind::FMul: {
18172 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
18173 if (!AllConsts) {
18174 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
18175 assert(SLPReVec && "FixedVectorType is not expected.");
18176 unsigned ScalarTyNumElements = VecTy->getNumElements();
18177 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
18178 VectorCost += TTI->getShuffleCost(
18179 TTI::SK_PermuteSingleSrc, VectorTy,
18180 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
18181 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
18182 CostKind);
18183 }
18184 VectorCost += TTI->getScalarizationOverhead(
18185 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
18186 /*Extract*/ false, TTI::TCK_RecipThroughput);
18187 } else {
18188 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
18189 CostKind);
18190 }
18191 }
18192 ScalarCost = EvaluateScalarCost([&]() {
18193 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
18194 });
18195 break;
18196 }
18197 case RecurKind::FMax:
18198 case RecurKind::FMin:
18199 case RecurKind::FMaximum:
18200 case RecurKind::FMinimum:
18201 case RecurKind::SMax:
18202 case RecurKind::SMin:
18203 case RecurKind::UMax:
18204 case RecurKind::UMin: {
18205 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
18206 if (!AllConsts)
18207 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
18208 ScalarCost = EvaluateScalarCost([&]() {
18209 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
18210 return TTI->getIntrinsicInstrCost(ICA, CostKind);
18211 });
18212 break;
18213 }
18214 default:
18215 llvm_unreachable("Expected arithmetic or min/max reduction operation");
18216 }
18217
18218 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
18219 << " for reduction of " << shortBundleName(ReducedVals)
18220 << " (It is a splitting reduction)\n");
18221 return VectorCost - ScalarCost;
18222 }
18223
18224 /// Emit a horizontal reduction of the vectorized value.
18225 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
18226 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
18227 assert(VectorizedValue && "Need to have a vectorized tree node");
18228 assert(isPowerOf2_32(ReduxWidth) &&
18229 "We only handle power-of-two reductions for now");
18230 assert(RdxKind != RecurKind::FMulAdd &&
18231 "A call to the llvm.fmuladd intrinsic is not handled yet");
18232
18233 ++NumVectorInstructions;
18234 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
18235 }
18236
18237 /// Emits optimized code for unique scalar value reused \p Cnt times.
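/// For illustration: a value repeated 3 times in an add reduction is folded
/// to 'mul vv, 3', while an even repeat count in a xor reduction folds to 0
/// because x ^ x == 0 (see the per-kind cases below).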
18238 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
18239 unsigned Cnt) {
18240 assert(IsSupportedHorRdxIdentityOp &&
18241 "The optimization of matched scalar identity horizontal reductions "
18242 "must be supported.");
18243 if (Cnt == 1)
18244 return VectorizedValue;
18245 switch (RdxKind) {
18246 case RecurKind::Add: {
18247 // res = mul vv, n
18248 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
18249 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
18250 << VectorizedValue << ". (HorRdx)\n");
18251 return Builder.CreateMul(VectorizedValue, Scale);
18252 }
18253 case RecurKind::Xor: {
18254 // res = n % 2 ? 0 : vv
18255 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
18256 << ". (HorRdx)\n");
18257 if (Cnt % 2 == 0)
18258 return Constant::getNullValue(VectorizedValue->getType());
18259 return VectorizedValue;
18260 }
18261 case RecurKind::FAdd: {
18262 // res = fmul v, n
18263 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
18264 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
18265 << VectorizedValue << ". (HorRdx)\n");
18266 return Builder.CreateFMul(VectorizedValue, Scale);
18267 }
18268 case RecurKind::And:
18269 case RecurKind::Or:
18270 case RecurKind::SMax:
18271 case RecurKind::SMin:
18272 case RecurKind::UMax:
18273 case RecurKind::UMin:
18274 case RecurKind::FMax:
18275 case RecurKind::FMin:
18276 case RecurKind::FMaximum:
18277 case RecurKind::FMinimum:
18278 // res = vv
18279 return VectorizedValue;
18280 case RecurKind::Mul:
18281 case RecurKind::FMul:
18282 case RecurKind::FMulAdd:
18283 case RecurKind::IAnyOf:
18284 case RecurKind::FAnyOf:
18285 case RecurKind::None:
18286 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
18287 }
18288 return nullptr;
18289 }
18290
18291 /// Emits actual operation for the scalar identity values, found during
18292 /// horizontal reduction analysis.
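/// For illustration: for an add reduction over root scalars <a, b> where a
/// is reused 3 times and b twice, the vectorized value is multiplied by the
/// constant vector <3, 2> before the final reduction is emitted.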
18293 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
18294 BoUpSLP &R,
18295 const MapVector<Value *, unsigned> &SameValuesCounter,
18296 const DenseMap<Value *, Value *> &TrackedToOrig) {
18297 assert(IsSupportedHorRdxIdentityOp &&
18298 "The optimization of matched scalar identity horizontal reductions "
18299 "must be supported.");
18300 ArrayRef<Value *> VL = R.getRootNodeScalars();
18301 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
18302 if (VTy->getElementType() != VL.front()->getType()) {
18303 VectorizedValue = Builder.CreateIntCast(
18304 VectorizedValue,
18305 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
18306 R.isSignedMinBitwidthRootNode());
18307 }
18308 switch (RdxKind) {
18309 case RecurKind::Add: {
18310 // root = mul prev_root, <1, 1, n, 1>
18311 SmallVector<Constant *> Vals;
18312 for (Value *V : VL) {
18313 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18314 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
18315 }
18316 auto *Scale = ConstantVector::get(Vals);
18317 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
18318 << VectorizedValue << ". (HorRdx)\n");
18319 return Builder.CreateMul(VectorizedValue, Scale);
18320 }
18321 case RecurKind::And:
18322 case RecurKind::Or:
18323 // No need for multiple or/and(s).
18324 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
18325 << ". (HorRdx)\n");
18326 return VectorizedValue;
18327 case RecurKind::SMax:
18328 case RecurKind::SMin:
18329 case RecurKind::UMax:
18330 case RecurKind::UMin:
18331 case RecurKind::FMax:
18332 case RecurKind::FMin:
18333 case RecurKind::FMaximum:
18334 case RecurKind::FMinimum:
18335 // No need for multiple min/max(s) of the same value.
18336 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
18337 << ". (HorRdx)\n");
18338 return VectorizedValue;
18339 case RecurKind::Xor: {
18340 // Replace values with an even number of repeats with 0, since
18341 // x xor x = 0.
18342 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
18343 // 7>, if the 4th and 6th elements have an even number of repeats.
18344 SmallVector<int> Mask(
18345 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
18346 PoisonMaskElem);
18347 std::iota(Mask.begin(), Mask.end(), 0);
18348 bool NeedShuffle = false;
18349 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
18350 Value *V = VL[I];
18351 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18352 if (Cnt % 2 == 0) {
18353 Mask[I] = VF;
18354 NeedShuffle = true;
18355 }
18356 }
18357 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
18358 : Mask) dbgs()
18359 << I << " ";
18360 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
18361 if (NeedShuffle)
18362 VectorizedValue = Builder.CreateShuffleVector(
18363 VectorizedValue,
18364 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
18365 return VectorizedValue;
18366 }
18367 case RecurKind::FAdd: {
18368 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
18369 SmallVector<Constant *> Vals;
18370 for (Value *V : VL) {
18371 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18372 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
18373 }
18374 auto *Scale = ConstantVector::get(Vals);
18375 return Builder.CreateFMul(VectorizedValue, Scale);
18376 }
18377 case RecurKind::Mul:
18378 case RecurKind::FMul:
18379 case RecurKind::FMulAdd:
18380 case RecurKind::IAnyOf:
18381 case RecurKind::FAnyOf:
18382 case RecurKind::None:
18383 llvm_unreachable("Unexpected reduction kind for reused scalars.");
18384 }
18385 return nullptr;
18386 }
18387};
18388} // end anonymous namespace
18389
18390/// Gets recurrence kind from the specified value.
18391 static RecurKind getRdxKind(Value *V) {
18392 return HorizontalReduction::getRdxKind(V);
18393}
18394static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18395 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18396 return cast<FixedVectorType>(IE->getType())->getNumElements();
18397
18398 unsigned AggregateSize = 1;
18399 auto *IV = cast<InsertValueInst>(InsertInst);
18400 Type *CurrentType = IV->getType();
18401 do {
18402 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18403 for (auto *Elt : ST->elements())
18404 if (Elt != ST->getElementType(0)) // check homogeneity
18405 return std::nullopt;
18406 AggregateSize *= ST->getNumElements();
18407 CurrentType = ST->getElementType(0);
18408 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18409 AggregateSize *= AT->getNumElements();
18410 CurrentType = AT->getElementType();
18411 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18412 AggregateSize *= VT->getNumElements();
18413 return AggregateSize;
18414 } else if (CurrentType->isSingleValueType()) {
18415 return AggregateSize;
18416 } else {
18417 return std::nullopt;
18418 }
18419 } while (true);
18420}
18421
18422static void findBuildAggregate_rec(Instruction *LastInsertInst,
18423 TargetTransformInfo *TTI,
18424 SmallVectorImpl<Value *> &BuildVectorOpds,
18425 SmallVectorImpl<Value *> &InsertElts,
18426 unsigned OperandOffset) {
18427 do {
18428 Value *InsertedOperand = LastInsertInst->getOperand(1);
18429 std::optional<unsigned> OperandIndex =
18430 getElementIndex(LastInsertInst, OperandOffset);
18431 if (!OperandIndex)
18432 return;
18433 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18434 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
18435 BuildVectorOpds, InsertElts, *OperandIndex);
18436
18437 } else {
18438 BuildVectorOpds[*OperandIndex] = InsertedOperand;
18439 InsertElts[*OperandIndex] = LastInsertInst;
18440 }
18441 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18442 } while (LastInsertInst != nullptr &&
18443 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18444 LastInsertInst->hasOneUse());
18445}
18446
18447/// Recognize construction of vectors like
18448/// %ra = insertelement <4 x float> poison, float %s0, i32 0
18449/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
18450/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
18451/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
18452/// starting from the last insertelement or insertvalue instruction.
18453///
18454/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18455/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18456/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18457///
18458/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18459///
18460/// \return true if it matches.
18461static bool findBuildAggregate(Instruction *LastInsertInst,
18462 TargetTransformInfo *TTI,
18463 SmallVectorImpl<Value *> &BuildVectorOpds,
18464 SmallVectorImpl<Value *> &InsertElts) {
18465
18466 assert((isa<InsertElementInst>(LastInsertInst) ||
18467 isa<InsertValueInst>(LastInsertInst)) &&
18468 "Expected insertelement or insertvalue instruction!");
18469
18470 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18471 "Expected empty result vectors!");
18472
18473 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
18474 if (!AggregateSize)
18475 return false;
18476 BuildVectorOpds.resize(*AggregateSize);
18477 InsertElts.resize(*AggregateSize);
18478
18479 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
18480 llvm::erase(BuildVectorOpds, nullptr);
18481 llvm::erase(InsertElts, nullptr);
18482 if (BuildVectorOpds.size() >= 2)
18483 return true;
18484
18485 return false;
18486}
18487
18488/// Try and get a reduction instruction from a phi node.
18489///
18490/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18491/// if they come from either \p ParentBB or a containing loop latch.
18492///
18493/// \returns A candidate reduction value if possible, or \code nullptr \endcode
18494/// if not possible.
18495 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18496 BasicBlock *ParentBB, LoopInfo *LI) {
18497 // There are situations where the reduction value is not dominated by the
18498 // reduction phi. Vectorizing such cases has been reported to cause
18499 // miscompiles. See PR25787.
18500 auto DominatedReduxValue = [&](Value *R) {
18501 return isa<Instruction>(R) &&
18502 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18503 };
18504
18505 Instruction *Rdx = nullptr;
18506
18507 // Return the incoming value if it comes from the same BB as the phi node.
18508 if (P->getIncomingBlock(0) == ParentBB) {
18509 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18510 } else if (P->getIncomingBlock(1) == ParentBB) {
18511 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18512 }
18513
18514 if (Rdx && DominatedReduxValue(Rdx))
18515 return Rdx;
18516
18517 // Otherwise, check whether we have a loop latch to look at.
18518 Loop *BBL = LI->getLoopFor(ParentBB);
18519 if (!BBL)
18520 return nullptr;
18521 BasicBlock *BBLatch = BBL->getLoopLatch();
18522 if (!BBLatch)
18523 return nullptr;
18524
18525 // There is a loop latch, return the incoming value if it comes from
18526 // that. This reduction pattern occasionally turns up.
18527 if (P->getIncomingBlock(0) == BBLatch) {
18528 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18529 } else if (P->getIncomingBlock(1) == BBLatch) {
18530 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18531 }
18532
18533 if (Rdx && DominatedReduxValue(Rdx))
18534 return Rdx;
18535
18536 return nullptr;
18537}
18538
18539static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18540 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
18541 return true;
18542 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
18543 return true;
18544 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
18545 return true;
18546 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
18547 return true;
18548 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
18549 return true;
18550 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
18551 return true;
18552 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
18553 return true;
18554 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
18555 return true;
18556 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
18557 return true;
18558 return false;
18559}
18560
18561/// We could have an initial reduction that is not an add.
18562/// r *= v1 + v2 + v3 + v4
18563/// In such a case start looking for a tree rooted in the first '+'.
18564/// \Returns the new root if found, which may be nullptr if not an instruction.
18565 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18566 Instruction *Root) {
18567 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18568 isa<IntrinsicInst>(Root)) &&
18569 "Expected binop, select, or intrinsic for reduction matching");
18570 Value *LHS =
18571 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18572 Value *RHS =
18573 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18574 if (LHS == Phi)
18575 return dyn_cast<Instruction>(RHS);
18576 if (RHS == Phi)
18577 return dyn_cast<Instruction>(LHS);
18578 return nullptr;
18579}
18580
18581 /// \Returns the first operand of \p I that does not match \p Phi. If the
18582 /// operand is not an instruction it returns nullptr.
18583 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18584 Value *Op0 = nullptr;
18585 Value *Op1 = nullptr;
18586 if (!matchRdxBop(I, Op0, Op1))
18587 return nullptr;
18588 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18589}
18590
18591/// \Returns true if \p I is a candidate instruction for reduction vectorization.
18592 static bool isReductionCandidate(Instruction *I) {
18593 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
18594 Value *B0 = nullptr, *B1 = nullptr;
18595 bool IsBinop = matchRdxBop(I, B0, B1);
18596 return IsBinop || IsSelect;
18597}
18598
18599bool SLPVectorizerPass::vectorizeHorReduction(
18600 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
18601 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18602 if (!ShouldVectorizeHor)
18603 return false;
18604 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18605
18606 if (Root->getParent() != BB || isa<PHINode>(Root))
18607 return false;
18608
18609 // If we can find a secondary reduction root, use that instead.
18610 auto SelectRoot = [&]() {
18611 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18612 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18613 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18614 return NewRoot;
18615 return Root;
18616 };
18617
18618 // Start the analysis from the Root instruction. If a horizontal reduction is
18619 // found, try to vectorize it. If it is not a horizontal reduction, or
18620 // vectorization is not possible or not effective, and the currently analyzed
18621 // instruction is a binary operation, try to vectorize the operands using
18622 // pre-order DFS traversal order. If the operands were not vectorized, repeat
18623 // the same procedure, considering each operand as a possible root of a
18624 // horizontal reduction.
18625 // Interrupt the process if the Root instruction itself was vectorized or all
18626 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
18627 // If a horizontal reduction was not matched or vectorized, we collect the
18628 // instructions for possible later attempts at vectorization.
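// For illustration: for 'sum += (a * b) + (c * d)', if the addition chain
// is not vectorized as a reduction, the operands 'a * b' and 'c * d' are
// pushed onto the stack and later re-analyzed as potential new roots.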
18629 std::queue<std::pair<Instruction *, unsigned>> Stack;
18630 Stack.emplace(SelectRoot(), 0);
18631 SmallPtrSet<Value *, 8> VisitedInstrs;
18632 bool Res = false;
18633 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18634 if (R.isAnalyzedReductionRoot(Inst))
18635 return nullptr;
18636 if (!isReductionCandidate(Inst))
18637 return nullptr;
18638 HorizontalReduction HorRdx;
18639 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18640 return nullptr;
18641 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18642 };
18643 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18644 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18645 FutureSeed = getNonPhiOperand(Root, P);
18646 if (!FutureSeed)
18647 return false;
18648 }
18649 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18650 // analysis is done separately.
18651 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18652 PostponedInsts.push_back(FutureSeed);
18653 return true;
18654 };
18655
18656 while (!Stack.empty()) {
18657 Instruction *Inst;
18658 unsigned Level;
18659 std::tie(Inst, Level) = Stack.front();
18660 Stack.pop();
18661 // Do not try to analyze instruction that has already been vectorized.
18662 // This may happen when we vectorize instruction operands on a previous
18663 // iteration while stack was populated before that happened.
18664 if (R.isDeleted(Inst))
18665 continue;
18666 if (Value *VectorizedV = TryToReduce(Inst)) {
18667 Res = true;
18668 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18669 // Try to find another reduction.
18670 Stack.emplace(I, Level);
18671 continue;
18672 }
18673 if (R.isDeleted(Inst))
18674 continue;
18675 } else {
18676 // We could not vectorize `Inst` so try to use it as a future seed.
18677 if (!TryAppendToPostponedInsts(Inst)) {
18678 assert(Stack.empty() && "Expected empty stack");
18679 break;
18680 }
18681 }
18682
18683 // Try to vectorize operands.
18684 // Continue analysis for the instruction from the same basic block only to
18685 // save compile time.
18686 if (++Level < RecursionMaxDepth)
18687 for (auto *Op : Inst->operand_values())
18688 if (VisitedInstrs.insert(Op).second)
18689 if (auto *I = dyn_cast<Instruction>(Op))
18690 // Do not try to vectorize CmpInst operands, this is done
18691 // separately.
18692 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18693 !R.isDeleted(I) && I->getParent() == BB)
18694 Stack.emplace(I, Level);
18695 }
18696 return Res;
18697}
18698
18699bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18700 BasicBlock *BB, BoUpSLP &R,
18701 TargetTransformInfo *TTI) {
18702 SmallVector<WeakTrackingVH> PostponedInsts;
18703 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18704 Res |= tryToVectorize(PostponedInsts, R);
18705 return Res;
18706}
18707
18708bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18709 BoUpSLP &R) {
18710 bool Res = false;
18711 for (Value *V : Insts)
18712 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18713 Res |= tryToVectorize(Inst, R);
18714 return Res;
18715}
18716
18717bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18718 BasicBlock *BB, BoUpSLP &R,
18719 bool MaxVFOnly) {
18720 if (!R.canMapToVector(IVI->getType()))
18721 return false;
18722
18723 SmallVector<Value *, 16> BuildVectorOpds;
18724 SmallVector<Value *, 16> BuildVectorInsts;
18725 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18726 return false;
18727
18728 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18729 R.getORE()->emit([&]() {
18730 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18731 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18732 "trying reduction first.";
18733 });
18734 return false;
18735 }
18736 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18737 // Aggregate value is unlikely to be processed in vector register.
18738 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18739}
18740
18741bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18742 BasicBlock *BB, BoUpSLP &R,
18743 bool MaxVFOnly) {
18744 SmallVector<Value *, 16> BuildVectorInsts;
18745 SmallVector<Value *, 16> BuildVectorOpds;
18746 SmallVector<int> Mask;
18747 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18748 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18749 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18750 return false;
18751
18752 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18753 R.getORE()->emit([&]() {
18754 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18755 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18756 "trying reduction first.";
18757 });
18758 return false;
18759 }
18760 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18761 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18762}
18763
18764template <typename T>
18765 static bool tryToVectorizeSequence(
18766 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18767 function_ref<bool(T *, T *)> AreCompatible,
18768 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18769 bool MaxVFOnly, BoUpSLP &R) {
18770 bool Changed = false;
18771 // Sort by type, parent, operands.
18772 stable_sort(Incoming, Comparator);
18773
18774 // Try to vectorize elements based on their type.
18775 SmallVector<T *> Candidates;
18776 SmallVector<T *> VL;
18777 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18778 VL.clear()) {
18779 // Look for the next elements with the same type, parent and operand
18780 // kinds.
18781 auto *I = dyn_cast<Instruction>(*IncIt);
18782 if (!I || R.isDeleted(I)) {
18783 ++IncIt;
18784 continue;
18785 }
18786 auto *SameTypeIt = IncIt;
18787 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18788 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18789 AreCompatible(*SameTypeIt, *IncIt))) {
18790 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18791 ++SameTypeIt;
18792 if (I && !R.isDeleted(I))
18793 VL.push_back(cast<T>(I));
18794 }
18795
18796 // Try to vectorize them.
18797 unsigned NumElts = VL.size();
18798 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18799 << NumElts << ")\n");
18800 // The vectorization is a 3-state attempt:
18801 // 1. Try to vectorize instructions with the same/alternate opcodes with the
18802 // size of maximal register at first.
18803 // 2. Try to vectorize remaining instructions with the same type, if
18804 // possible. This may result in better vectorization results than if we
18805 // just try to vectorize instructions with the same/alternate opcodes.
18806 // 3. Final attempt to try to vectorize all instructions with the
18807 // same/alternate ops only, this may result in some extra final
18808 // vectorization.
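// For illustration: with 6 compatible instructions and a maximal register
// VF of 4, step 1 may vectorize 4 of them, while the remaining 2 are kept
// in Candidates for the later same-type attempts.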
18809 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
18810 // Success, start over because instructions might have been changed.
18811 Changed = true;
18812 VL.swap(Candidates);
18813 Candidates.clear();
18814 for (T *V : VL) {
18815 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18816 Candidates.push_back(V);
18817 }
18818 } else {
18819 /// \Returns the minimum number of elements that we will attempt to
18820 /// vectorize.
18821 auto GetMinNumElements = [&R](Value *V) {
18822 unsigned EltSize = R.getVectorElementSize(V);
18823 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18824 };
18825 if (NumElts < GetMinNumElements(*IncIt) &&
18826 (Candidates.empty() ||
18827 Candidates.front()->getType() == (*IncIt)->getType())) {
18828 for (T *V : VL) {
18829 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18830 Candidates.push_back(V);
18831 }
18832 }
18833 }
18834 // Final attempt to vectorize instructions with the same types.
18835 if (Candidates.size() > 1 &&
18836 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18837 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18838 // Success, start over because instructions might have been changed.
18839 Changed = true;
18840 } else if (MaxVFOnly) {
18841 // Try to vectorize using small vectors.
18842 SmallVector<T *> VL;
18843 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18844 VL.clear()) {
18845 auto *I = dyn_cast<Instruction>(*It);
18846 if (!I || R.isDeleted(I)) {
18847 ++It;
18848 continue;
18849 }
18850 auto *SameTypeIt = It;
18851 while (SameTypeIt != End &&
18852 (!isa<Instruction>(*SameTypeIt) ||
18853 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18854 AreCompatible(*SameTypeIt, *It))) {
18855 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18856 ++SameTypeIt;
18857 if (I && !R.isDeleted(I))
18858 VL.push_back(cast<T>(I));
18859 }
18860 unsigned NumElts = VL.size();
18861 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18862 /*MaxVFOnly=*/false))
18863 Changed = true;
18864 It = SameTypeIt;
18865 }
18866 }
18867 Candidates.clear();
18868 }
18869
18870 // Start over at the next instruction of a different type (or the end).
18871 IncIt = SameTypeIt;
18872 }
18873 return Changed;
18874}
18875
18876 /// Compare two cmp instructions. If IsCompatibility is true, the function
18877 /// returns true if the two cmps have the same/swapped predicates and
18878 /// compatible corresponding operands. If IsCompatibility is false, the
18879 /// function implements a strict weak ordering relation between two cmp
18880 /// instructions, returning true if the first instruction is "less" than the
18881 /// second, i.e. its predicate is less than the predicate of the second or
18882 /// the operand IDs are less than the operand IDs of the second cmp instruction.
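/// For illustration: 'icmp slt %a, %b' and 'icmp sgt %c, %d' map to the same
/// base predicate once one of them is swapped, so they may be treated as
/// compatible if their corresponding operands also match.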
18883template <bool IsCompatibility>
18884static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18885 const DominatorTree &DT) {
18886 assert(isValidElementType(V->getType()) &&
18887 isValidElementType(V2->getType()) &&
18888 "Expected valid element types only.");
18889 if (V == V2)
18890 return IsCompatibility;
18891 auto *CI1 = cast<CmpInst>(V);
18892 auto *CI2 = cast<CmpInst>(V2);
18893 if (CI1->getOperand(0)->getType()->getTypeID() <
18894 CI2->getOperand(0)->getType()->getTypeID())
18895 return !IsCompatibility;
18896 if (CI1->getOperand(0)->getType()->getTypeID() >
18897 CI2->getOperand(0)->getType()->getTypeID())
18898 return false;
18899 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
18900 CI2->getOperand(0)->getType()->getScalarSizeInBits())
18901 return !IsCompatibility;
18902 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
18903 CI2->getOperand(0)->getType()->getScalarSizeInBits())
18904 return false;
18905 CmpInst::Predicate Pred1 = CI1->getPredicate();
18906 CmpInst::Predicate Pred2 = CI2->getPredicate();
18907 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18908 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18909 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18910 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18911 if (BasePred1 < BasePred2)
18912 return !IsCompatibility;
18913 if (BasePred1 > BasePred2)
18914 return false;
18915 // Compare operands.
18916 bool CI1Preds = Pred1 == BasePred1;
18917 bool CI2Preds = Pred2 == BasePred1;
18918 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18919 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18920 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18921 if (Op1 == Op2)
18922 continue;
18923 if (Op1->getValueID() < Op2->getValueID())
18924 return !IsCompatibility;
18925 if (Op1->getValueID() > Op2->getValueID())
18926 return false;
18927 if (auto *I1 = dyn_cast<Instruction>(Op1))
18928 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18929 if (IsCompatibility) {
18930 if (I1->getParent() != I2->getParent())
18931 return false;
18932 } else {
18933 // Try to compare nodes with same parent.
18934 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18935 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18936 if (!NodeI1)
18937 return NodeI2 != nullptr;
18938 if (!NodeI2)
18939 return false;
18940 assert((NodeI1 == NodeI2) ==
18941 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18942 "Different nodes should have different DFS numbers");
18943 if (NodeI1 != NodeI2)
18944 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18945 }
18946 InstructionsState S = getSameOpcode({I1, I2}, TLI);
18947 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18948 continue;
18949 if (IsCompatibility)
18950 return false;
18951 if (I1->getOpcode() != I2->getOpcode())
18952 return I1->getOpcode() < I2->getOpcode();
18953 }
18954 }
18955 return IsCompatibility;
18956}
18957
18958template <typename ItT>
18959bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18960 BasicBlock *BB, BoUpSLP &R) {
18961 bool Changed = false;
18962 // Try to find reductions first.
18963 for (CmpInst *I : CmpInsts) {
18964 if (R.isDeleted(I))
18965 continue;
18966 for (Value *Op : I->operands())
18967 if (auto *RootOp = dyn_cast<Instruction>(Op))
18968 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18969 }
18970 // Try to vectorize operands as vector bundles.
18971 for (CmpInst *I : CmpInsts) {
18972 if (R.isDeleted(I))
18973 continue;
18974 Changed |= tryToVectorize(I, R);
18975 }
18976 // Try to vectorize list of compares.
18977 // Sort by type, compare predicate, etc.
18978 auto CompareSorter = [&](Value *V, Value *V2) {
18979 if (V == V2)
18980 return false;
18981 return compareCmp<false>(V, V2, *TLI, *DT);
18982 };
18983
18984 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18985 if (V1 == V2)
18986 return true;
18987 return compareCmp<true>(V1, V2, *TLI, *DT);
18988 };
18989
18990 SmallVector<Value *> Vals;
18991 for (Instruction *V : CmpInsts)
18992 if (!R.isDeleted(V) && isValidElementType(V->getType()))
18993 Vals.push_back(V);
18994 if (Vals.size() <= 1)
18995 return Changed;
18996 Changed |= tryToVectorizeSequence<Value>(
18997 Vals, CompareSorter, AreCompatibleCompares,
18998 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18999 // Exclude possible reductions from other blocks.
19000 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
19001 return any_of(V->users(), [V](User *U) {
19002 auto *Select = dyn_cast<SelectInst>(U);
19003 return Select &&
19004 Select->getParent() != cast<Instruction>(V)->getParent();
19005 });
19006 });
19007 if (ArePossiblyReducedInOtherBlock)
19008 return false;
19009 return tryToVectorizeList(Candidates, R, MaxVFOnly);
19010 },
19011 /*MaxVFOnly=*/true, R);
19012 return Changed;
19013}
19014
19015bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
19016 BasicBlock *BB, BoUpSLP &R) {
19017 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
19018 "This function only accepts Insert instructions");
19019 bool OpsChanged = false;
19020 SmallVector<WeakTrackingVH> PostponedInsts;
19021 for (auto *I : reverse(Instructions)) {
19022 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
19023 if (R.isDeleted(I) || isa<CmpInst>(I))
19024 continue;
19025 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
19026 OpsChanged |=
19027 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
19028 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
19029 OpsChanged |=
19030 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
19031 }
19032 // pass2 - try to vectorize reductions only
19033 if (R.isDeleted(I))
19034 continue;
19035 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
19036 if (R.isDeleted(I) || isa<CmpInst>(I))
19037 continue;
19038 // pass3 - try to match and vectorize a buildvector sequence.
19039 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
19040 OpsChanged |=
19041 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
19042 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
19043 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
19044 /*MaxVFOnly=*/false);
19045 }
19046 }
19047 // Now try to vectorize postponed instructions.
19048 OpsChanged |= tryToVectorize(PostponedInsts, R);
19049
19050 Instructions.clear();
19051 return OpsChanged;
19052}
19053
19054bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
19055 bool Changed = false;
19056 SmallVector<Value *, 4> Incoming;
19057 SmallPtrSet<Value *, 16> VisitedInstrs;
19058 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
19059 // node. This helps to better identify the chains that can be
19060 // vectorized.
19061 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
19062 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
19063 assert(isValidElementType(V1->getType()) &&
19064 isValidElementType(V2->getType()) &&
19065 "Expected vectorizable types only.");
19066 // It is fine to compare type IDs here, since we expect only vectorizable
19067 // types, like ints, floats and pointers; we don't care about other types.
19068 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
19069 return true;
19070 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
19071 return false;
19072 if (V1->getType()->getScalarSizeInBits() <
19073 V2->getType()->getScalarSizeInBits())
19074 return true;
19075 if (V1->getType()->getScalarSizeInBits() >
19076 V2->getType()->getScalarSizeInBits())
19077 return false;
19078 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
19079 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
19080 if (Opcodes1.size() < Opcodes2.size())
19081 return true;
19082 if (Opcodes1.size() > Opcodes2.size())
19083 return false;
19084 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
19085 {
19086 // Instructions come first.
19087 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
19088 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
19089 if (I1 && I2) {
19090 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
19091 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
19092 if (!NodeI1)
19093 return NodeI2 != nullptr;
19094 if (!NodeI2)
19095 return false;
19096 assert((NodeI1 == NodeI2) ==
19097 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19098 "Different nodes should have different DFS numbers");
19099 if (NodeI1 != NodeI2)
19100 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19101 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19102 if (S.getOpcode() && !S.isAltShuffle())
19103 continue;
19104 return I1->getOpcode() < I2->getOpcode();
19105 }
19106 if (I1)
19107 return true;
19108 if (I2)
19109 return false;
19110 }
19111 {
19112 // Non-undef constants come next.
19113 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
19114 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
19115 if (C1 && C2)
19116 continue;
19117 if (C1)
19118 return true;
19119 if (C2)
19120 return false;
19121 }
19122 bool U1 = isa<UndefValue>(Opcodes1[I]);
19123 bool U2 = isa<UndefValue>(Opcodes2[I]);
19124 {
19125 // Non-constant non-instructions come next.
19126 if (!U1 && !U2) {
19127 auto ValID1 = Opcodes1[I]->getValueID();
19128 auto ValID2 = Opcodes2[I]->getValueID();
19129 if (ValID1 == ValID2)
19130 continue;
19131 if (ValID1 < ValID2)
19132 return true;
19133 if (ValID1 > ValID2)
19134 return false;
19135 }
19136 if (!U1)
19137 return true;
19138 if (!U2)
19139 return false;
19140 }
19141 // Undefs come last.
19142 assert(U1 && U2 && "The only thing left should be undef & undef.");
19143 continue;
19144 }
19145 return false;
19146 };
19147 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
19148 if (V1 == V2)
19149 return true;
19150 if (V1->getType() != V2->getType())
19151 return false;
19152 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
19153 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
19154 if (Opcodes1.size() != Opcodes2.size())
19155 return false;
19156 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
19157 // Undefs are compatible with any other value.
19158 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
19159 continue;
19160 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
19161 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
19162 if (R.isDeleted(I1) || R.isDeleted(I2))
19163 return false;
19164 if (I1->getParent() != I2->getParent())
19165 return false;
19166 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19167 if (S.getOpcode())
19168 continue;
19169 return false;
19170 }
19171 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
19172 continue;
19173 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
19174 return false;
19175 }
19176 return true;
19177 };
19178
19179 bool HaveVectorizedPhiNodes = false;
19180 do {
19181 // Collect the incoming values from the PHIs.
19182 Incoming.clear();
19183 for (Instruction &I : *BB) {
19184 auto *P = dyn_cast<PHINode>(&I);
19185 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
19186 break;
19187
19188 // No need to analyze deleted, vectorized and non-vectorizable
19189 // instructions.
19190 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
19191 isValidElementType(P->getType()))
19192 Incoming.push_back(P);
19193 }
19194
19195 if (Incoming.size() <= 1)
19196 break;
19197
19198 // Find the corresponding non-phi nodes for better matching when trying to
19199 // build the tree.
19200 for (Value *V : Incoming) {
19201 SmallVectorImpl<Value *> &Opcodes =
19202 PHIToOpcodes.try_emplace(V).first->getSecond();
19203 if (!Opcodes.empty())
19204 continue;
19205 SmallVector<Value *, 4> Nodes(1, V);
19206 SmallPtrSet<PHINode *, 4> Visited;
19207 while (!Nodes.empty()) {
19208 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
19209 if (!Visited.insert(PHI).second)
19210 continue;
19211 for (Value *V : PHI->incoming_values()) {
19212 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
19213 Nodes.push_back(PHI1);
19214 continue;
19215 }
19216 Opcodes.emplace_back(V);
19217 }
19218 }
19219 }
19220
19221 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
19222 Incoming, PHICompare, AreCompatiblePHIs,
19223 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
19224 return tryToVectorizeList(Candidates, R, MaxVFOnly);
19225 },
19226 /*MaxVFOnly=*/true, R);
19227 Changed |= HaveVectorizedPhiNodes;
19228 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
19229 auto *PHI = dyn_cast<PHINode>(P.first);
19230 return !PHI || R.isDeleted(PHI);
19231 }))
19232 PHIToOpcodes.clear();
19233 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
19234 } while (HaveVectorizedPhiNodes);
19235
19236 VisitedInstrs.clear();
19237
19238 InstSetVector PostProcessInserts;
19239 SmallSetVector<CmpInst *, 8> PostProcessCmps;
19240 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
19241 // also vectorizes `PostProcessCmps`.
19242 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
19243 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
19244 if (VectorizeCmps) {
19245 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
19246 PostProcessCmps.clear();
19247 }
19248 PostProcessInserts.clear();
19249 return Changed;
19250 };
19251 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
19252 auto IsInPostProcessInstrs = [&](Instruction *I) {
19253 if (auto *Cmp = dyn_cast<CmpInst>(I))
19254 return PostProcessCmps.contains(Cmp);
19255 return isa<InsertElementInst, InsertValueInst>(I) &&
19256 PostProcessInserts.contains(I);
19257 };
19258 // Returns true if `I` is an instruction without users, like a terminator, a
19259 // function call with an ignored return value, or a store. Ignore unused
19260 // instructions (based on the instruction type, except for CallInst and InvokeInst).
19261 auto HasNoUsers = [](Instruction *I) {
19262 return I->use_empty() &&
19263 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
19264 };
19265 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
19266 // Skip instructions with scalable types. The number of elements is unknown
19267 // at compile time for scalable types.
19268 if (isa<ScalableVectorType>(It->getType()))
19269 continue;
19270
19271 // Skip instructions marked for deletion.
19272 if (R.isDeleted(&*It))
19273 continue;
19274 // We may go through BB multiple times so skip the ones we have already checked.
19275 if (!VisitedInstrs.insert(&*It).second) {
19276 if (HasNoUsers(&*It) &&
19277 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
19278 // We would like to start over since some instructions are deleted
19279 // and the iterator may become invalid value.
19280 Changed = true;
19281 It = BB->begin();
19282 E = BB->end();
19283 }
19284 continue;
19285 }
19286
19287 if (isa<DbgInfoIntrinsic>(It))
19288 continue;
19289
19290 // Try to vectorize reductions that use PHINodes.
19291 if (PHINode *P = dyn_cast<PHINode>(It)) {
19292 // Check that the PHI is a reduction PHI.
19293 if (P->getNumIncomingValues() == 2) {
19294 // Try to match and vectorize a horizontal reduction.
19295 Instruction *Root = getReductionInstr(DT, P, BB, LI);
19296 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
19297 Changed = true;
19298 It = BB->begin();
19299 E = BB->end();
19300 continue;
19301 }
19302 }
19303 // Try to vectorize the incoming values of the PHI, to catch reductions
19304 // that feed into PHIs.
19305 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
19306 // Skip if the incoming block is the current BB for now. Also, bypass
19307 // unreachable IR for efficiency and to avoid crashing.
19308 // TODO: Collect the skipped incoming values and try to vectorize them
19309 // after processing BB.
19310 if (BB == P->getIncomingBlock(I) ||
19311 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
19312 continue;
19313
19314 // Postponed instructions should not be vectorized here, delay their
19315 // vectorization.
19316 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
19317 PI && !IsInPostProcessInstrs(PI)) {
19318 bool Res = vectorizeRootInstruction(nullptr, PI,
19319 P->getIncomingBlock(I), R, TTI);
19320 Changed |= Res;
19321 if (Res && R.isDeleted(P)) {
19322 It = BB->begin();
19323 E = BB->end();
19324 break;
19325 }
19326 }
19327 }
19328 continue;
19329 }
19330
19331 if (HasNoUsers(&*It)) {
19332 bool OpsChanged = false;
19333 auto *SI = dyn_cast<StoreInst>(It);
19334 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
19335 if (SI) {
19336 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
19337 // Try to vectorize chain in store, if this is the only store to the
19338 // address in the block.
19339 // TODO: This is just a temporary solution to save compile time. Need
19340 // to investigate if we can safely turn on slp-vectorize-hor-store
19341 // instead to allow lookup for reduction chains in all non-vectorized
19342 // stores (need to check side effects and compile time).
19343 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
19344 SI->getValueOperand()->hasOneUse();
19345 }
19346 if (TryToVectorizeRoot) {
19347 for (auto *V : It->operand_values()) {
19348 // Postponed instructions should not be vectorized here, delay their
19349 // vectorization.
19350 if (auto *VI = dyn_cast<Instruction>(V);
19351 VI && !IsInPostProcessInstrs(VI))
19352 // Try to match and vectorize a horizontal reduction.
19353 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
19354 }
19355 }
19356 // Start vectorization of post-process list of instructions from the
19357 // top-tree instructions to try to vectorize as many instructions as
19358 // possible.
19359 OpsChanged |=
19360 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
19361 if (OpsChanged) {
19362 // We would like to start over since some instructions are deleted
19363 // and the iterator may become invalid value.
19364 Changed = true;
19365 It = BB->begin();
19366 E = BB->end();
19367 continue;
19368 }
19369 }
19370
19371 if (isa<InsertElementInst, InsertValueInst>(It))
19372 PostProcessInserts.insert(&*It);
19373 else if (isa<CmpInst>(It))
19374 PostProcessCmps.insert(cast<CmpInst>(&*It));
19375 }
19376
19377 return Changed;
19378}
19379
19380bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
19381 auto Changed = false;
19382 for (auto &Entry : GEPs) {
19383 // If the getelementptr list has fewer than two elements, there's nothing
19384 // to do.
19385 if (Entry.second.size() < 2)
19386 continue;
19387
19388 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
19389 << Entry.second.size() << ".\n");
19390
19391 // Process the GEP list in chunks suitable for the target's supported
19392 // vector size. If a vector register can't hold 1 element, we are done. We
19393 // are trying to vectorize the index computations, so the maximum number of
19394 // elements is based on the size of the index expression, rather than the
19395 // size of the GEP itself (the target's pointer size).
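// Worked example (a sketch, using assumed numbers): with a 128-bit maximum
// vector register and i32 index expressions, EltSize == 32, so
// MaxElts == 128 / 32 == 4 and the GEP list below is processed in chunks of
// at most four getelementptrs.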
19396 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
19397 return !R.isDeleted(GEP);
19398 });
19399 if (It == Entry.second.end())
19400 continue;
19401 unsigned MaxVecRegSize = R.getMaxVecRegSize();
19402 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
19403 if (MaxVecRegSize < EltSize)
19404 continue;
19405
19406 unsigned MaxElts = MaxVecRegSize / EltSize;
19407 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19408 auto Len = std::min<unsigned>(BE - BI, MaxElts);
19409 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19410
19411 // Initialize a set of candidate getelementptrs. Note that we use a
19412 // SetVector here to preserve program order. If the index computations
19413 // are vectorizable and begin with loads, we want to minimize the chance
19414 // of having to reorder them later.
19415 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19416
19417 // Some of the candidates may have already been vectorized after we
19418 // initially collected them, or their index was optimized to a constant value.
19419 // If so, they are marked as deleted, so remove them from the set of
19420 // candidates.
19421 Candidates.remove_if([&R](Value *I) {
19422 return R.isDeleted(cast<Instruction>(I)) ||
19423 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
19424 });
19425
19426 // Remove from the set of candidates all pairs of getelementptrs with
19427 // constant differences. Such getelementptrs are likely not good
19428 // candidates for vectorization in a bottom-up phase since one can be
19429 // computed from the other. We also ensure all candidate getelementptr
19430 // indices are unique.
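// Illustrative example (assumed IR, not from the original source): given
//   %g0 = getelementptr i32, ptr %base, i64 %i
//   %g1 = getelementptr i32, ptr %base, i64 %j   ; where %j == %i + 1
// SE->getMinusSCEV of the two GEP SCEVs folds to a constant (4 bytes), so
// both are dropped from Candidates: one address is cheaply derivable from
// the other, which makes vectorizing their indices unattractive.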
19431 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19432 auto *GEPI = GEPList[I];
19433 if (!Candidates.count(GEPI))
19434 continue;
19435 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
19436 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19437 auto *GEPJ = GEPList[J];
19438 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
19439 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
19440 Candidates.remove(GEPI);
19441 Candidates.remove(GEPJ);
19442 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19443 Candidates.remove(GEPJ);
19444 }
19445 }
19446 }
19447
19448 // We break out of the above computation as soon as we know there are
19449 // fewer than two candidates remaining.
19450 if (Candidates.size() < 2)
19451 continue;
19452
19453 // Add the single, non-constant index of each candidate to the bundle. We
19454 // ensured the indices met these constraints when we originally collected
19455 // the getelementptrs.
19456 SmallVector<Value *, 16> Bundle(Candidates.size());
19457 auto BundleIndex = 0u;
19458 for (auto *V : Candidates) {
19459 auto *GEP = cast<GetElementPtrInst>(V);
19460 auto *GEPIdx = GEP->idx_begin()->get();
19461 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19462 Bundle[BundleIndex++] = GEPIdx;
19463 }
19464
19465 // Try and vectorize the indices. We are currently only interested in
19466 // gather-like cases of the form:
19467 //
19468 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19469 //
19470 // where the loads of "a", the loads of "b", and the subtractions can be
19471 // performed in parallel. It's likely that detecting this pattern in a
19472 // bottom-up phase will be simpler and less costly than building a
19473 // full-blown top-down phase beginning at the consecutive loads.
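// A rough IR-level picture of such a bundle (illustrative only):
//   %d0 = sub i64 %a0, %b0
//   %d1 = sub i64 %a1, %b1
//   %p0 = getelementptr i32, ptr %g, i64 %d0
//   %p1 = getelementptr i32, ptr %g, i64 %d1
// Here Bundle == {%d0, %d1}; if the subtractions (and the loads feeding
// them) vectorize, the scalar index computations are replaced by a vector
// subtraction whose lanes are extracted for the individual GEPs.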
19474 Changed |= tryToVectorizeList(Bundle, R);
19475 }
19476 }
19477 return Changed;
19478}
19479
19480bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19481 bool Changed = false;
19482 // Sort by type, base pointers, and value operands. Value operands must be
19483 // compatible (have the same opcode and the same parent); otherwise it is
19484 // definitely not profitable to try to vectorize them.
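// For example (a sketch, not from the original source): two stores of i32
// add instructions from the same basic block compare as equivalent under
// this strict weak order (getSameOpcode succeeds for their value operands),
// so the sort keeps them adjacent and AreCompatibleStores below can group
// them into a single vectorization attempt, while an i32 store and a float
// store are kept apart by the TypeID keys.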
19485 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19486 if (V->getValueOperand()->getType()->getTypeID() <
19487 V2->getValueOperand()->getType()->getTypeID())
19488 return true;
19489 if (V->getValueOperand()->getType()->getTypeID() >
19490 V2->getValueOperand()->getType()->getTypeID())
19491 return false;
19492 if (V->getPointerOperandType()->getTypeID() <
19493 V2->getPointerOperandType()->getTypeID())
19494 return true;
19495 if (V->getPointerOperandType()->getTypeID() >
19496 V2->getPointerOperandType()->getTypeID())
19497 return false;
19498 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
19499 V2->getValueOperand()->getType()->getScalarSizeInBits())
19500 return true;
19501 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
19502 V2->getValueOperand()->getType()->getScalarSizeInBits())
19503 return false;
19504 // UndefValues are compatible with all other values.
19505 if (isa<UndefValue>(V->getValueOperand()) ||
19506 isa<UndefValue>(V2->getValueOperand()))
19507 return false;
19508 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
19509 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19510 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
19511 DT->getNode(I1->getParent());
19512 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
19513 DT->getNode(I2->getParent());
19514 assert(NodeI1 && "Should only process reachable instructions");
19515 assert(NodeI2 && "Should only process reachable instructions");
19516 assert((NodeI1 == NodeI2) ==
19517 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19518 "Different nodes should have different DFS numbers");
19519 if (NodeI1 != NodeI2)
19520 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19521 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19522 if (S.getOpcode())
19523 return false;
19524 return I1->getOpcode() < I2->getOpcode();
19525 }
19526 if (isa<Constant>(V->getValueOperand()) &&
19527 isa<Constant>(V2->getValueOperand()))
19528 return false;
19529 return V->getValueOperand()->getValueID() <
19530 V2->getValueOperand()->getValueID();
19531 };
19532
19533 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19534 if (V1 == V2)
19535 return true;
19536 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19537 return false;
19538 if (V1->getPointerOperandType() != V2->getPointerOperandType())
19539 return false;
19540 // Undefs are compatible with any other value.
19541 if (isa<UndefValue>(V1->getValueOperand()) ||
19542 isa<UndefValue>(V2->getValueOperand()))
19543 return true;
19544 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
19545 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19546 if (I1->getParent() != I2->getParent())
19547 return false;
19548 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19549 return S.getOpcode() > 0;
19550 }
19551 if (isa<Constant>(V1->getValueOperand()) &&
19552 isa<Constant>(V2->getValueOperand()))
19553 return true;
19554 return V1->getValueOperand()->getValueID() ==
19555 V2->getValueOperand()->getValueID();
19556 };
19557
19558 // Attempt to sort and vectorize each of the store-groups.
19559 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
19560 for (auto &Pair : Stores) {
19561 if (Pair.second.size() < 2)
19562 continue;
19563
19564 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19565 << Pair.second.size() << ".\n");
19566
19567 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
19568 continue;
19569
19570 // Reverse the stores to do bottom-to-top analysis. This is important when
19571 // there are several stores to the same address: in that case we need to
19572 // follow the store order (reversed to meet the memory dependencies).
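// For example (sketch): if a block stores to %p as S1; S2; S3 in program
// order, the candidate list handed to tryToVectorizeSequence is
// {S3, S2, S1}, matching the bottom-up direction in which store chains are
// grown and keeping the last store to each address first in the analysis.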
19573 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19574 Pair.second.rend());
19575 Changed |= tryToVectorizeSequence<StoreInst>(
19576 ReversedStores, StoreSorter, AreCompatibleStores,
19577 [&](ArrayRef<StoreInst *> Candidates, bool) {
19578 return vectorizeStores(Candidates, R, Attempted);
19579 },
19580 /*MaxVFOnly=*/false, R);
19581 }
19582 return Changed;
19583}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:533
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:915
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:231
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1345
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:424
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:233
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2070
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2207
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2064
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1323
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2061
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:530
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:747
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:785
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:909
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2281
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1450
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:441
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:873
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:621
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:226
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:146
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:355
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:915
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2277
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1054
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2492
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:508
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2480
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1824
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2555
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1883
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:845
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1770
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2386
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2417
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2269
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2514
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1683
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2293
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2181
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2216
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1843
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2432
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1604
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1378
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:631
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2686
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:282
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:754
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
An instruction for reading from memory.
Definition: Instructions.h:174
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:259
Value * getPointerOperand()
Definition: Instructions.h:253
bool isSimple() const
Definition: Instructions.h:245
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:209
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:95
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:384
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
iterator end() const
Definition: SmallPtrSet.h:460
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
iterator begin() const
Definition: SmallPtrSet.h:455
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:441
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:290
Type * getPointerOperandType() const
Definition: Instructions.h:380
Value * getValueOperand()
Definition: Instructions.h:374
Value * getPointerOperand()
Definition: Instructions.h:377
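StoreInst exposes the stored value and the destination address separately, which is how a store chain is split into the scalars to widen and the pointers to analyze. A hedged sketch (collectStoredValues is an illustrative helper, not an API of this file):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"

// Pull out the value operand of each store; these scalars are the roots a
// bottom-up vectorizer would try to merge into one vector store.
static llvm::SmallVector<llvm::Value *, 8>
collectStoredValues(llvm::ArrayRef<llvm::StoreInst *> Chain) {
  llvm::SmallVector<llvm::Value *, 8> Values;
  Values.reserve(Chain.size());
  for (llvm::StoreInst *SI : Chain)
    Values.push_back(SI->getValueOperand());
  return Values;
}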
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
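The TargetTransformInfo queries above are the building blocks of the cost model: the scalar and vector forms of an operation are priced with the same TargetCostKind and compared. A minimal sketch, assuming a fixed 4-wide i32 add (the helper name and the hard-coded VF are illustrative, and a real model would also charge gather/extract overhead):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Positive result means the single vector add is modelled as cheaper than
// four scalar adds under reciprocal-throughput costing.
static InstructionCost addVectorizationSavings(const TargetTransformInfo &TTI,
                                               LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *V4I32 = FixedVectorType::get(I32, 4);
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, I32, CostKind);
  ScalarCost *= 4;
  InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(Instruction::Add, V4I32, CostKind);
  return ScalarCost - VectorCost;
}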
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:283
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:212
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
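Most of the Type predicates above come in scalar-or-vector flavours (isIntOrIntVectorTy, isFPOrFPVectorTy), and getScalarType()/getScalarSizeInBits() let the same code handle scalars and vectors uniformly. A small sketch of that idiom (the helper is illustrative):

#include "llvm/IR/Type.h"

// True for integer types of at most 32 bits and for vectors of such element
// types; no special-casing of the vector case is needed.
static bool isNarrowIntOrIntVector(llvm::Type *Ty) {
  return Ty->isIntOrIntVectorTy() && Ty->getScalarSizeInBits() <= 32;
}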
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:30
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
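replaceAllUsesWith() and takeName() together form the usual hand-off when a new value supersedes an old one: the replacement inherits the name and every remaining user is redirected. A hedged sketch (the helper name is not part of the pass):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

// New takes over Old's name (for readable IR), then every use of Old is
// rewritten to refer to New; afterwards Old->use_empty() is true.
static void replaceAndKeepName(llvm::Instruction *Old, llvm::Value *New) {
  New->takeName(Old);
  Old->replaceAllUsesWith(New);
}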
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:664
Type * getElementType() const
Definition: DerivedTypes.h:436
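VectorType::get() with a fixed ElementCount is how a bundle of VF scalars is given its widened type. A minimal sketch (getWidenedType here is an illustrative helper, not necessarily the one used by the pass):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"

// Build the fixed-width vector type <VF x EltTy> for a bundle of VF scalars.
static llvm::VectorType *getWidenedType(llvm::Type *EltTy, unsigned VF) {
  return llvm::VectorType::get(EltTy, llvm::ElementCount::getFixed(VF));
}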
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed for being a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target-specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if any of the given values is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
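Taken together, the BoUpSLP entries above outline the usual driver sequence: build the tree from a bundle of roots, reorder it, record external uses, narrow bitwidths, and only emit code if the modelled cost is negative enough. A hedged sketch of that flow, written as it might appear inside SLPVectorizer.cpp where BoUpSLP and the analyses are in scope; the helper name, the Threshold parameter, and the exact ordering of the steps are illustrative:

// Hypothetical driver over the candidate scalars VL; Threshold stands in for
// the pass's cost-threshold option.
static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> VL,
                               const SmallDenseSet<Value *> &UserIgnoreList,
                               int Threshold) {
  R.buildTree(VL, UserIgnoreList);      // grow the use-def tree from the roots
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                       // too small to be worth modelling
  R.reorderTopToBottom();               // pick profitable lane/operand orders
  R.reorderBottomToTop();
  R.buildExternalUses();                // scalars still used outside the tree
  R.computeMinimumValueSizes();         // try to narrow element bitwidths
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= -Threshold)
    return false;                       // modelled as unprofitable
  R.vectorizeTree();                    // emit vector code and extracts
  return true;
}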
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:105
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:826
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1539
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
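The PatternMatch helpers above compose into declarative tree matchers: match() walks the expression while the m_* combinators bind operands as they go. A small sketch recognising (X << C) | Y, a shape relevant to load-combine style checks (the function name is illustrative):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Binds X and Y when V has the form (X << C) | Y for some constant shift C.
static bool matchShiftedOr(Value *V, Value *&X, Value *&Y) {
  const APInt *C;
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y)));
}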
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
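getPointersDiff() reports the distance in elements of the given type, so two loads are consecutive exactly when the difference is one. A hedged sketch (the helper name is illustrative; StrictCheck is enabled to require a provable constant distance):

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>

using namespace llvm;

// True when B loads the element immediately after A, i.e. the pointers differ
// by exactly one element of the loaded type.
static bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                                ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}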
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
void stable_sort(R &&Range)
Definition: STLExtras.h:2020
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
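The range wrappers listed here (all_of, any_of, none_of, count_if, and friends) replace explicit begin()/end() pairs throughout the pass. A small illustrative example combining two of them (the predicate itself is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Every candidate must be a load, and at least one of them must have more
// than a single use, for this (illustrative) predicate to hold.
static bool allLoadsSomeReused(ArrayRef<Value *> VL) {
  return all_of(VL, [](Value *V) { return isa<LoadInst>(V); }) &&
         any_of(VL, [](Value *V) { return !V->hasOneUse(); });
}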
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:127
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1210
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2431
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7151
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1678
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2090
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1935
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:400
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
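createStrideMask(Start, Stride, VF) produces the indices Start, Start+Stride, ..., Start+(VF-1)*Stride, the usual way to select every Stride-th lane. A tiny example (the helper name is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

// Selects the odd lanes of an 8-element vector: {1, 3, 5, 7}.
static llvm::SmallVector<int, 16> oddLaneMask() {
  return llvm::createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
}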
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition: STLExtras.h:1909
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1308
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1921
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1997
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1928
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1886
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
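hash_combine() folds several hashable fields into one hash_code, which is how composite keys (for example, opcode plus result type) are typically bucketed. A small sketch (the helper is illustrative):

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instruction.h"

// Bucket instructions by their opcode and result type.
static llvm::hash_code hashInstShape(const llvm::Instruction *I) {
  return llvm::hash_combine(I->getOpcode(), I->getType());
}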
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2228
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.