LLVM 23.0.0git
SLPVectorizer.cpp
Go to the documentation of this file.
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
132static cl::opt<bool>
133ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
134 cl::desc("Attempt to vectorize horizontal reductions"));
135
137 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
138 cl::desc(
139 "Attempt to vectorize horizontal reductions feeding into a store"));
140
142 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
143 cl::desc("Improve the code quality by splitting alternate instructions"));
144
145static cl::opt<int>
147 cl::desc("Attempt to vectorize for this register size in bits"));
148
151 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
152
153/// Limits the size of scheduling regions in a block.
154/// It avoid long compile times for _very_ large blocks where vector
155/// instructions are spread over a wide range.
156/// This limit is way higher than needed by real-world functions.
157static cl::opt<int>
158ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
159 cl::desc("Limit the size of the SLP scheduling region per block"));
160
162 "slp-min-reg-size", cl::init(128), cl::Hidden,
163 cl::desc("Attempt to vectorize for this register size in bits"));
164
166 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
167 cl::desc("Limit the recursion depth when building a vectorizable tree"));
168
170 "slp-min-tree-size", cl::init(3), cl::Hidden,
171 cl::desc("Only vectorize small trees if they are fully vectorizable"));
172
173// The maximum depth that the look-ahead score heuristic will explore.
174// The higher this value, the higher the compilation time overhead.
176 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
177 cl::desc("The maximum look-ahead depth for operand reordering scores"));
178
179// The maximum depth that the look-ahead score heuristic will explore
180// when it probing among candidates for vectorization tree roots.
181// The higher this value, the higher the compilation time overhead but unlike
182// similar limit for operands ordering this is less frequently used, hence
183// impact of higher value is less noticeable.
185 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
186 cl::desc("The maximum look-ahead depth for searching best rooting option"));
187
189 "slp-min-strided-loads", cl::init(2), cl::Hidden,
190 cl::desc("The minimum number of loads, which should be considered strided, "
191 "if the stride is > 1 or is runtime value"));
192
194 "slp-max-stride", cl::init(8), cl::Hidden,
195 cl::desc("The maximum stride, considered to be profitable."));
196
197static cl::opt<bool>
198 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
199 cl::desc("Disable tree reordering even if it is "
200 "profitable. Used for testing only."));
201
202static cl::opt<bool>
203 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
204 cl::desc("Generate strided loads even if they are not "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ViewSLPTree("view-slp-tree", cl::Hidden,
209 cl::desc("Display the SLP trees with Graphviz"));
210
212 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
213 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
214
215/// Enables vectorization of copyable elements.
217 "slp-copyable-elements", cl::init(true), cl::Hidden,
218 cl::desc("Try to replace values with the idempotent instructions for "
219 "better vectorization."));
220
222 "slp-cost-loop-trip-count", cl::init(2), cl::Hidden,
223 cl::desc("Loop trip count, considered by the cost model during "
224 "modeling (0=loops are ignored and considered flat code)"));
225
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important thing to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
261/// Returns the type of the given value/instruction \p V. If it is store,
262/// returns the type of its value operand, for Cmp - the types of the compare
263/// operands and for insertelement - the type os the inserted operand.
264/// Otherwise, just the type of the value is returned.
266 if (auto *SI = dyn_cast<StoreInst>(V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(V))
269 return CI->getOperand(0)->getType();
270 if (!SLPReVec)
271 if (auto *IE = dyn_cast<InsertElementInst>(V))
272 return IE->getOperand(1)->getType();
273 return V->getType();
274}
275
276/// \returns the number of elements for Ty.
277static unsigned getNumElements(Type *Ty) {
279 "ScalableVectorType is not supported.");
280 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
281 return VecTy->getNumElements();
282 return 1;
283}
284
285/// \returns the vector type of ScalarTy based on vectorization factor.
286static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
287 return FixedVectorType::get(ScalarTy->getScalarType(),
288 VF * getNumElements(ScalarTy));
289}
290
291/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
292/// which forms type, which splits by \p TTI into whole vector types during
293/// legalization.
295 Type *Ty, unsigned Sz) {
296 if (!isValidElementType(Ty))
297 return bit_ceil(Sz);
298 // Find the number of elements, which forms full vectors.
299 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
300 if (NumParts == 0 || NumParts >= Sz)
301 return bit_ceil(Sz);
302 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
303}
304
305/// Returns the number of elements of the given type \p Ty, not greater than \p
306/// Sz, which forms type, which splits by \p TTI into whole vector types during
307/// legalization.
308static unsigned
310 unsigned Sz) {
311 if (!isValidElementType(Ty))
312 return bit_floor(Sz);
313 // Find the number of elements, which forms full vectors.
314 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
315 if (NumParts == 0 || NumParts >= Sz)
316 return bit_floor(Sz);
317 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
318 if (RegVF > Sz)
319 return bit_floor(Sz);
320 return (Sz / RegVF) * RegVF;
321}
322
323static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
324 SmallVectorImpl<int> &Mask) {
325 // The ShuffleBuilder implementation use shufflevector to splat an "element".
326 // But the element have different meaning for SLP (scalar) and REVEC
327 // (vector). We need to expand Mask into masks which shufflevector can use
328 // directly.
329 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
330 for (unsigned I : seq<unsigned>(Mask.size()))
331 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
332 I * VecTyNumElements, VecTyNumElements)))
333 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
334 : Mask[I] * VecTyNumElements + J;
335 Mask.swap(NewMask);
336}
337
338/// \returns the number of groups of shufflevector
339/// A group has the following features
340/// 1. All of value in a group are shufflevector.
341/// 2. The mask of all shufflevector is isExtractSubvectorMask.
342/// 3. The mask of all shufflevector uses all of the elements of the source.
343/// e.g., it is 1 group (%0)
344/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
346/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
347/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
348/// it is 2 groups (%3 and %4)
349/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
351/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
352/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
353/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
355/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
356/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
357/// it is 0 group
358/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
361/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
363 if (VL.empty())
364 return 0;
366 return 0;
367 auto *SV = cast<ShuffleVectorInst>(VL.front());
368 unsigned SVNumElements =
369 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
370 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
371 if (SVNumElements % ShuffleMaskSize != 0)
372 return 0;
373 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
374 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
375 return 0;
376 unsigned NumGroup = 0;
377 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
378 auto *SV = cast<ShuffleVectorInst>(VL[I]);
379 Value *Src = SV->getOperand(0);
380 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
381 SmallBitVector ExpectedIndex(GroupSize);
382 if (!all_of(Group, [&](Value *V) {
383 auto *SV = cast<ShuffleVectorInst>(V);
384 // From the same source.
385 if (SV->getOperand(0) != Src)
386 return false;
387 int Index;
388 if (!SV->isExtractSubvectorMask(Index))
389 return false;
390 ExpectedIndex.set(Index / ShuffleMaskSize);
391 return true;
392 }))
393 return 0;
394 if (!ExpectedIndex.all())
395 return 0;
396 ++NumGroup;
397 }
398 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
399 return NumGroup;
400}
401
402/// \returns a shufflevector mask which is used to vectorize shufflevectors
403/// e.g.,
404/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
406/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
407/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
408/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
410/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
411/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
412/// the result is
413/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
415 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
416 auto *SV = cast<ShuffleVectorInst>(VL.front());
417 unsigned SVNumElements =
418 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
419 SmallVector<int> Mask;
420 unsigned AccumulateLength = 0;
421 for (Value *V : VL) {
422 auto *SV = cast<ShuffleVectorInst>(V);
423 for (int M : SV->getShuffleMask())
424 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
425 : AccumulateLength + M);
426 AccumulateLength += SVNumElements;
427 }
428 return Mask;
429}
430
431/// \returns True if the value is a constant (but not globals/constant
432/// expressions).
433static bool isConstant(Value *V) {
435}
436
437/// Checks if \p V is one of vector-like instructions, i.e. undef,
438/// insertelement/extractelement with constant indices for fixed vector type or
439/// extractvalue instruction.
443 return false;
444 auto *I = dyn_cast<Instruction>(V);
445 if (!I || isa<ExtractValueInst>(I))
446 return true;
447 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
448 return false;
450 return isConstant(I->getOperand(1));
451 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
452 return isConstant(I->getOperand(2));
453}
454
455/// Returns power-of-2 number of elements in a single register (part), given the
456/// total number of elements \p Size and number of registers (parts) \p
457/// NumParts.
458static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
459 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
460}
461
462/// Returns correct remaining number of elements, considering total amount \p
463/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
464/// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // The final register may hold fewer than PartNumElems elements.
  unsigned Remaining = Size - Part * PartNumElems;
  return std::min<unsigned>(PartNumElems, Remaining);
}
469
470#if !defined(NDEBUG)
471/// Print a short descriptor of the instruction bundle suitable for debug output.
472static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
473 std::string Result;
474 raw_string_ostream OS(Result);
475 if (Idx >= 0)
476 OS << "Idx: " << Idx << ", ";
477 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
478 return Result;
479}
480#endif
481
482/// \returns true if all of the instructions in \p VL are in the same block or
483/// false otherwise.
485 auto *It = find_if(VL, IsaPred<Instruction>);
486 if (It == VL.end())
487 return false;
490 return true;
491
492 BasicBlock *BB = I0->getParent();
493 for (Value *V : iterator_range(It, VL.end())) {
494 if (isa<PoisonValue>(V))
495 continue;
496 auto *II = dyn_cast<Instruction>(V);
497 if (!II)
498 return false;
499
500 if (BB != II->getParent())
501 return false;
502 }
503 return true;
504}
505
506/// \returns True if all of the values in \p VL are constants (but not
507/// globals/constant expressions).
509 // Constant expressions and globals can't be vectorized like normal integer/FP
510 // constants.
511 return all_of(VL, isConstant);
512}
513
514/// \returns True if all of the values in \p VL are identical or some of them
515/// are UndefValue.
516static bool isSplat(ArrayRef<Value *> VL) {
517 Value *FirstNonUndef = nullptr;
518 for (Value *V : VL) {
519 if (isa<UndefValue>(V))
520 continue;
521 if (!FirstNonUndef) {
522 FirstNonUndef = V;
523 continue;
524 }
525 if (V != FirstNonUndef)
526 return false;
527 }
528 return FirstNonUndef != nullptr;
529}
530
531/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
532/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
533/// patterns that make it effectively commutative (like equality comparisons
534/// with zero).
535/// In most cases, users should not call this function directly (since \p I and
536/// \p InstWithUses are the same). However, when analyzing interchangeable
537/// instructions, we need to use the converted opcode along with the original
538/// uses.
539/// \param I The instruction to check for commutativity
540/// \param ValWithUses The value whose uses are analyzed for special
541/// patterns
542static bool isCommutative(Instruction *I, Value *ValWithUses,
543 bool IsCopyable = false) {
544 if (auto *Cmp = dyn_cast<CmpInst>(I))
545 return Cmp->isCommutative();
546 if (auto *BO = dyn_cast<BinaryOperator>(I))
547 return BO->isCommutative() ||
548 (BO->getOpcode() == Instruction::Sub &&
549 ValWithUses->hasUseList() &&
550 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
551 all_of(
552 ValWithUses->uses(),
553 [&](const Use &U) {
554 // Commutative, if icmp eq/ne sub, 0
555 CmpPredicate Pred;
556 if (match(U.getUser(),
557 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
558 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
559 return true;
560 // Commutative, if abs(sub nsw, true) or abs(sub, false).
561 ConstantInt *Flag;
562 auto *I = dyn_cast<BinaryOperator>(U.get());
563 return match(U.getUser(),
564 m_Intrinsic<Intrinsic::abs>(
565 m_Specific(U.get()), m_ConstantInt(Flag))) &&
566 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
567 Flag->isOne());
568 })) ||
569 (BO->getOpcode() == Instruction::FSub &&
570 ValWithUses->hasUseList() &&
571 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
572 all_of(ValWithUses->uses(), [](const Use &U) {
573 return match(U.getUser(),
574 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
575 }));
576 return I->isCommutative();
577}
578
579/// Checks if the operand is commutative. In commutative operations, not all
580/// operands might commutable, e.g. for fmuladd only 2 first operands are
581/// commutable.
582static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
583 bool IsCopyable = false) {
584 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
585 "The instruction is not commutative.");
586 if (isa<CmpInst>(I))
587 return true;
588 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
589 switch (BO->getOpcode()) {
590 case Instruction::Sub:
591 case Instruction::FSub:
592 return true;
593 default:
594 break;
595 }
596 }
597 return I->isCommutableOperand(Op);
598}
599
600/// This is a helper function to check whether \p I is commutative.
601/// This is a convenience wrapper that calls the two-parameter version of
602/// isCommutative with the same instruction for both parameters. This is
603/// the common case where the instruction being checked for commutativity
604/// is the same as the instruction whose uses are analyzed for special
605/// patterns (see the two-parameter version above for details).
606/// \param I The instruction to check for commutativity
607/// \returns true if the instruction is commutative, false otherwise
608static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
609
/// \returns number of operands of \p I, considering commutativity. Returns 2
/// for commutative intrinsics.
/// \param I The instruction to check for commutativity
// NOTE(review): the declaration line and opening guard of this function are
// missing from this excerpt; the body below suggests something like
// `static unsigned getNumberOfOperands(Instruction *I) {` followed by an
// `if (isa<IntrinsicInst>(I) && isCommutative(I)) {` check — confirm against
// the upstream file before building.
    // IntrinsicInst::isCommutative returns true if swapping the first "two"
    // arguments to the intrinsic produces the same result.
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  }
  return I->getNumOperands();
}
622
623template <typename T>
624static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
625 unsigned Offset) {
626 static_assert(std::is_same_v<T, InsertElementInst> ||
627 std::is_same_v<T, ExtractElementInst>,
628 "unsupported T");
629 int Index = Offset;
630 if (const auto *IE = dyn_cast<T>(Inst)) {
631 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
632 if (!VT)
633 return std::nullopt;
634 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
635 if (!CI)
636 return std::nullopt;
637 if (CI->getValue().uge(VT->getNumElements()))
638 return std::nullopt;
639 Index *= VT->getNumElements();
640 Index += CI->getZExtValue();
641 return Index;
642 }
643 return std::nullopt;
644}
645
646/// \returns inserting or extracting index of InsertElement, ExtractElement or
647/// InsertValue instruction, using Offset as base offset for index.
648/// \returns std::nullopt if the index is not an immediate.
649static std::optional<unsigned> getElementIndex(const Value *Inst,
650 unsigned Offset = 0) {
651 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
652 return Index;
654 return Index;
655
656 int Index = Offset;
657
658 const auto *IV = dyn_cast<InsertValueInst>(Inst);
659 if (!IV)
660 return std::nullopt;
661
662 Type *CurrentType = IV->getType();
663 for (unsigned I : IV->indices()) {
664 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
665 Index *= ST->getNumElements();
666 CurrentType = ST->getElementType(I);
667 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
668 Index *= AT->getNumElements();
669 CurrentType = AT->getElementType();
670 } else {
671 return std::nullopt;
672 }
673 Index += I;
674 }
675 return Index;
676}
677
678/// \returns true if all of the values in \p VL use the same opcode.
679/// For comparison instructions, also checks if predicates match.
680/// PoisonValues are considered matching.
681/// Interchangeable instructions are not considered.
// NOTE(review): the declaration line of this function is missing from this
// excerpt; judging by the doc comment above and the body it is likely
// `static bool allSameOpcode(ArrayRef<Value *> VL) {` — confirm upstream.
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return true;
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned Opcode = MainOp->getOpcode();
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
  // NOTE(review): the else-arm of this conditional operator is missing from
  // this excerpt (presumably `: CmpInst::BAD_ICMP_PREDICATE;`) — confirm
  // upstream.
  return std::all_of(It, VL.end(), [&](Value *V) {
    // Compares must also agree on the predicate.
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    // Poison lanes match anything.
    return isa<PoisonValue>(V);
  });
}
699
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,  ///< The mask is expected to be for permutation of 1-2 vectors,
             ///< check for the mask elements for the first argument (mask
             ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
716
717/// Prepares a use bitset for the given mask either for the first argument or
718/// for the second.
720 UseMask MaskArg) {
721 SmallBitVector UseMask(VF, true);
722 for (auto [Idx, Value] : enumerate(Mask)) {
723 if (Value == PoisonMaskElem) {
724 if (MaskArg == UseMask::UndefsAsMask)
725 UseMask.reset(Idx);
726 continue;
727 }
728 if (MaskArg == UseMask::FirstArg && Value < VF)
729 UseMask.reset(Value);
730 else if (MaskArg == UseMask::SecondArg && Value >= VF)
731 UseMask.reset(Value - VF);
732 }
733 return UseMask;
734}
735
736/// Checks if the given value is actually an undefined constant vector.
737/// Also, if the \p UseMask is not empty, tries to check if the non-masked
738/// elements actually mask the insertelement buildvector, if any.
739template <bool IsPoisonOnly = false>
741 const SmallBitVector &UseMask = {}) {
742 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
743 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
744 if (isa<T>(V))
745 return Res;
746 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
747 if (!VecTy)
748 return Res.reset();
749 auto *C = dyn_cast<Constant>(V);
750 if (!C) {
751 if (!UseMask.empty()) {
752 const Value *Base = V;
753 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
754 Base = II->getOperand(0);
755 if (isa<T>(II->getOperand(1)))
756 continue;
757 std::optional<unsigned> Idx = getElementIndex(II);
758 if (!Idx) {
759 Res.reset();
760 return Res;
761 }
762 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
763 Res.reset(*Idx);
764 }
765 // TODO: Add analysis for shuffles here too.
766 if (V == Base) {
767 Res.reset();
768 } else {
769 SmallBitVector SubMask(UseMask.size(), false);
770 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
771 }
772 } else {
773 Res.reset();
774 }
775 return Res;
776 }
777 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
778 if (Constant *Elem = C->getAggregateElement(I))
779 if (!isa<T>(Elem) &&
780 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
781 Res.reset(I);
782 }
783 return Res;
784}
785
786/// Checks if the vector of instructions can be represented as a shuffle, like:
787/// %x0 = extractelement <4 x i8> %x, i32 0
788/// %x3 = extractelement <4 x i8> %x, i32 3
789/// %y1 = extractelement <4 x i8> %y, i32 1
790/// %y2 = extractelement <4 x i8> %y, i32 2
791/// %x0x0 = mul i8 %x0, %x0
792/// %x3x3 = mul i8 %x3, %x3
793/// %y1y1 = mul i8 %y1, %y1
794/// %y2y2 = mul i8 %y2, %y2
795/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
796/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
797/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
798/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
799/// ret <4 x i8> %ins4
800/// can be transformed into:
801/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
802/// i32 6>
803/// %2 = mul <4 x i8> %1, %1
804/// ret <4 x i8> %2
805/// Mask will return the Shuffle Mask equivalent to the extracted elements.
806/// TODO: Can we split off and reuse the shuffle mask detection from
807/// ShuffleVectorInst/getShuffleCost?
808static std::optional<TargetTransformInfo::ShuffleKind>
810 AssumptionCache *AC) {
811 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
812 if (It == VL.end())
813 return std::nullopt;
814 unsigned Size =
815 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
816 auto *EI = dyn_cast<ExtractElementInst>(V);
817 if (!EI)
818 return S;
819 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
820 if (!VTy)
821 return S;
822 return std::max(S, VTy->getNumElements());
823 });
824
825 Value *Vec1 = nullptr;
826 Value *Vec2 = nullptr;
827 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
828 auto *EE = dyn_cast<ExtractElementInst>(V);
829 if (!EE)
830 return false;
831 Value *Vec = EE->getVectorOperand();
832 if (isa<UndefValue>(Vec))
833 return false;
834 return isGuaranteedNotToBePoison(Vec, AC);
835 });
836 enum ShuffleMode { Unknown, Select, Permute };
837 ShuffleMode CommonShuffleMode = Unknown;
838 Mask.assign(VL.size(), PoisonMaskElem);
839 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
840 // Undef can be represented as an undef element in a vector.
841 if (isa<UndefValue>(VL[I]))
842 continue;
843 auto *EI = cast<ExtractElementInst>(VL[I]);
844 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
845 return std::nullopt;
846 auto *Vec = EI->getVectorOperand();
847 // We can extractelement from undef or poison vector.
849 continue;
850 // All vector operands must have the same number of vector elements.
851 if (isa<UndefValue>(Vec)) {
852 Mask[I] = I;
853 } else {
854 if (isa<UndefValue>(EI->getIndexOperand()))
855 continue;
856 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
857 if (!Idx)
858 return std::nullopt;
859 // Undefined behavior if Idx is negative or >= Size.
860 if (Idx->getValue().uge(Size))
861 continue;
862 unsigned IntIdx = Idx->getValue().getZExtValue();
863 Mask[I] = IntIdx;
864 }
865 if (isUndefVector(Vec).all() && HasNonUndefVec)
866 continue;
867 // For correct shuffling we have to have at most 2 different vector operands
868 // in all extractelement instructions.
869 if (!Vec1 || Vec1 == Vec) {
870 Vec1 = Vec;
871 } else if (!Vec2 || Vec2 == Vec) {
872 Vec2 = Vec;
873 Mask[I] += Size;
874 } else {
875 return std::nullopt;
876 }
877 if (CommonShuffleMode == Permute)
878 continue;
879 // If the extract index is not the same as the operation number, it is a
880 // permutation.
881 if (Mask[I] % Size != I) {
882 CommonShuffleMode = Permute;
883 continue;
884 }
885 CommonShuffleMode = Select;
886 }
887 // If we're not crossing lanes in different vectors, consider it as blending.
888 if (CommonShuffleMode == Select && Vec2)
890 // If Vec2 was never used, we have a permutation of a single vector, otherwise
891 // we have permutation of 2 vectors.
894}
895
896/// \returns True if Extract{Value,Element} instruction extracts element Idx.
897static std::optional<unsigned> getExtractIndex(const Instruction *E) {
898 unsigned Opcode = E->getOpcode();
899 assert((Opcode == Instruction::ExtractElement ||
900 Opcode == Instruction::ExtractValue) &&
901 "Expected extractelement or extractvalue instruction.");
902 if (Opcode == Instruction::ExtractElement) {
903 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
904 if (!CI)
905 return std::nullopt;
906 // Check if the index is out of bound - we can get the source vector from
907 // operand 0
908 unsigned Idx = CI->getZExtValue();
909 auto *EE = cast<ExtractElementInst>(E);
910 const unsigned VF = ::getNumElements(EE->getVectorOperandType());
911 if (Idx >= VF)
912 return std::nullopt;
913 return Idx;
914 }
915 auto *EI = cast<ExtractValueInst>(E);
916 if (EI->getNumIndices() != 1)
917 return std::nullopt;
918 return *EI->idx_begin();
919}
920
// Forward declarations of scheduling predicates; definitions appear later in
// this file.
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V);
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from the different blocks.
static bool isUsedOutsideBlock(Value *V);
/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V);
935
936/// \returns true if \p Opcode is allowed as part of the main/alternate
937/// instruction for SLP vectorization.
938///
939/// Example of unsupported opcode is SDIV that can potentially cause UB if the
940/// "shuffled out" lane would result in division by zero.
941static bool isValidForAlternation(unsigned Opcode) {
942 return !Instruction::isIntDivRem(Opcode);
943}
944
945namespace {
946
/// Helper class that determines VL can use the same opcode.
/// Alternate instruction is supported. In addition, it supports interchangeable
/// instruction. An interchangeable instruction is an instruction that can be
/// converted to another instruction with same semantics. For example, x << 1 is
/// equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast32_t;
  /// Sort SupportedOp because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  static_assert(llvm::is_sorted_constexpr(SupportedOp) &&
                "SupportedOp is not sorted.");
  /// One bit per supported opcode, plus MainOpBIT meaning "exactly the
  /// original opcode, no conversion needed".
  enum : MaskType {
    ShlBIT = 1,
    AShrBIT = 1 << 1,
    MulBIT = 1 << 2,
    AddBIT = 1 << 3,
    SubBIT = 1 << 4,
    AndBIT = 1 << 5,
    OrBIT = 1 << 6,
    XorBIT = 1 << 7,
    MainOpBIT = 1 << 8,
    // NOTE(review): one source line was lost in extraction here - verify
    // against upstream whether an enumerator or comment belongs here.
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(I);
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
      return {CI, 1};
    // Sub/Shl/AShr are non-commutative: a constant LHS cannot be treated the
    // same way as a constant RHS.
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  /// Tracks, for one instruction, the set of opcodes it can be converted to.
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// The bit it sets represents whether MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know what operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Return false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask will destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    /// Accept \p Opcode only when it is exactly this instruction's opcode.
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
    }
    /// Pick the opcode the tracked instructions will be emitted with.
    /// Preference order: the original opcode, then Shl, AShr, Mul, Add, Sub,
    /// And, Or, Xor.
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Return true if the instruction can be converted to \p Opcode.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      // Opcodes outside SupportedOp are never interchangeable.
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Build the operand list of I rewritten so that it can be emitted with
    /// \p To's opcode (e.g. x << 1 becomes {x, 2} when converting to mul).
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      Type *RHSType = I->getOperand(Pos)->getType();
      Constant *RHS;
      switch (FromOpcode) {
      case Instruction::Shl:
        // x << C -> x * (1 << C); x << 0 -> identity of the target opcode.
        if (ToOpcode == Instruction::Mul) {
          RHS = ConstantInt::get(
              RHSType, APInt::getOneBitSet(FromCIValueBitWidth,
                                           FromCIValue.getZExtValue()));
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Mul:
        // x * (2^C) -> x << C; x * 1 -> identity of the target opcode.
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          RHS = ConstantInt::get(
              RHSType, APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
                                               /*AllowRHSConstant=*/true);
        } else {
          // x + C <-> x - (-C): negate the constant when flipping add/sub.
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          APInt NegatedVal = APInt(FromCIValue);
          NegatedVal.negate();
          RHS = ConstantInt::get(RHSType, NegatedVal);
        }
        break;
      case Instruction::And:
        // x & -1 -> identity of the target opcode.
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      default:
        // Remaining cases require a zero constant -> identity of the target.
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      }
      Value *LHS = I->getOperand(1 - Pos);
      // If the target opcode is non-commutative (e.g., shl, sub),
      // force the variable to the left and the constant to the right.
      if (Pos == 1 || !Instruction::isCommutative(ToOpcode))
        return SmallVector<Value *>({LHS, RHS});

      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  }
  /// Lazily record \p I as the alternate instruction; returns whether an
  /// alternate slot is (now) available.
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    // NOTE(review): a guard line was lost in extraction here - presumably a
    // condition (e.g. !isValidForAlternation(I)) controlling the early
    // "return false"; restore from upstream.
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {}
  /// Fold instruction \p I into the tracked main (or alternate) opcode state.
  /// Returns false when I cannot share an opcode with the instructions seen
  /// so far.
  bool add(const Instruction *I) {
    // NOTE(review): the first line of this assert was lost in extraction -
    // presumably `assert(isa<BinaryOperator>(I) &&`; restore from upstream.
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      // Unsupported binop: only an exact opcode match (main or alt) works.
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      // Identity-like constants widen the set of interchangeable opcodes.
      switch (Opcode) {
      case Instruction::Shl:
        if (CIValue.ult(CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      case Instruction::Xor:
        if (CIValue.isZero())
          InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  /// Checks if the list of potential opcodes includes \p Opcode.
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(I);
  }
};
1252
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///          <4 x i32> intermediated_1,
  ///          <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// Whether the instruction state represents copyable instructions.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    if (MainOp->getOpcode() == Instruction::Select &&
        I->getOpcode() == Instruction::ZExt && !isAltShuffle())
      return MainOp;
    // Prefer AltOp instead of interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    // Only binary operators support opcode conversion.
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(MainOp))
      return nullptr;
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  /// Checks if the value is a copyable element.
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return !isa<PoisonValue>(V);
    if (I->getParent() != MainOp->getParent() &&
        // NOTE(review): the continuation of this condition was lost in
        // extraction - restore from upstream.
      return true;
    if (I->getOpcode() == MainOp->getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(MainOp) ||
           Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
  }

  /// Checks if the value is non-schedulable.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(V);
    if (!HasCopyables)
      return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
             // NOTE(review): the final term of this condition was lost in
             // extraction - restore from upstream.
    // MainOp for copyables always schedulable to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
               // NOTE(review): a line was lost in extraction here - restore
               // from upstream.
               // If the copyable instructions comes after MainOp
               // (non-schedulable, but used in the block) - cannot vectorize
               // it, will possibly generate use before def.
               !MainOp->comesBefore(I));
      };

      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
        // NOTE(review): the final term of this condition was lost in
        // extraction - restore from upstream.
  }

  /// Checks if the state represents copyable instructions.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
1439
1440std::pair<Instruction *, SmallVector<Value *>>
1441convertTo(Instruction *I, const InstructionsState &S) {
1442 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1443 assert(SelectedOp && "Cannot convert the instruction.");
1444 if (I->isBinaryOp()) {
1445 BinOpSameOpcodeHelper Converter(I);
1446 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1447 }
1448 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1449}
1450
1451} // end anonymous namespace
1452
// Forward declaration; the definition follows the helpers it depends on.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);
1455
/// Find an instruction with a specific opcode in VL.
/// \param VL Array of values to search through. Must contain only Instructions
/// and PoisonValues.
/// \param Opcode The instruction opcode to search for
/// \returns
/// - The first instruction found with matching opcode
/// - nullptr if no matching instruction is found
// NOTE(review): the first line of this signature was lost in extraction -
// presumably `static Instruction *findInstructionWithOpcode(ArrayRef<Value *>
// VL,`; restore from upstream.
                                                unsigned Opcode) {
  for (Value *V : VL) {
    // Poison lanes are permitted and simply skipped.
    if (isa<PoisonValue>(V))
      continue;
    assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
    auto *Inst = cast<Instruction>(V);
    if (Inst->getOpcode() == Opcode)
      return Inst;
  }
  return nullptr;
}
1475
1476/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1477/// compatible instructions or constants, or just some other regular values.
1478static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1479 Value *Op1, const TargetLibraryInfo &TLI) {
1480 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1481 (isConstant(BaseOp1) && isConstant(Op1)) ||
1482 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1483 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1484 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1485 getSameOpcode({BaseOp0, Op0}, TLI) ||
1486 getSameOpcode({BaseOp1, Op1}, TLI);
1487}
1488
/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  // NOTE(review): a line was lost in extraction here - presumably the
  // computation of SwappedPred (CmpInst::getSwappedPredicate(BasePred));
  // restore from upstream.

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  // Either same predicate with operands in order, or the swapped predicate
  // with operands exchanged.
  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
1510
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  // NOTE(review): the guard condition on this early exit was lost in
  // extraction - restore from upstream.
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  // The first instruction found is the tentative MainOp; lists that are
  // mostly non-instructions are rejected.
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
  // NOTE(review): the else-branch of this conditional expression was lost in
  // extraction - restore from upstream.
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  // Swapped cmp predicates are treated as compatible (rather than alternate)
  // when doing so collapses >2 distinct predicates down to exactly 2.
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if consider swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  // Check MainOp too to be sure that it matches the requirements for the
  // instructions.
  for (Value *V : iterator_range(It, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    // TODO: do some smart analysis of the CallInsts to exclude divide-like
    // intrinsics/functions only.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (BinOpHelper.add(I))
        continue;
    } else if (IsCastOp && isa<CastInst>(I)) {
      // Casts may alternate only when the source types match.
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        // NOTE(review): the guard condition on this early exit was lost in
        // extraction - restore from upstream.
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            // NOTE(review): a line was lost in extraction here - restore from
            // upstream.
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
        // NOTE(review): lines were lost in extraction here (the closing
        // std::equal argument and the computation of the intrinsic ID) -
        // restore from upstream.
          return InstructionsState::invalid();
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          // Non-intrinsic calls must map to identical vector-function ABIs.
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  if (IsBinOp) {
    // Re-anchor Main/Alt on instructions actually present in VL.
    MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
    assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
    AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
    assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
  }
  assert((MainOp == AltOp || !allSameOpcode(VL)) &&
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
  assert(all_of(VL,
                [&](Value *V) {
                  return isa<PoisonValue>(V) ||
                         S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
                }) &&
         "Invalid InstructionsState.");
  return S;
}
1705
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
// NOTE(review): the signature line was lost in extraction - presumably
// `static bool allSameType(ArrayRef<Value *> VL) {`; restore from upstream.
  Type *Ty = VL.consume_front()->getType();
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
}
1712
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    // Loads keep their pointer operand as a scalar.
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    // Stores keep their pointer operand as a scalar.
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    // NOTE(review): a line was lost in extraction here - presumably the
    // computation of the intrinsic ID for CI; restore from upstream.
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
1742
/// \returns the AA location that is being access by the instruction.
// NOTE(review): the signature line and the StoreInst guard were lost in
// extraction - restore from upstream.
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  // Anything that is not a plain load/store gets an unknown (empty) location.
  return MemoryLocation();
}
1751
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  // NOTE(review): a guard line was lost in extraction here (presumably the
  // StoreInst dyn_cast) - restore from upstream.
    return SI->isSimple();
  // NOTE(review): a guard line was lost in extraction here (presumably a
  // MemIntrinsic dyn_cast) - restore from upstream.
    return !MI->isVolatile();
  return true;
}
1762
1763/// Shuffles \p Mask in accordance with the given \p SubMask.
1764/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1765/// one but two input vectors.
1766static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1767 bool ExtendingManyInputs = false) {
1768 if (SubMask.empty())
1769 return;
1770 assert(
1771 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1772 // Check if input scalars were extended to match the size of other node.
1773 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1774 "SubMask with many inputs support must be larger than the mask.");
1775 if (Mask.empty()) {
1776 Mask.append(SubMask.begin(), SubMask.end());
1777 return;
1778 }
1779 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1780 int TermValue = std::min(Mask.size(), SubMask.size());
1781 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1782 if (SubMask[I] == PoisonMaskElem ||
1783 (!ExtendingManyInputs &&
1784 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1785 continue;
1786 NewMask[I] = Mask[SubMask[I]];
1787 }
1788 Mask.swap(NewMask);
1789}
1790
1791/// Order may have elements assigned special value (size) which is out of
1792/// bounds. Such indices only appear on places which correspond to undef values
1793/// (see canReuseExtract for details) and used in order to avoid undef values
1794/// have effect on operands ordering.
1795/// The first loop below simply finds all unused indices and then the next loop
1796/// nest assigns these indices for undef values positions.
1797/// As an example below Order has two undef positions and they have assigned
1798/// values 3 and 7 respectively:
1799/// before: 6 9 5 4 9 2 1 0
1800/// after: 6 3 5 4 7 2 1 0
1802 const size_t Sz = Order.size();
1803 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1804 SmallBitVector MaskedIndices(Sz);
1805 for (unsigned I = 0; I < Sz; ++I) {
1806 if (Order[I] < Sz)
1807 UnusedIndices.reset(Order[I]);
1808 else
1809 MaskedIndices.set(I);
1810 }
1811 if (MaskedIndices.none())
1812 return;
1813 assert(UnusedIndices.count() == MaskedIndices.count() &&
1814 "Non-synced masked/available indices.");
1815 int Idx = UnusedIndices.find_first();
1816 int MIdx = MaskedIndices.find_first();
1817 while (MIdx >= 0) {
1818 assert(Idx >= 0 && "Indices must be synced.");
1819 Order[MIdx] = Idx;
1820 Idx = UnusedIndices.find_next(Idx);
1821 MIdx = MaskedIndices.find_next(MIdx);
1822 }
1823}
1824
1825/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1826/// Opcode1.
1828 unsigned Opcode0, unsigned Opcode1) {
1829 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1830 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1831 for (unsigned Lane : seq<unsigned>(VL.size())) {
1832 if (isa<PoisonValue>(VL[Lane]))
1833 continue;
1834 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1835 OpcodeMask.set(Lane * ScalarTyNumElements,
1836 Lane * ScalarTyNumElements + ScalarTyNumElements);
1837 }
1838 return OpcodeMask;
1839}
1840
1841/// Replicates the given \p Val \p VF times.
1843 unsigned VF) {
1844 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1845 "Expected scalar constants.");
1846 SmallVector<Constant *> NewVal(Val.size() * VF);
1847 for (auto [I, V] : enumerate(Val))
1848 std::fill_n(NewVal.begin() + I * VF, VF, V);
1849 return NewVal;
1850}
1851
1853 SmallVectorImpl<int> &Mask) {
1854 Mask.clear();
1855 const unsigned E = Indices.size();
1856 Mask.resize(E, PoisonMaskElem);
1857 for (unsigned I = 0; I < E; ++I)
1858 Mask[Indices[I]] = I;
1859}
1860
1861/// Reorders the list of scalars in accordance with the given \p Mask.
1863 ArrayRef<int> Mask) {
1864 assert(!Mask.empty() && "Expected non-empty mask.");
1865 SmallVector<Value *> Prev(Scalars.size(),
1866 PoisonValue::get(Scalars.front()->getType()));
1867 Prev.swap(Scalars);
1868 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1869 if (Mask[I] != PoisonMaskElem)
1870 Scalars[Mask[I]] = Prev[I];
1871}
1872
1873/// Checks if the provided value does not require scheduling. It does not
1874/// require scheduling if this is not an instruction or it is an instruction
1875/// that does not read/write memory and all operands are either not instructions
1876/// or phi nodes or instructions from different blocks.
1878 auto *I = dyn_cast<Instruction>(V);
1879 if (!I)
1880 return true;
1881 return !mayHaveNonDefUseDependency(*I) &&
1882 all_of(I->operands(), [I](Value *V) {
1883 auto *IO = dyn_cast<Instruction>(V);
1884 if (!IO)
1885 return true;
1886 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1887 });
1888}
1889
1890/// Checks if the provided value does not require scheduling. It does not
1891/// require scheduling if this is not an instruction or it is an instruction
1892/// that does not read/write memory and all users are phi nodes or instructions
1893/// from the different blocks.
1894static bool isUsedOutsideBlock(Value *V) {
1895 auto *I = dyn_cast<Instruction>(V);
1896 if (!I)
1897 return true;
1898 // Limits the number of uses to save compile time.
1899 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1900 all_of(I->users(), [I](User *U) {
1901 auto *IU = dyn_cast<Instruction>(U);
1902 if (!IU)
1903 return true;
1904 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1905 });
1906}
1907
1908/// Checks if the specified value does not require scheduling. It does not
1909/// require scheduling if all operands and all users do not need to be scheduled
1910/// in the current basic block.
1913}
1914
1915/// Checks if the specified array of instructions does not require scheduling.
1916/// It is so if all either instructions have operands that do not require
1917/// scheduling or their users do not require scheduling since they are phis or
1918/// in other basic blocks.
1920 return !VL.empty() &&
1922}
1923
1924/// Returns true if widened type of \p Ty elements with size \p Sz represents
1925/// full vector type, i.e. adding extra element results in extra parts upon type
1926/// legalization.
1928 unsigned Sz) {
1929 if (Sz <= 1)
1930 return false;
1932 return false;
1933 if (has_single_bit(Sz))
1934 return true;
1935 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1936 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1937 Sz % NumParts == 0;
1938}
1939
1940/// Returns number of parts, the type \p VecTy will be split at the codegen
1941/// phase. If the type is going to be scalarized or does not uses whole
1942/// registers, returns 1.
1943static unsigned
1945 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1946 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1947 if (NumParts == 0 || NumParts >= Limit)
1948 return 1;
1949 unsigned Sz = getNumElements(VecTy);
1950 if (NumParts >= Sz || Sz % NumParts != 0 ||
1951 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1952 return 1;
1953 return NumParts;
1954}
1955
1956/// Bottom Up SLP Vectorizer.
1958 class TreeEntry;
1959 class ScheduleEntity;
1960 class ScheduleData;
1961 class ScheduleCopyableData;
1962 class ScheduleBundle;
1965
1966 /// If we decide to generate strided load / store, this struct contains all
1967 /// the necessary info. It's fields are calculated by analyzeRtStrideCandidate
1968 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1969 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1970 /// StrideVal (or value obtained from StrideSCEV) has to by multiplied by the
1971 /// size of element of FixedVectorType.
1972 struct StridedPtrInfo {
1973 Value *StrideVal = nullptr;
1974 const SCEV *StrideSCEV = nullptr;
1975 FixedVectorType *Ty = nullptr;
1976 };
1977 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1978
1979public:
1980 /// Tracks the state we can represent the loads in the given sequence.
1988
1995
1997 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1999 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
2000 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
2001 AC(AC), DB(DB), DL(DL), ORE(ORE),
2002 Builder(Se->getContext(), TargetFolder(*DL)) {
2003 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
2004 // Use the vector register size specified by the target unless overridden
2005 // by a command-line option.
2006 // TODO: It would be better to limit the vectorization factor based on
2007 // data type rather than just register size. For example, x86 AVX has
2008 // 256-bit registers, but it does not support integer operations
2009 // at that width (that requires AVX2).
2010 if (MaxVectorRegSizeOption.getNumOccurrences())
2011 MaxVecRegSize = MaxVectorRegSizeOption;
2012 else
2013 MaxVecRegSize =
2014 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2015 .getFixedValue();
2016
2017 if (MinVectorRegSizeOption.getNumOccurrences())
2018 MinVecRegSize = MinVectorRegSizeOption;
2019 else
2020 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2021 }
2022
2023 /// Vectorize the tree that starts with the elements in \p VL.
2024 /// Returns the vectorized root.
2026
2027 /// Vectorize the tree but with the list of externally used values \p
2028 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
2029 /// generated extractvalue instructions.
2030 Value *
2031 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2032 Instruction *ReductionRoot = nullptr,
2033 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2034 VectorValuesAndScales = {});
2035
2036 /// \returns the cost incurred by unwanted spills and fills, caused by
2037 /// holding live values over call sites.
2039
2040 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2041 /// final cost.
2044
2045 /// \returns the vectorization cost of the subtree that starts at \p VL.
2046 /// A negative number means that this is profitable.
2048 ArrayRef<Value *> VectorizedVals = {},
2049 InstructionCost ReductionCost = TTI::TCC_Free);
2050
2051 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2052 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2053 void buildTree(ArrayRef<Value *> Roots,
2054 const SmallDenseSet<Value *> &UserIgnoreLst);
2055
2056 /// Construct a vectorizable tree that starts at \p Roots.
2057 void buildTree(ArrayRef<Value *> Roots);
2058
2059 /// Return the scalars of the root node.
2061 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2062 return VectorizableTree.front()->Scalars;
2063 }
2064
2065 /// Returns the type/is-signed info for the root node in the graph without
2066 /// casting.
2067 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2068 const TreeEntry &Root = *VectorizableTree.front();
2069 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2070 !Root.Scalars.front()->getType()->isIntegerTy())
2071 return std::nullopt;
2072 auto It = MinBWs.find(&Root);
2073 if (It != MinBWs.end())
2074 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2075 It->second.first),
2076 It->second.second);
2077 if (Root.getOpcode() == Instruction::ZExt ||
2078 Root.getOpcode() == Instruction::SExt)
2079 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2080 Root.getOpcode() == Instruction::SExt);
2081 return std::nullopt;
2082 }
2083
2084 /// Checks if the root graph node can be emitted with narrower bitwidth at
2085 /// codegen and returns it signedness, if so.
2087 return MinBWs.at(VectorizableTree.front().get()).second;
2088 }
2089
2090 /// Returns reduction type after minbitdth analysis.
2092 if (ReductionBitWidth == 0 ||
2093 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2094 ReductionBitWidth >=
2095 DL->getTypeSizeInBits(
2096 VectorizableTree.front()->Scalars.front()->getType()))
2097 return getWidenedType(
2098 VectorizableTree.front()->Scalars.front()->getType(),
2099 VectorizableTree.front()->getVectorFactor());
2100 return getWidenedType(
2102 VectorizableTree.front()->Scalars.front()->getContext(),
2103 ReductionBitWidth),
2104 VectorizableTree.front()->getVectorFactor());
2105 }
2106
2107 /// Returns true if the tree results in one of the reduced bitcasts variants.
2109 return VectorizableTree.front()->hasState() &&
2110 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2111 VectorizableTree.front()->CombinedOp ==
2112 TreeEntry::ReducedBitcastBSwap ||
2113 VectorizableTree.front()->CombinedOp ==
2114 TreeEntry::ReducedBitcastLoads ||
2115 VectorizableTree.front()->CombinedOp ==
2116 TreeEntry::ReducedBitcastBSwapLoads) &&
2117 VectorizableTree.front()->State == TreeEntry::Vectorize;
2118 }
2119
2120 /// Returns true if the tree results in the reduced cmp bitcast root.
2122 return VectorizableTree.front()->hasState() &&
2123 VectorizableTree.front()->CombinedOp ==
2124 TreeEntry::ReducedCmpBitcast &&
2125 VectorizableTree.front()->State == TreeEntry::Vectorize;
2126 }
2127
2128 /// Builds external uses of the vectorized scalars, i.e. the list of
2129 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2130 /// ExternallyUsedValues contains additional list of external uses to handle
2131 /// vectorization of reductions.
2132 void
2133 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2134
2135 /// Transforms graph nodes to target specific representations, if profitable.
2136 void transformNodes();
2137
2138 /// Clear the internal data structures that are created by 'buildTree'.
2139 void deleteTree() {
2140 VectorizableTree.clear();
2141 ScalarToTreeEntries.clear();
2142 DeletedNodes.clear();
2143 TransformedToGatherNodes.clear();
2144 OperandsToTreeEntry.clear();
2145 ScalarsInSplitNodes.clear();
2146 MustGather.clear();
2147 NonScheduledFirst.clear();
2148 EntryToLastInstruction.clear();
2149 LastInstructionToPos.clear();
2150 LoadEntriesToVectorize.clear();
2151 IsGraphTransformMode = false;
2152 GatheredLoadsEntriesFirst.reset();
2153 CompressEntryToData.clear();
2154 ExternalUses.clear();
2155 ExternalUsesAsOriginalScalar.clear();
2156 ExternalUsesWithNonUsers.clear();
2157 for (auto &Iter : BlocksSchedules) {
2158 BlockScheduling *BS = Iter.second.get();
2159 BS->clear();
2160 }
2161 MinBWs.clear();
2162 ReductionBitWidth = 0;
2163 BaseGraphSize = 1;
2164 CastMaxMinBWSizes.reset();
2165 ExtraBitWidthNodes.clear();
2166 InstrElementSize.clear();
2167 UserIgnoreList = nullptr;
2168 PostponedGathers.clear();
2169 ValueToGatherNodes.clear();
2170 TreeEntryToStridedPtrInfoMap.clear();
2171 CurrentLoopNest.clear();
2172 }
2173
2174 unsigned getTreeSize() const { return VectorizableTree.size(); }
2175
2176 /// Returns the base graph size, before any transformations.
2177 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2178
2179 /// Perform LICM and CSE on the newly generated gather sequences.
2181
2182 /// Does this non-empty order represent an identity order? Identity
2183 /// should be represented as an empty order, so this is used to
2184 /// decide if we can canonicalize a computed order. Undef elements
2185 /// (represented as size) are ignored.
2187 assert(!Order.empty() && "expected non-empty order");
2188 const unsigned Sz = Order.size();
2189 return all_of(enumerate(Order), [&](const auto &P) {
2190 return P.value() == P.index() || P.value() == Sz;
2191 });
2192 }
2193
2194 /// Checks if the specified gather tree entry \p TE can be represented as a
2195 /// shuffled vector entry + (possibly) permutation with other gathers. It
2196 /// implements the checks only for possibly ordered scalars (Loads,
2197 /// ExtractElement, ExtractValue), which can be part of the graph.
2198 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2199 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2200 /// node might be ignored.
2201 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2202 bool TopToBottom,
2203 bool IgnoreReorder);
2204
2205 /// Sort loads into increasing pointers offsets to allow greater clustering.
2206 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2207
2208 /// Gets reordering data for the given tree entry. If the entry is vectorized
2209 /// - just return ReorderIndices, otherwise check if the scalars can be
2210 /// reordered and return the most optimal order.
2211 /// \return std::nullopt if ordering is not important, empty order, if
2212 /// identity order is important, or the actual order.
2213 /// \param TopToBottom If true, include the order of vectorized stores and
2214 /// insertelement nodes, otherwise skip them.
2215 /// \param IgnoreReorder true, if the root node order can be ignored.
2216 std::optional<OrdersType>
2217 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2218
2219 /// Checks if it is profitable to reorder the current tree.
2220 /// If the tree does not contain many profitable reordable nodes, better to
2221 /// skip it to save compile time.
2222 bool isProfitableToReorder() const;
2223
2224 /// Reorders the current graph to the most profitable order starting from the
2225 /// root node to the leaf nodes. The best order is chosen only from the nodes
2226 /// of the same size (vectorization factor). Smaller nodes are considered
2227 /// parts of subgraph with smaller VF and they are reordered independently. We
2228 /// can make it because we still need to extend smaller nodes to the wider VF
2229 /// and we can merge reordering shuffles with the widening shuffles.
2230 void reorderTopToBottom();
2231
2232 /// Reorders the current graph to the most profitable order starting from
2233 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2234 /// number of reshuffles if the leaf nodes use the same order. In this case we
2235 /// can merge the orders and just shuffle user node instead of shuffling its
2236 /// operands. Plus, even the leaf nodes have different orders, it allows to
2237 /// sink reordering in the graph closer to the root node and merge it later
2238 /// during analysis.
2239 void reorderBottomToTop(bool IgnoreReorder = false);
2240
2241 /// \return The vector element size in bits to use when vectorizing the
2242 /// expression tree ending at \p V. If V is a store, the size is the width of
2243 /// the stored value. Otherwise, the size is the width of the largest loaded
2244 /// value reaching V. This method is used by the vectorizer to calculate
2245 /// vectorization factors.
2246 unsigned getVectorElementSize(Value *V);
2247
2248 /// Compute the minimum type sizes required to represent the entries in a
2249 /// vectorizable tree.
2251
2252 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2253 unsigned getMaxVecRegSize() const {
2254 return MaxVecRegSize;
2255 }
2256
2257 // \returns minimum vector register size as set by cl::opt.
2258 unsigned getMinVecRegSize() const {
2259 return MinVecRegSize;
2260 }
2261
2262 unsigned getMinVF(unsigned Sz) const {
2263 return std::max(2U, getMinVecRegSize() / Sz);
2264 }
2265
2266 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2267 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2268 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2269 return MaxVF ? MaxVF : UINT_MAX;
2270 }
2271
2272 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2273 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2274 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2275 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2276 ///
2277 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2278 unsigned canMapToVector(Type *T) const;
2279
2280 /// \returns True if the VectorizableTree is both tiny and not fully
2281 /// vectorizable. We do not vectorize such trees.
2282 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2283
2284 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2285 /// It may happen, if all gather nodes are loads and they cannot be
2286 /// "clusterized". In this case even subgraphs cannot be vectorized more
2287 /// effectively than the base graph.
2288 bool isTreeNotExtendable() const;
2289
2290 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2291 Align Alignment, const int64_t Diff,
2292 const size_t Sz) const;
2293
2294 /// Return true if an array of scalar loads can be replaced with a strided
2295 /// load (with constant stride).
2296 ///
2297 /// It is possible that the load gets "widened". Suppose that originally each
2298 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2299 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2300 /// ...
2301 /// %b + 0 * %s + (w - 1)
2302 ///
2303 /// %b + 1 * %s + 0
2304 /// %b + 1 * %s + 1
2305 /// %b + 1 * %s + 2
2306 /// ...
2307 /// %b + 1 * %s + (w - 1)
2308 /// ...
2309 ///
2310 /// %b + (n - 1) * %s + 0
2311 /// %b + (n - 1) * %s + 1
2312 /// %b + (n - 1) * %s + 2
2313 /// ...
2314 /// %b + (n - 1) * %s + (w - 1)
2315 ///
2316 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2317 ///
2318 /// \param PointerOps list of pointer arguments of loads.
2319 /// \param ElemTy original scalar type of loads.
2320 /// \param Alignment alignment of the first load.
2321 /// \param SortedIndices is the order of PointerOps as returned by
2322 /// `sortPtrAccesses`
2323 /// \param Diff Pointer difference between the lowest and the highes pointer
2324 /// in `PointerOps` as returned by `getPointersDiff`.
2325 /// \param Ptr0 first pointer in `PointersOps`.
2326 /// \param PtrN last pointer in `PointersOps`.
2327 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2328 /// of `SPtrInfo` necessary to generate the strided load later.
2330 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2331 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2332 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2333
2334 /// Return true if an array of scalar loads can be replaced with a strided
2335 /// load (with run-time stride).
2336 /// \param PointerOps list of pointer arguments of loads.
2337 /// \param ScalarTy type of loads.
2338 /// \param CommonAlignment common alignement of loads as computed by
2339 /// `computeCommonAlignment<LoadInst>`.
2340 /// \param SortedIndicies is a list of indicies computed by this function such
2341 /// that the sequence `PointerOps[SortedIndices[0]],
2342 /// PointerOps[SortedIndicies[1]], ..., PointerOps[SortedIndices[n]]` is
2343 /// ordered by the coefficient of the stride. For example, if PointerOps is
2344 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2345 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2346 /// `0, 1, 2, 3, ...` we return empty vector for `SortedIndicies`.
2347 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2348 /// of `SPtrInfo` necessary to generate the strided load later.
2349 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2350 Align CommonAlignment,
2351 SmallVectorImpl<unsigned> &SortedIndices,
2352 StridedPtrInfo &SPtrInfo) const;
2353
2354 /// Checks if the given array of loads can be represented as a vectorized,
2355 /// scatter or just simple gather.
2356 /// \param VL list of loads.
2357 /// \param VL0 main load value.
2358 /// \param Order returned order of load instructions.
2359 /// \param PointerOps returned list of pointer operands.
2360 /// \param BestVF return best vector factor, if recursive check found better
2361 /// vectorization sequences rather than masked gather.
2362 /// \param TryRecursiveCheck used to check if long masked gather can be
2363 /// represented as a serie of loads/insert subvector, if profitable.
2366 SmallVectorImpl<Value *> &PointerOps,
2367 StridedPtrInfo &SPtrInfo,
2368 unsigned *BestVF = nullptr,
2369 bool TryRecursiveCheck = true) const;
2370
2371 /// Registers non-vectorizable sequence of loads
2372 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2373 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2374 }
2375
2376 /// Checks if the given loads sequence is known as not vectorizable
2377 template <typename T>
2379 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2380 }
2381
2383
2384 /// This structure holds any data we need about the edges being traversed
2385 /// during buildTreeRec(). We keep track of:
2386 /// (i) the user TreeEntry index, and
2387 /// (ii) the index of the edge.
2388 struct EdgeInfo {
2389 EdgeInfo() = default;
2390 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2392 /// The user TreeEntry.
2393 TreeEntry *UserTE = nullptr;
2394 /// The operand index of the use.
2395 unsigned EdgeIdx = UINT_MAX;
2396#ifndef NDEBUG
2398 const BoUpSLP::EdgeInfo &EI) {
2399 EI.dump(OS);
2400 return OS;
2401 }
2402 /// Debug print.
2403 void dump(raw_ostream &OS) const {
2404 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2405 << " EdgeIdx:" << EdgeIdx << "}";
2406 }
2407 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2408#endif
2409 bool operator == (const EdgeInfo &Other) const {
2410 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2411 }
2412
2413 operator bool() const { return UserTE != nullptr; }
2414 };
2415 friend struct DenseMapInfo<EdgeInfo>;
2416
2417 /// A helper class used for scoring candidates for two consecutive lanes.
2419 const TargetLibraryInfo &TLI;
2420 const DataLayout &DL;
2421 ScalarEvolution &SE;
2422 const BoUpSLP &R;
2423 int NumLanes; // Total number of lanes (aka vectorization factor).
2424 int MaxLevel; // The maximum recursion depth for accumulating score.
2425
2426 public:
2428 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2429 int MaxLevel)
2430 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2431 MaxLevel(MaxLevel) {}
2432
2433 // The hard-coded scores listed here are not very important, though it shall
2434 // be higher for better matches to improve the resulting cost. When
2435 // computing the scores of matching one sub-tree with another, we are
2436 // basically counting the number of values that are matching. So even if all
2437 // scores are set to 1, we would still get a decent matching result.
2438 // However, sometimes we have to break ties. For example we may have to
2439 // choose between matching loads vs matching opcodes. This is what these
2440 // scores are helping us with: they provide the order of preference. Also,
2441 // this is important if the scalar is externally used or used in another
2442 // tree entry node in the different lane.
2443
2444 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2445 static const int ScoreConsecutiveLoads = 4;
2446 /// The same load multiple times. This should have a better score than
2447 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
2448 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2449 /// a vector load and 1.0 for a broadcast.
2450 static const int ScoreSplatLoads = 3;
2451 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2452 static const int ScoreReversedLoads = 3;
2453 /// A load candidate for masked gather.
2454 static const int ScoreMaskedGatherCandidate = 1;
2455 /// ExtractElementInst from same vector and consecutive indexes.
2456 static const int ScoreConsecutiveExtracts = 4;
2457 /// ExtractElementInst from same vector and reversed indices.
2458 static const int ScoreReversedExtracts = 3;
2459 /// Constants.
2460 static const int ScoreConstants = 2;
2461 /// Instructions with the same opcode.
2462 static const int ScoreSameOpcode = 2;
2463 /// Instructions with alt opcodes (e.g, add + sub).
2464 static const int ScoreAltOpcodes = 1;
2465 /// Identical instructions (a.k.a. splat or broadcast).
2466 static const int ScoreSplat = 1;
2467 /// Matching with an undef is preferable to failing.
2468 static const int ScoreUndef = 1;
2469 /// Score for failing to find a decent match.
2470 static const int ScoreFail = 0;
2471 /// Score if all users are vectorized.
2472 static const int ScoreAllUserVectorized = 1;
2473
2474 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2475 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2476 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2477 /// MainAltOps.
2479 ArrayRef<Value *> MainAltOps) const {
2480 if (!isValidElementType(V1->getType()) ||
2483
2484 if (V1 == V2) {
2485 if (isa<LoadInst>(V1)) {
2486 // Retruns true if the users of V1 and V2 won't need to be extracted.
2487 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2488 // Bail out if we have too many uses to save compilation time.
2489 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2490 return false;
2491
2492 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2493 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2494 return U == U1 || U == U2 || R.isVectorized(U);
2495 });
2496 };
2497 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2498 };
2499 // A broadcast of a load can be cheaper on some targets.
2500 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2501 ElementCount::getFixed(NumLanes)) &&
2502 ((int)V1->getNumUses() == NumLanes ||
2503 AllUsersAreInternal(V1, V2)))
2505 }
2507 }
2508
2509 auto CheckSameEntryOrFail = [&]() {
2510 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2512 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2513 !TEs2.empty() &&
2514 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2516 }
2518 };
2519
2520 auto *LI1 = dyn_cast<LoadInst>(V1);
2521 auto *LI2 = dyn_cast<LoadInst>(V2);
2522 if (LI1 && LI2) {
2523 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2524 !LI2->isSimple())
2525 return CheckSameEntryOrFail();
2526
2527 std::optional<int64_t> Dist = getPointersDiff(
2528 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2529 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2530 if (!Dist || *Dist == 0) {
2531 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2532 getUnderlyingObject(LI2->getPointerOperand()) &&
2533 R.TTI->isLegalMaskedGather(
2534 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2536 return CheckSameEntryOrFail();
2537 }
2538 // The distance is too large - still may be profitable to use masked
2539 // loads/gathers.
2540 if (std::abs(*Dist) > NumLanes / 2)
2542 // This still will detect consecutive loads, but we might have "holes"
2543 // in some cases. It is ok for non-power-2 vectorization and may produce
2544 // better results. It should not affect current vectorization.
2547 }
2548
2549 auto *C1 = dyn_cast<Constant>(V1);
2550 auto *C2 = dyn_cast<Constant>(V2);
2551 if (C1 && C2)
2553
2554 // Consider constants and buildvector compatible.
2555 if ((C1 && isa<InsertElementInst>(V2)) ||
2556 (C2 && isa<InsertElementInst>(V1)))
2558
2559 // Extracts from consecutive indexes of the same vector better score as
2560 // the extracts could be optimized away.
2561 Value *EV1;
2562 ConstantInt *Ex1Idx;
2563 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2564 // Undefs are always profitable for extractelements.
2565 // Compiler can easily combine poison and extractelement <non-poison> or
2566 // undef and extractelement <poison>. But combining undef +
2567 // extractelement <non-poison-but-may-produce-poison> requires some
2568 // extra operations.
2569 if (isa<UndefValue>(V2))
2570 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2573 Value *EV2 = nullptr;
2574 ConstantInt *Ex2Idx = nullptr;
2575 if (match(V2,
2577 m_Undef())))) {
2578 // Undefs are always profitable for extractelements.
2579 if (!Ex2Idx)
2581 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2583 if (EV2 == EV1) {
2584 int Idx1 = Ex1Idx->getZExtValue();
2585 int Idx2 = Ex2Idx->getZExtValue();
2586 int Dist = Idx2 - Idx1;
2587 // The distance is too large - still may be profitable to use
2588 // shuffles.
2589 if (std::abs(Dist) == 0)
2591 if (std::abs(Dist) > NumLanes / 2)
2595 }
2597 }
2598 return CheckSameEntryOrFail();
2599 }
2600
2601 auto *I1 = dyn_cast<Instruction>(V1);
2602 auto *I2 = dyn_cast<Instruction>(V2);
2603 if (I1 && I2) {
2604 if (I1->getParent() != I2->getParent())
2605 return CheckSameEntryOrFail();
2606 Value *V;
2607 Value *Cond;
2608 // ZExt i1 to something must be considered same opcode for select i1
2609 // cmp, x, y
2610 // Required to better match the transformation after
2611 // BoUpSLP::matchesInversedZExtSelect analysis.
2612 if ((match(I1, m_ZExt(m_Value(V))) &&
2613 match(I2, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
2614 V->getType() == Cond->getType()) ||
2615 (match(I2, m_ZExt(m_Value(V))) &&
2616 match(I1, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
2617 V->getType() == Cond->getType()))
2619 SmallVector<Value *, 4> Ops(MainAltOps);
2620 Ops.push_back(I1);
2621 Ops.push_back(I2);
2622 InstructionsState S = getSameOpcode(Ops, TLI);
2623 // Note: Only consider instructions with <= 2 operands to avoid
2624 // complexity explosion.
2625 if (S &&
2626 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2627 !S.isAltShuffle()) &&
2628 all_of(Ops, [&S](Value *V) {
2629 return isa<PoisonValue>(V) ||
2630 cast<Instruction>(V)->getNumOperands() ==
2631 S.getMainOp()->getNumOperands();
2632 }))
2633 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2635 }
2636
2637 if (I1 && isa<PoisonValue>(V2))
2639
2640 if (isa<UndefValue>(V2))
2642
2643 return CheckSameEntryOrFail();
2644 }
2645
2646 /// Go through the operands of \p LHS and \p RHS recursively until
2647 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are
2648 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2649 /// of \p U1 and \p U2), except at the beginning of the recursion where
2650 /// these are set to nullptr.
2651 ///
2652 /// For example:
2653 /// \verbatim
2654 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2655 /// \ / \ / \ / \ /
2656 /// + + + +
2657 /// G1 G2 G3 G4
2658 /// \endverbatim
2659 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2660 /// each level recursively, accumulating the score. It starts from matching
2661 /// the additions at level 0, then moves on to the loads (level 1). The
2662 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2663 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2664 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2665 /// Please note that the order of the operands does not matter, as we
2666 /// evaluate the score of all profitable combinations of operands. In
2667 /// other words the score of G1 and G4 is the same as G1 and G2. This
2668 /// heuristic is based on ideas described in:
2669 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2670 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2671 /// Luís F. W. Góes
2673 Instruction *U2, int CurrLevel,
2674 ArrayRef<Value *> MainAltOps) const {
2675
2676 // Get the shallow score of V1 and V2.
2677 int ShallowScoreAtThisLevel =
2678 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2679
2680 // If reached MaxLevel,
2681 // or if V1 and V2 are not instructions,
2682 // or if they are SPLAT,
2683 // or if they are not consecutive,
2684 // or if profitable to vectorize loads or extractelements, early return
2685 // the current cost.
2686 auto *I1 = dyn_cast<Instruction>(LHS);
2687 auto *I2 = dyn_cast<Instruction>(RHS);
2688 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2689 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2690 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2691 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2693 ShallowScoreAtThisLevel))
2694 return ShallowScoreAtThisLevel;
2695 assert(I1 && I2 && "Should have early exited.");
2696
2697 // Contains the I2 operand indexes that got matched with I1 operands.
2698 SmallSet<unsigned, 4> Op2Used;
2699
2700 // Recursion towards the operands of I1 and I2. We are trying all possible
2701 // operand pairs, and keeping track of the best score.
2702 if (I1->getNumOperands() != I2->getNumOperands())
2704 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2705 OpIdx1 != NumOperands1; ++OpIdx1) {
2706 // Try to pair op1I with the best operand of I2.
2707 int MaxTmpScore = 0;
2708 unsigned MaxOpIdx2 = 0;
2709 bool FoundBest = false;
2710 // If I2 is commutative try all combinations.
2711 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2712 unsigned ToIdx = isCommutative(I2)
2713 ? I2->getNumOperands()
2714 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2715 assert(FromIdx <= ToIdx && "Bad index");
2716 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2717 // Skip operands already paired with OpIdx1.
2718 if (Op2Used.count(OpIdx2))
2719 continue;
2720 // Recursively calculate the cost at each level
2721 int TmpScore =
2722 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2723 I1, I2, CurrLevel + 1, {});
2724 // Look for the best score.
2725 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2726 TmpScore > MaxTmpScore) {
2727 MaxTmpScore = TmpScore;
2728 MaxOpIdx2 = OpIdx2;
2729 FoundBest = true;
2730 }
2731 }
2732 if (FoundBest) {
2733 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2734 Op2Used.insert(MaxOpIdx2);
2735 ShallowScoreAtThisLevel += MaxTmpScore;
2736 }
2737 }
2738 return ShallowScoreAtThisLevel;
2739 }
2740 };
2741 /// A helper data structure to hold the operands of a vector of instructions.
2742 /// This supports a fixed vector length for all operand vectors.
2744 /// For each operand we need (i) the value, and (ii) the opcode that it
2745 /// would be attached to if the expression was in a left-linearized form.
2746 /// This is required to avoid illegal operand reordering.
2747 /// For example:
2748 /// \verbatim
2749 /// 0 Op1
2750 /// |/
2751 /// Op1 Op2 Linearized + Op2
2752 /// \ / ----------> |/
2753 /// - -
2754 ///
2755 /// Op1 - Op2 (0 + Op1) - Op2
2756 /// \endverbatim
2757 ///
2758 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2759 ///
2760 /// Another way to think of this is to track all the operations across the
2761 /// path from the operand all the way to the root of the tree and to
2762 /// calculate the operation that corresponds to this path. For example, the
2763 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2764 /// corresponding operation is a '-' (which matches the one in the
2765 /// linearized tree, as shown above).
2766 ///
2767 /// For lack of a better term, we refer to this operation as Accumulated
2768 /// Path Operation (APO).
/// Holds a single operand slot of the matrix: the scalar value plus the
/// bookkeeping bits needed by the operand-reordering algorithm.
struct OperandData {
  OperandData() = default;
  OperandData(Value *V, bool APO, bool IsUsed)
      : V(V), APO(APO), IsUsed(IsUsed) {}
  /// The operand value.
  Value *V = nullptr;
  /// TreeEntries only allow a single opcode, or an alternate sequence of
  /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
  /// APO. It is set to 'true' if 'V' is attached to an inverse operation
  /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
  /// (e.g., Add/Mul)
  bool APO = false;
  /// Helper data for the reordering function. Set while a lane is being
  /// reordered to mark this slot as already claimed; reset for every entry
  /// by clearUsed().
  bool IsUsed = false;
};
2784
2785 /// During operand reordering, we are trying to select the operand at lane
2786 /// that matches best with the operand at the neighboring lane. Our
2787 /// selection is based on the type of value we are looking for. For example,
2788 /// if the neighboring lane has a load, we need to look for a load that is
2789 /// accessing a consecutive address. These strategies are summarized in the
2790 /// 'ReorderingMode' enumerator.
2791 enum class ReorderingMode {
2792 Load, ///< Matching loads to consecutive memory addresses
2793 Opcode, ///< Matching instructions based on opcode (same or alternate)
2794 Constant, ///< Matching constants
2795 Splat, ///< Matching the same instruction multiple times (broadcast)
2796 Failed, ///< We failed to create a vectorizable group
2797 };
2798
2799 using OperandDataVec = SmallVector<OperandData, 2>;
2800
2801 /// A vector of operand vectors.
2803 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2804 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2805 unsigned ArgSize = 0;
2806
2807 const TargetLibraryInfo &TLI;
2808 const DataLayout &DL;
2809 ScalarEvolution &SE;
2810 const BoUpSLP &R;
2811 const Loop *L = nullptr;
2812
2813 /// \returns the operand data at \p OpIdx and \p Lane.
2814 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2815 return OpsVec[OpIdx][Lane];
2816 }
2817
2818 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2819 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2820 return OpsVec[OpIdx][Lane];
2821 }
2822
2823 /// Clears the used flag for all entries.
2824 void clearUsed() {
2825 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2826 OpIdx != NumOperands; ++OpIdx)
2827 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2828 ++Lane)
2829 OpsVec[OpIdx][Lane].IsUsed = false;
2830 }
2831
2832 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2833 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2834 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2835 }
2836
2837 /// \param Lane lane of the operands under analysis.
2838 /// \param OpIdx operand index in \p Lane lane we're looking the best
2839 /// candidate for.
2840 /// \param Idx operand index of the current candidate value.
2841 /// \returns The additional score due to possible broadcasting of the
2842 /// elements in the lane. It is more profitable to have power-of-2 unique
2843 /// elements in the lane, it will be vectorized with higher probability
2844 /// after removing duplicates. Currently the SLP vectorizer supports only
2845 /// vectorization of the power-of-2 number of unique scalars.
2846 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2847 const SmallBitVector &UsedLanes) const {
2848 Value *IdxLaneV = getData(Idx, Lane).V;
2849 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2850 isa<ExtractElementInst>(IdxLaneV))
2851 return 0;
2853 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2854 if (Ln == Lane)
2855 continue;
2856 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2857 if (!isa<Instruction>(OpIdxLnV))
2858 return 0;
2859 Uniques.try_emplace(OpIdxLnV, Ln);
2860 }
2861 unsigned UniquesCount = Uniques.size();
2862 auto IdxIt = Uniques.find(IdxLaneV);
2863 unsigned UniquesCntWithIdxLaneV =
2864 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2865 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2866 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2867 unsigned UniquesCntWithOpIdxLaneV =
2868 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2869 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2870 return 0;
2871 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2872 UniquesCntWithOpIdxLaneV,
2873 UniquesCntWithOpIdxLaneV -
2874 bit_floor(UniquesCntWithOpIdxLaneV)) -
2875 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2876 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2877 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2878 }
2879
2880 /// \param Lane lane of the operands under analysis.
2881 /// \param OpIdx operand index in \p Lane lane we're looking the best
2882 /// candidate for.
2883 /// \param Idx operand index of the current candidate value.
2884 /// \returns The additional score for the scalar which users are all
2885 /// vectorized.
2886 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2887 Value *IdxLaneV = getData(Idx, Lane).V;
2888 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2889 // Do not care about number of uses for vector-like instructions
2890 // (extractelement/extractvalue with constant indices), they are extracts
2891 // themselves and already externally used. Vectorization of such
2892 // instructions does not add extra extractelement instruction, just may
2893 // remove it.
2894 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2895 isVectorLikeInstWithConstOps(OpIdxLaneV))
2897 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2898 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2899 return 0;
2900 return R.areAllUsersVectorized(IdxLaneI)
2902 : 0;
2903 }
2904
2905 /// Score scaling factor for fully compatible instructions but with
2906 /// different number of external uses. Allows better selection of the
2907 /// instructions with less external uses.
2908 static const int ScoreScaleFactor = 10;
2909
2910 /// \Returns the look-ahead score, which tells us how much the sub-trees
2911 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2912 /// score. This helps break ties in an informed way when we cannot decide on
2913 /// the order of the operands by just considering the immediate
2914 /// predecessors.
2915 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2916 int Lane, unsigned OpIdx, unsigned Idx,
2917 bool &IsUsed, const SmallBitVector &UsedLanes) {
2918 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2920 // Keep track of the instruction stack as we recurse into the operands
2921 // during the look-ahead score exploration.
2922 int Score =
2923 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2924 /*CurrLevel=*/1, MainAltOps);
2925 if (Score) {
2926 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2927 if (Score <= -SplatScore) {
2928 // Failed score.
2929 Score = 0;
2930 } else {
2931 Score += SplatScore;
2932 // Scale score to see the difference between different operands
2933 // and similar operands but all vectorized/not all vectorized
2934 // uses. It does not affect actual selection of the best
2935 // compatible operand in general, just allows to select the
2936 // operand with all vectorized uses.
2937 Score *= ScoreScaleFactor;
2938 Score += getExternalUseScore(Lane, OpIdx, Idx);
2939 IsUsed = true;
2940 }
2941 }
2942 return Score;
2943 }
2944
2945 /// Best defined scores per lanes between the passes. Used to choose the
2946 /// best operand (with the highest score) between the passes.
2947 /// The key - {Operand Index, Lane}.
2948 /// The value - the best score between the passes for the lane and the
2949 /// operand.
2951 BestScoresPerLanes;
2952
2953 // Search all operands in Ops[*][Lane] for the one that matches best
2954 // Ops[OpIdx][LastLane] and return its opreand index.
2955 // If no good match can be found, return std::nullopt.
2956 std::optional<unsigned>
2957 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2958 ArrayRef<ReorderingMode> ReorderingModes,
2959 ArrayRef<Value *> MainAltOps,
2960 const SmallBitVector &UsedLanes) {
2961 unsigned NumOperands = getNumOperands();
2962
2963 // The operand of the previous lane at OpIdx.
2964 Value *OpLastLane = getData(OpIdx, LastLane).V;
2965
2966 // Our strategy mode for OpIdx.
2967 ReorderingMode RMode = ReorderingModes[OpIdx];
2968 if (RMode == ReorderingMode::Failed)
2969 return std::nullopt;
2970
2971 // The linearized opcode of the operand at OpIdx, Lane.
2972 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2973
2974 // The best operand index and its score.
2975 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2976 // are using the score to differentiate between the two.
2977 struct BestOpData {
2978 std::optional<unsigned> Idx;
2979 unsigned Score = 0;
2980 } BestOp;
2981 BestOp.Score =
2982 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2983 .first->second;
2984
2985 // Track if the operand must be marked as used. If the operand is set to
2986 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2987 // want to reestimate the operands again on the following iterations).
2988 bool IsUsed = RMode == ReorderingMode::Splat ||
2989 RMode == ReorderingMode::Constant ||
2990 RMode == ReorderingMode::Load;
2991 // Iterate through all unused operands and look for the best.
2992 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2993 // Get the operand at Idx and Lane.
2994 OperandData &OpData = getData(Idx, Lane);
2995 Value *Op = OpData.V;
2996 bool OpAPO = OpData.APO;
2997
2998 // Skip already selected operands.
2999 if (OpData.IsUsed)
3000 continue;
3001
3002 // Skip if we are trying to move the operand to a position with a
3003 // different opcode in the linearized tree form. This would break the
3004 // semantics.
3005 if (OpAPO != OpIdxAPO)
3006 continue;
3007
3008 // Look for an operand that matches the current mode.
3009 switch (RMode) {
3010 case ReorderingMode::Load:
3011 case ReorderingMode::Opcode: {
3012 bool LeftToRight = Lane > LastLane;
3013 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
3014 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
3015 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3016 OpIdx, Idx, IsUsed, UsedLanes);
3017 if (Score > static_cast<int>(BestOp.Score) ||
3018 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
3019 Idx == OpIdx)) {
3020 BestOp.Idx = Idx;
3021 BestOp.Score = Score;
3022 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
3023 }
3024 break;
3025 }
3026 case ReorderingMode::Constant:
3027 if (isa<Constant>(Op) ||
3028 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
3029 BestOp.Idx = Idx;
3030 if (isa<Constant>(Op)) {
3032 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3034 }
3036 IsUsed = false;
3037 }
3038 break;
3039 case ReorderingMode::Splat:
3040 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3041 IsUsed = Op == OpLastLane;
3042 if (Op == OpLastLane) {
3043 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3044 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3046 }
3047 BestOp.Idx = Idx;
3048 }
3049 break;
3050 case ReorderingMode::Failed:
3051 llvm_unreachable("Not expected Failed reordering mode.");
3052 }
3053 }
3054
3055 if (BestOp.Idx) {
3056 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3057 return BestOp.Idx;
3058 }
3059 // If we could not find a good match return std::nullopt.
3060 return std::nullopt;
3061 }
3062
3063 /// Helper for reorderOperandVecs.
3064 /// \returns the lane that we should start reordering from. This is the one
3065 /// which has the least number of operands that can freely move about or
3066 /// less profitable because it already has the most optimal set of operands.
3067 unsigned getBestLaneToStartReordering() const {
3068 unsigned Min = UINT_MAX;
3069 unsigned SameOpNumber = 0;
3070 // std::pair<unsigned, unsigned> is used to implement a simple voting
3071 // algorithm and choose the lane with the least number of operands that
3072 // can freely move about or less profitable because it already has the
3073 // most optimal set of operands. The first unsigned is a counter for
3074 // voting, the second unsigned is the counter of lanes with instructions
3075 // with same/alternate opcodes and same parent basic block.
3077 // Try to be closer to the original results, if we have multiple lanes
3078 // with same cost. If 2 lanes have the same cost, use the one with the
3079 // highest index.
3080 for (int I = getNumLanes(); I > 0; --I) {
3081 unsigned Lane = I - 1;
3082 OperandsOrderData NumFreeOpsHash =
3083 getMaxNumOperandsThatCanBeReordered(Lane);
3084 // Compare the number of operands that can move and choose the one with
3085 // the least number.
3086 if (NumFreeOpsHash.NumOfAPOs < Min) {
3087 Min = NumFreeOpsHash.NumOfAPOs;
3088 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3089 HashMap.clear();
3090 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3091 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3092 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3093 // Select the most optimal lane in terms of number of operands that
3094 // should be moved around.
3095 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3096 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3097 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3098 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3099 auto [It, Inserted] =
3100 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3101 if (!Inserted)
3102 ++It->second.first;
3103 }
3104 }
3105 // Select the lane with the minimum counter.
3106 unsigned BestLane = 0;
3107 unsigned CntMin = UINT_MAX;
3108 for (const auto &Data : reverse(HashMap)) {
3109 if (Data.second.first < CntMin) {
3110 CntMin = Data.second.first;
3111 BestLane = Data.second.second;
3112 }
3113 }
3114 return BestLane;
3115 }
3116
/// Data structure that helps to reorder operands.
struct OperandsOrderData {
  /// The best number of operands with the same APOs, which can be
  /// reordered.
  /// NOTE: a value-initialized instance (NumOfAPOs == UINT_MAX) is what
  /// getMaxNumOperandsThatCanBeReordered() returns for an all-undef lane,
  /// so UINT_MAX doubles as the "no usable data" sentinel.
  unsigned NumOfAPOs = UINT_MAX;
  /// Number of operands with the same/alternate instruction opcode and
  /// parent.
  unsigned NumOpsWithSameOpcodeParent = 0;
  /// Hash for the actual operands ordering.
  /// Used to count operands, actually their position id and opcode
  /// value. It is used in the voting mechanism to find the lane with the
  /// least number of operands that can freely move about or less profitable
  /// because it already has the most optimal set of operands. Can be
  /// replaced with SmallVector<unsigned> instead but hash code is faster
  /// and requires less memory.
  unsigned Hash = 0;
};
3134 /// \returns the maximum number of operands that are allowed to be reordered
3135 /// for \p Lane and the number of compatible instructions(with the same
3136 /// parent/opcode). This is used as a heuristic for selecting the first lane
3137 /// to start operand reordering.
3138 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3139 unsigned CntTrue = 0;
3140 unsigned NumOperands = getNumOperands();
3141 // Operands with the same APO can be reordered. We therefore need to count
3142 // how many of them we have for each APO, like this: Cnt[APO] = x.
3143 // Since we only have two APOs, namely true and false, we can avoid using
3144 // a map. Instead we can simply count the number of operands that
3145 // correspond to one of them (in this case the 'true' APO), and calculate
3146 // the other by subtracting it from the total number of operands.
3147 // Operands with the same instruction opcode and parent are more
3148 // profitable since we don't need to move them in many cases, with a high
3149 // probability such lane already can be vectorized effectively.
3150 bool AllUndefs = true;
3151 unsigned NumOpsWithSameOpcodeParent = 0;
3152 Instruction *OpcodeI = nullptr;
3153 BasicBlock *Parent = nullptr;
3154 unsigned Hash = 0;
3155 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3156 const OperandData &OpData = getData(OpIdx, Lane);
3157 if (OpData.APO)
3158 ++CntTrue;
3159 // Use Boyer-Moore majority voting for finding the majority opcode and
3160 // the number of times it occurs.
3161 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3162 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3163 I->getParent() != Parent) {
3164 if (NumOpsWithSameOpcodeParent == 0) {
3165 NumOpsWithSameOpcodeParent = 1;
3166 OpcodeI = I;
3167 Parent = I->getParent();
3168 } else {
3169 --NumOpsWithSameOpcodeParent;
3170 }
3171 } else {
3172 ++NumOpsWithSameOpcodeParent;
3173 }
3174 }
3175 Hash = hash_combine(
3176 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3177 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3178 }
3179 if (AllUndefs)
3180 return {};
3181 OperandsOrderData Data;
3182 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3183 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3184 Data.Hash = Hash;
3185 return Data;
3186 }
3187
3188 /// Go through the instructions in VL and append their operands.
3189 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3190 const InstructionsState &S) {
3191 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3192 assert((empty() || all_of(Operands,
3193 [this](const ValueList &VL) {
3194 return VL.size() == getNumLanes();
3195 })) &&
3196 "Expected same number of lanes");
3197 assert(S.valid() && "InstructionsState is invalid.");
3198 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3199 // arguments to the intrinsic produces the same result.
3200 Instruction *MainOp = S.getMainOp();
3201 unsigned NumOperands = MainOp->getNumOperands();
3203 OpsVec.resize(ArgSize);
3204 unsigned NumLanes = VL.size();
3205 for (OperandDataVec &Ops : OpsVec)
3206 Ops.resize(NumLanes);
3207 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3208 // Our tree has just 3 nodes: the root and two operands.
3209 // It is therefore trivial to get the APO. We only need to check the
3210 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3211 // operand. The LHS operand of both add and sub is never attached to an
3212 // inversese operation in the linearized form, therefore its APO is
3213 // false. The RHS is true only if V is an inverse operation.
3214
3215 // Since operand reordering is performed on groups of commutative
3216 // operations or alternating sequences (e.g., +, -), we can safely tell
3217 // the inverse operations by checking commutativity.
3218 auto *I = dyn_cast<Instruction>(VL[Lane]);
3219 if (!I && isa<PoisonValue>(VL[Lane])) {
3220 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3221 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3222 continue;
3223 }
3224 bool IsInverseOperation = false;
3225 if (S.isCopyableElement(VL[Lane])) {
3226 // The value is a copyable element.
3227 IsInverseOperation =
3228 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3229 } else {
3230 assert(I && "Expected instruction");
3231 auto [SelectedOp, Ops] = convertTo(I, S);
3232 // We cannot check commutativity by the converted instruction
3233 // (SelectedOp) because isCommutative also examines def-use
3234 // relationships.
3235 IsInverseOperation = !isCommutative(SelectedOp, I);
3236 }
3237 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3238 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3239 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3240 }
3241 }
3242 }
3243
3244 /// \returns the number of operands.
3245 unsigned getNumOperands() const { return ArgSize; }
3246
3247 /// \returns the number of lanes.
3248 unsigned getNumLanes() const { return OpsVec[0].size(); }
3249
3250 /// \returns the operand value at \p OpIdx and \p Lane.
3251 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3252 return getData(OpIdx, Lane).V;
3253 }
3254
3255 /// \returns true if the data structure is empty.
3256 bool empty() const { return OpsVec.empty(); }
3257
3258 /// Clears the data.
3259 void clear() { OpsVec.clear(); }
3260
/// \Returns true if there are enough operands identical to \p Op to fill
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
  assert(Op == getValue(OpIdx, Lane) &&
         "Op is expected to be getValue(OpIdx, Lane).");
  // Small number of loads - try load matching.
  if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
    return false;
  bool OpAPO = getData(OpIdx, Lane).APO;
  bool IsInvariant = L && L->isLoopInvariant(Op);
  // Number of other lanes in which exactly \p Op was found.
  unsigned Cnt = 0;
  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
    if (Ln == Lane)
      continue;
    // This is set to true if we found a candidate for broadcast at Lane.
    bool FoundCandidate = false;
    for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
      OperandData &Data = getData(OpI, Ln);
      // Only slots with the same linearized opcode (APO) that were not
      // already claimed by a previous lane can participate.
      if (Data.APO != OpAPO || Data.IsUsed)
        continue;
      // Note: OpILane is taken from lane \p Lane (the broadcast lane),
      // not from the lane Ln currently being scanned.
      Value *OpILane = getValue(OpI, Lane);
      bool IsConstantOp = isa<Constant>(OpILane);
      // Consider the broadcast candidate if:
      // 1. Same value is found in one of the operands.
      if (Data.V == Op ||
          // 2. The operand in the given lane is not constant but there is a
          // constant operand in another lane (which can be moved to the
          // given lane). In this case we can represent it as a simple
          // permutation of constant and broadcast.
          (!IsConstantOp &&
           ((Lns > 2 && isa<Constant>(Data.V)) ||
            // 2.1. If we have only 2 lanes, need to check that value in the
            // next lane does not build same opcode sequence.
            (Lns == 2 &&
             !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
             isa<Constant>(Data.V)))) ||
          // 3. The operand in the current lane is loop invariant (can be
          // hoisted out) and another operand is also a loop invariant
          // (though not a constant). In this case the whole vector can be
          // hoisted out.
          // FIXME: need to teach the cost model about this case for better
          // estimation.
          (IsInvariant && !isa<Constant>(Data.V) &&
           !getSameOpcode({Op, Data.V}, TLI) &&
           L->isLoopInvariant(Data.V))) {
        FoundCandidate = true;
        // Claim the slot only on an exact value match; constant/invariant
        // matches stay available for subsequent lanes.
        Data.IsUsed = Data.V == Op;
        if (Data.V == Op)
          ++Cnt;
        break;
      }
    }
    // A lane with no usable slot means broadcasting cannot fill the
    // whole vector.
    if (!FoundCandidate)
      return false;
  }
  // With 2 lanes a single extra occurrence is enough; otherwise require
  // the exact value to repeat in more than one other lane.
  return getNumLanes() == 2 || Cnt > 1;
}
3319
3320 /// Checks if there is at least single compatible operand in lanes other
3321 /// than \p Lane, compatible with the operand \p Op.
3322 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3323 assert(Op == getValue(OpIdx, Lane) &&
3324 "Op is expected to be getValue(OpIdx, Lane).");
3325 bool OpAPO = getData(OpIdx, Lane).APO;
3326 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3327 if (Ln == Lane)
3328 continue;
3329 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3330 const OperandData &Data = getData(OpI, Ln);
3331 if (Data.APO != OpAPO || Data.IsUsed)
3332 return true;
3333 Value *OpILn = getValue(OpI, Ln);
3334 return (L && L->isLoopInvariant(OpILn)) ||
3335 (getSameOpcode({Op, OpILn}, TLI) &&
3336 allSameBlock({Op, OpILn}));
3337 }))
3338 return true;
3339 }
3340 return false;
3341 }
3342
3343 public:
3344 /// Initialize with all the operands of the instruction vector \p RootVL.
3346 const InstructionsState &S, const BoUpSLP &R)
3347 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3348 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3349 // Append all the operands of RootVL.
3350 appendOperands(RootVL, Operands, S);
3351 }
3352
3353 /// \Returns a value vector with the operands across all lanes for the
3354 /// opearnd at \p OpIdx.
3355 ValueList getVL(unsigned OpIdx) const {
3356 ValueList OpVL(OpsVec[OpIdx].size());
3357 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3358 "Expected same num of lanes across all operands");
3359 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3360 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3361 return OpVL;
3362 }
3363
3364 // Performs operand reordering for 2 or more operands.
3365 // The original operands are in OrigOps[OpIdx][Lane].
3366 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us select
      // the instructions for each lane, so that they match best with the ones
      // we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the lane
      // that has operands that can move the least. For example, given the
      // following lanes:
      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
      // we will start at Lane 1, since the operands of the subtraction cannot
      // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes based on the kind of value seen in FirstLane.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // Check that we don't have same operands. No need to reorder if operands
      // are just perfect diamond or shuffled diamond match. Do not do it only
      // for possible broadcasts or non-power of 2 number of scalars (just for
      // now).
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        // Every remaining operand must draw only from operand 0's values for
        // reordering to be skippable.
        // NOTE(review): a "for (ArrayRef<OperandData> Op :" loop header
        // appears to have been lost from this text in extraction - verify
        // against upstream before relying on this snippet.
        ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 &&
               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                        UniqueValues.size());
      };

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering for
      // the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if no need to reorder operands since they are perfect or
        // shuffled diamond match.
        // Need to do it to avoid extra external use cost counting for
        // shuffled matches, which may cause regressions.
        if (SkipReordering())
          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder the
        // rest of the lanes. We are visiting the nodes in a circular fashion,
        // using FirstLane as the center point and increasing the radius
        // distance.
        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
        // Seed each operand's main/alternate opcode tracking with the
        // FirstLane value.
        for (unsigned I = 0; I < NumOperands; ++I)
          MainAltOps[I].push_back(getData(I, FirstLane).V);

        SmallBitVector UsedLanes(NumLanes);
        UsedLanes.set(FirstLane);
        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            UsedLanes.set(Lane);
            // Match against the adjacent already-visited lane in this
            // direction.
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that matches SortedOps[OpIdx][Lane-1].
              std::optional<unsigned> BestIdx =
                  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                                 MainAltOps[OpIdx], UsedLanes);
              // By not selecting a value, we allow the operands that follow to
              // select a better matching value. We will get a non-null value in
              // the next run of getBestOperand().
              if (BestIdx) {
                // Swap the current operand with the one returned by
                // getBestOperand().
                swap(OpIdx, *BestIdx, Lane);
              } else {
                // Enable the second pass.
                StrategyFailed = true;
              }
              // Try to get the alternate opcode and follow it during analysis.
              if (MainAltOps[OpIdx].size() != 2) {
                OperandData &AltOp = getData(OpIdx, Lane);
                InstructionsState OpS =
                    getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
                if (OpS && OpS.isAltShuffle())
                  MainAltOps[OpIdx].push_back(AltOp.V);
              }
            }
          }
        }
        // Skip second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }
3506
3507#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// \returns a printable name for \p RMode (debug output only).
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }
3523
    /// Prints \p RMode to \p OS and returns the stream to allow chaining.
    LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
                                                   raw_ostream &OS) {
      return OS << getModeStr(RMode);
    }
3528
3529 /// Debug print.
    LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
      printMode(RMode, dbgs()); // Print to the LLVM debug stream.
    }
3533
    /// Stream operator so a ReorderingMode can be printed directly.
    friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
      return printMode(RMode, OS);
    }
3537
3539 const unsigned Indent = 2;
3540 unsigned Cnt = 0;
3541 for (const OperandDataVec &OpDataVec : OpsVec) {
3542 OS << "Operand " << Cnt++ << "\n";
3543 for (const OperandData &OpData : OpDataVec) {
3544 OS.indent(Indent) << "{";
3545 if (Value *V = OpData.V)
3546 OS << *V;
3547 else
3548 OS << "null";
3549 OS << ", APO:" << OpData.APO << "}\n";
3550 }
3551 OS << "\n";
3552 }
3553 return OS;
3554 }
3555
3556 /// Debug print.
3557 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3558#endif
3559 };
3560
3561 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3562 /// for a pair which have highest score deemed to have best chance to form
3563 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3564 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3565 /// of the cost, considered to be good enough score.
  std::pair<std::optional<int>, int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    // NOTE(review): the continuation of this constructor call (presumably the
    // max-depth argument) appears to have been lost in extraction - verify
    // against upstream.
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
    int BestScore = Limit;
    std::optional<int> Index;
    // Linear scan: keep the candidate pair with the strictly best look-ahead
    // score above the Limit threshold.
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    // Index stays std::nullopt if no candidate scored above Limit.
    return std::make_pair(Index, BestScore);
  }
3585
3586 /// Checks if the instruction is marked for deletion.
3587 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3588
3589 /// Removes an instruction from its block and eventually deletes it.
3590 /// It's like Instruction::eraseFromParent() except that the actual deletion
3591 /// is delayed until BoUpSLP is destructed.
3593 DeletedInstructions.insert(I);
3594 }
3595
3596 /// Remove instructions from the parent function and clear the operands of \p
3597 /// DeadVals instructions, marking for deletion trivially dead operands.
3598 template <typename T>
3600 ArrayRef<T *> DeadVals,
3601 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
3602 VectorValuesAndScales) {
3604 for (T *V : DeadVals) {
3605 auto *I = cast<Instruction>(V);
3607 }
3608 DenseSet<Value *> Processed;
3609 for (T *V : DeadVals) {
3610 if (!V || !Processed.insert(V).second)
3611 continue;
3612 auto *I = cast<Instruction>(V);
3614 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3615 for (Use &U : I->operands()) {
3616 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3617 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3619 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3620 return Entry->VectorizedValue == OpI;
3621 })))
3622 DeadInsts.push_back(OpI);
3623 }
3624 I->dropAllReferences();
3625 }
3626 for (T *V : DeadVals) {
3627 auto *I = cast<Instruction>(V);
3628 if (!I->getParent())
3629 continue;
3630 assert((I->use_empty() || all_of(I->uses(),
3631 [&](Use &U) {
3632 return isDeleted(
3633 cast<Instruction>(U.getUser()));
3634 })) &&
3635 "trying to erase instruction with users.");
3636 I->removeFromParent();
3637 SE->forgetValue(I);
3638 }
3639 // Process the dead instruction list until empty.
3640 while (!DeadInsts.empty()) {
3641 Value *V = DeadInsts.pop_back_val();
3643 if (!VI || !VI->getParent())
3644 continue;
3646 "Live instruction found in dead worklist!");
3647 assert(VI->use_empty() && "Instructions with uses are not dead.");
3648
3649 // Don't lose the debug info while deleting the instructions.
3650 salvageDebugInfo(*VI);
3651
3652 // Null out all of the instruction's operands to see if any operand
3653 // becomes dead as we go.
3654 for (Use &OpU : VI->operands()) {
3655 Value *OpV = OpU.get();
3656 if (!OpV)
3657 continue;
3658 OpU.set(nullptr);
3659
3660 if (!OpV->use_empty())
3661 continue;
3662
3663 // If the operand is an instruction that became dead as we nulled out
3664 // the operand, and if it is 'trivially' dead, delete it in a future
3665 // loop iteration.
3666 if (auto *OpI = dyn_cast<Instruction>(OpV))
3667 if (!DeletedInstructions.contains(OpI) &&
3668 (!OpI->getType()->isVectorTy() ||
3669 none_of(
3670 VectorValuesAndScales,
3671 [&](const std::tuple<WeakTrackingVH, unsigned, bool, bool>
3672 &V) { return std::get<0>(V) == OpI; })) &&
3674 DeadInsts.push_back(OpI);
3675 }
3676
3677 VI->removeFromParent();
3678 eraseInstruction(VI);
3679 SE->forgetValue(VI);
3680 }
3681 }
3682
3683 /// Checks if the instruction was already analyzed for being possible
3684 /// reduction root.
3686 return AnalyzedReductionsRoots.count(I);
3687 }
3688 /// Register given instruction as already analyzed for being possible
3689 /// reduction root.
3691 AnalyzedReductionsRoots.insert(I);
3692 }
3693 /// Checks if the provided list of reduced values was checked already for
3694 /// vectorization.
3696 return AnalyzedReductionVals.contains(hash_value(VL));
3697 }
3698 /// Adds the list of reduced values to list of already checked values for the
3699 /// vectorization.
3701 AnalyzedReductionVals.insert(hash_value(VL));
3702 }
3703 /// Clear the list of the analyzed reduction root instructions.
3705 AnalyzedReductionsRoots.clear();
3706 AnalyzedReductionVals.clear();
3707 AnalyzedMinBWVals.clear();
3708 }
3709 /// Checks if the given value is gathered in one of the nodes.
3710 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3711 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3712 }
3713 /// Checks if the given value is gathered in one of the nodes.
3714 bool isGathered(const Value *V) const {
3715 return MustGather.contains(V);
3716 }
3717 /// Checks if the specified value was not schedule.
3718 bool isNotScheduled(const Value *V) const {
3719 return NonScheduledFirst.contains(V);
3720 }
3721
3722 /// Check if the value is vectorized in the tree.
  bool isVectorized(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
    // A value counts as vectorized only if at least one of its tree entries
    // is still live, i.e. neither deleted nor turned back into a gather.
    return any_of(Entries, [&](const TreeEntry *E) {
      return !DeletedNodes.contains(E) && !TransformedToGatherNodes.contains(E);
    });
  }
3730
3731 /// Checks if it is legal and profitable to build SplitVectorize node for the
3732 /// given \p VL.
3733 /// \param Op1 first homogeneous scalars.
3734 /// \param Op2 second homogeneous scalars.
3735 /// \param ReorderIndices indices to reorder the scalars.
3736 /// \returns true if the node was successfully built.
3738 const InstructionsState &LocalState,
3741 OrdersType &ReorderIndices) const;
3742
3743 ~BoUpSLP();
3744
3745private:
  /// Determine if a node \p E can be demoted to a smaller type with a
3747 /// truncation. We collect the entries that will be demoted in ToDemote.
3748 /// \param E Node for analysis
3749 /// \param ToDemote indices of the nodes to be demoted.
3750 bool collectValuesToDemote(
3751 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3753 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3754 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3755
3756 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3757 /// UserTE, which allow reordering (i.e. the operands can be reordered because
  /// they have only one user and are reorderable).
3759 /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gather of extractelements or partially vectorizable loads).
3761 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3762 /// reordering, subset of \p NonVectorized.
3763 void buildReorderableOperands(
3764 TreeEntry *UserTE,
3765 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3766 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3767 SmallVectorImpl<TreeEntry *> &GatherOps);
3768
3769 /// Checks if the given \p TE is a gather node with clustered reused scalars
3770 /// and reorders it per given \p Mask.
3771 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3772
3773 /// Checks if all users of \p I are the part of the vectorization tree.
3774 bool areAllUsersVectorized(
3775 Instruction *I,
3776 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3777
3778 /// Return information about the vector formed for the specified index
3779 /// of a vector of (the same) instruction.
3782
3783 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3784 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3785 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3786 return const_cast<TreeEntry *>(
3787 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3788 }
3789
3790 /// Gets the root instruction for the given node. If the node is a strided
3791 /// load/store node with the reverse order, the root instruction is the last
3792 /// one.
3793 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3794
3795 /// \returns Cast context for the given graph node.
3797 getCastContextHint(const TreeEntry &TE) const;
3798
3799 /// \returns the scale of the given tree entry to the loop iteration.
3800 /// \p Scalar is the scalar value from the entry, if using the parent for the
3801 /// external use.
3802 /// \p U is the user of the vectorized value from the entry, if using the
3803 /// parent for the external use.
3804 unsigned getScaleToLoopIterations(const TreeEntry &TE,
3805 Value *Scalar = nullptr,
3806 Instruction *U = nullptr);
3807
3808 /// Get the loop nest for the given loop \p L.
3809 ArrayRef<const Loop *> getLoopNest(const Loop *L);
3810
3811 /// \returns the cost of the vectorizable entry.
3812 InstructionCost getEntryCost(const TreeEntry *E,
3813 ArrayRef<Value *> VectorizedVals,
3814 SmallPtrSetImpl<Value *> &CheckedExtracts);
3815
3816 /// This is the recursive part of buildTree.
3817 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3818 unsigned InterleaveFactor = 0);
3819
3820 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3821 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3822 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3823 /// returns false, setting \p CurrentOrder to either an empty vector or a
3824 /// non-identity permutation that allows to reuse extract instructions.
3825 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3826 /// extract order.
3827 bool canReuseExtract(ArrayRef<Value *> VL,
3828 SmallVectorImpl<unsigned> &CurrentOrder,
3829 bool ResizeAllowed = false) const;
3830
3831 /// Vectorize a single entry in the tree.
3832 Value *vectorizeTree(TreeEntry *E);
3833
3834 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3835 /// \p E.
3836 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3837
3838 /// Create a new vector from a list of scalar values. Produces a sequence
3839 /// which exploits values reused across lanes, and arranges the inserts
3840 /// for ease of later optimization.
3841 template <typename BVTy, typename ResTy, typename... Args>
3842 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3843
3844 /// Create a new vector from a list of scalar values. Produces a sequence
3845 /// which exploits values reused across lanes, and arranges the inserts
3846 /// for ease of later optimization.
3847 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3848
3849 /// Returns the instruction in the bundle, which can be used as a base point
3850 /// for scheduling. Usually it is the last instruction in the bundle, except
3851 /// for the case when all operands are external (in this case, it is the first
3852 /// instruction in the list).
3853 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3854
3855 /// Tries to find extractelement instructions with constant indices from fixed
3856 /// vector type and gather such instructions into a bunch, which highly likely
3857 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3858 /// was successful, the matched scalars are replaced by poison values in \p VL
3859 /// for future analysis.
3860 std::optional<TargetTransformInfo::ShuffleKind>
3861 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3862 SmallVectorImpl<int> &Mask) const;
3863
3864 /// Tries to find extractelement instructions with constant indices from fixed
3865 /// vector type and gather such instructions into a bunch, which highly likely
3866 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3867 /// was successful, the matched scalars are replaced by poison values in \p VL
3868 /// for future analysis.
3870 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3872 unsigned NumParts) const;
3873
3874 /// Checks if the gathered \p VL can be represented as a single register
3875 /// shuffle(s) of previous tree entries.
3876 /// \param TE Tree entry checked for permutation.
3877 /// \param VL List of scalars (a subset of the TE scalar), checked for
3878 /// permutations. Must form single-register vector.
3879 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3880 /// commands to build the mask using the original vector value, without
3881 /// relying on the potential reordering.
3882 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3883 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3884 std::optional<TargetTransformInfo::ShuffleKind>
3885 isGatherShuffledSingleRegisterEntry(
3886 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3887 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3888 bool ForOrder);
3889
3890 /// Checks if the gathered \p VL can be represented as multi-register
3891 /// shuffle(s) of previous tree entries.
3892 /// \param TE Tree entry checked for permutation.
3893 /// \param VL List of scalars (a subset of the TE scalar), checked for
3894 /// permutations.
3895 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3896 /// commands to build the mask using the original vector value, without
3897 /// relying on the potential reordering.
3898 /// \returns per-register series of ShuffleKind, if gathered values can be
3899 /// represented as shuffles of previous tree entries. \p Mask is filled with
3900 /// the shuffle mask (also on per-register base).
3902 isGatherShuffledEntry(
3903 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3905 unsigned NumParts, bool ForOrder = false);
3906
3907 /// \returns the cost of gathering (inserting) the values in \p VL into a
3908 /// vector.
3909 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3910 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3911 Type *ScalarTy) const;
3912
3913 /// Set the Builder insert point to one after the last instruction in
3914 /// the bundle
3915 void setInsertPointAfterBundle(const TreeEntry *E);
3916
3917 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3918 /// specified, the starting vector value is poison.
3919 Value *
3920 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3921 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3922
3923 /// \returns whether the VectorizableTree is fully vectorizable and will
3924 /// be beneficial even the tree height is tiny.
3925 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3926
3927 /// Run through the list of all gathered loads in the graph and try to find
3928 /// vector loads/masked gathers instead of regular gathers. Later these loads
  /// are reshuffled to build final gathered nodes.
3930 void tryToVectorizeGatheredLoads(
3931 const SmallMapVector<
3932 std::tuple<BasicBlock *, Value *, Type *>,
3933 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3934 &GatheredLoads);
3935
3936 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3937 /// users of \p TE and collects the stores. It returns the map from the store
3938 /// pointers to the collected stores.
3940 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3941
3942 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3943 /// stores in \p StoresVec can form a vector instruction. If so it returns
3944 /// true and populates \p ReorderIndices with the shuffle indices of the
3945 /// stores when compared to the sorted vector.
3946 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3947 OrdersType &ReorderIndices) const;
3948
3949 /// Iterates through the users of \p TE, looking for scalar stores that can be
3950 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3951 /// their order and builds an order index vector for each store bundle. It
3952 /// returns all these order vectors found.
3953 /// We run this after the tree has formed, otherwise we may come across user
3954 /// instructions that are not yet in the tree.
3956 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3957
3958 /// Tries to reorder the gathering node for better vectorization
3959 /// opportunities.
3960 void reorderGatherNode(TreeEntry &TE);
3961
3962 /// Checks if the tree represents disjoint or reduction of shl(zext, (0, 8,
3963 /// .., 56))-like pattern.
3964 /// If the int shifts unique, also strided, but not ordered, sets \p Order.
3965 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
3966 /// If the root nodes are loads, sets \p ForLoads to true.
3967 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
3968 bool &ForLoads) const;
3969
3970 /// Checks if the \p SelectTE matches zext+selects, which can be inversed for
3971 /// better codegen in case like zext (icmp ne), select (icmp eq), ....
3972 bool matchesInversedZExtSelect(
3973 const TreeEntry &SelectTE,
3974 SmallVectorImpl<unsigned> &InversedCmpsIndices) const;
3975
3976 /// Checks if the tree is reduction or of bit selects, like select %cmp, <1,
3977 /// 2, 4, 8, ..>, zeroinitializer, which can be reduced just to a bitcast %cmp
3978 /// to in.
3979 bool matchesSelectOfBits(const TreeEntry &SelectTE) const;
3980
3981 class TreeEntry {
3982 public:
3983 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3984 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3985
3986 /// \returns Common mask for reorder indices and reused scalars.
3987 SmallVector<int> getCommonMask() const {
3988 if (State == TreeEntry::SplitVectorize)
3989 return {};
3990 SmallVector<int> Mask;
3991 inversePermutation(ReorderIndices, Mask);
3992 ::addMask(Mask, ReuseShuffleIndices);
3993 return Mask;
3994 }
3995
3996 /// \returns The mask for split nodes.
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      // Size of the larger of the two combined sub-entries; the second
      // sub-entry starts at CombinedEntriesWithIndices.back().second.
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      // Each scalar may itself be a vector; scale every index by its element
      // count.
      const unsigned Scale = getNumElements(Scalars.front()->getType());
      CommonVF *= Scale;
      SmallVector<int> Mask(getVectorFactor() * Scale, PoisonMaskElem);
      for (auto [Idx, I] : enumerate(ReorderIndices)) {
        for (unsigned K : seq<unsigned>(Scale)) {
          // Elements drawn from the second sub-entry are offset past the
          // first sub-entry's (scaled) region.
          Mask[Scale * I + K] =
              Scale * Idx + K +
              (Idx >= CombinedEntriesWithIndices.back().second
                   ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
                   : 0);
        }
      }
      return Mask;
    }
4017
4018 /// Updates (reorders) SplitVectorize node according to the given mask \p
4019 /// Mask and order \p MaskOrder.
4020 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
4021 ArrayRef<int> MaskOrder);
4022
4023 /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      // Compare VL to Scalars either directly (when the sizes match and no
      // mask applies) or through a shuffle mask: undef in VL matches a poison
      // mask slot, otherwise VL[I] must equal Scalars[Mask[I]].
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered, still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          // VL is sized like the reused vector: compose reorder + reuse masks.
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }
4052
4053 /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      // Greedily match each operand of TE against a distinct, not-yet-used
      // operand of this entry (operand order may differ between the entries).
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }
4074
4075 /// \return Final vectorization factor for the node. Defined by the total
4076 /// number of vectorized scalars, including those, used several times in the
4077 /// entry and counted in the \a ReuseShuffleIndices, if any.
4078 unsigned getVectorFactor() const {
4079 if (!ReuseShuffleIndices.empty())
4080 return ReuseShuffleIndices.size();
4081 return Scalars.size();
4082 };
4083
4084 /// Checks if the current node is a gather node.
4085 bool isGather() const { return State == NeedToGather; }
4086
4087 /// A vector of scalars.
4088 ValueList Scalars;
4089
4090 /// The Scalars are vectorized into this value. It is initialized to Null.
4091 WeakTrackingVH VectorizedValue = nullptr;
4092
4093 /// Do we need to gather this sequence or vectorize it
4094 /// (either with vector instruction or with scatter/gather
4095 /// intrinsics for store/load)?
4096 enum EntryState {
4097 Vectorize, ///< The node is regularly vectorized.
4098 ScatterVectorize, ///< Masked scatter/gather node.
4099 StridedVectorize, ///< Strided loads (and stores)
4100 CompressVectorize, ///< (Masked) load with compress.
4101 NeedToGather, ///< Gather/buildvector node.
4102 CombinedVectorize, ///< Vectorized node, combined with its user into more
4103 ///< complex node like select/cmp to minmax, mul/add to
4104 ///< fma, etc. Must be used for the following nodes in
4105 ///< the pattern, not the very first one.
4106 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4107 ///< independently and then combines back.
4108 };
4109 EntryState State;
4110
4111 /// List of combined opcodes supported by the vectorizer.
4112 enum CombinedOpcode {
4113 NotCombinedOp = -1,
4114 MinMax = Instruction::OtherOpsEnd + 1,
4115 FMulAdd,
4116 ReducedBitcast,
4117 ReducedBitcastBSwap,
4118 ReducedBitcastLoads,
4119 ReducedBitcastBSwapLoads,
4120 ReducedCmpBitcast,
4121 };
4122 CombinedOpcode CombinedOp = NotCombinedOp;
4123
4124 /// Does this sequence require some shuffling?
4125 SmallVector<int, 4> ReuseShuffleIndices;
4126
4127 /// Does this entry require reordering?
4128 SmallVector<unsigned, 4> ReorderIndices;
4129
4130 /// Points back to the VectorizableTree.
4131 ///
4132 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4133 /// to be a pointer and needs to be able to initialize the child iterator.
4134 /// Thus we need a reference back to the container to translate the indices
4135 /// to entries.
4136 VecTreeTy &Container;
4137
4138 /// The TreeEntry index containing the user of this entry.
4139 EdgeInfo UserTreeIndex;
4140
4141 /// The index of this treeEntry in VectorizableTree.
4142 unsigned Idx = 0;
4143
4144 /// For gather/buildvector/alt opcode nodes, which are combined from
4145 /// other nodes as a series of insertvector instructions.
4146 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4147
4148 private:
4149 /// The operands of each instruction in each lane Operands[op_index][lane].
4150 /// Note: This helps avoid the replication of the code that performs the
4151 /// reordering of operands during buildTreeRec() and vectorizeTree().
4152 SmallVector<ValueList, 2> Operands;
4153
4154 /// Copyable elements of the entry node.
4155 SmallPtrSet<const Value *, 4> CopyableElements;
4156
4157 /// MainOp and AltOp are recorded inside. S should be obtained from
4158 /// newTreeEntry.
4159 InstructionsState S = InstructionsState::invalid();
4160
4161 /// Interleaving factor for interleaved loads Vectorize nodes.
4162 unsigned InterleaveFactor = 0;
4163
4164 /// True if the node does not require scheduling.
4165 bool DoesNotNeedToSchedule = false;
4166
4167 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      // Grow the operand table lazily; operands may be set out of order.
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }
4177
    /// Maps values to their lanes in the node. Lazily populated cache used by
    /// findLaneForValue(); mutable so lookups stay const.
    mutable SmallDenseMap<Value *, unsigned> ValueToLane;

  public:
    /// Returns interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Marks the node as one that does not require scheduling.
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    /// Returns true if the node is marked as one that does not require
    /// scheduling.
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    /// Set this bundle's operands from \p Operands, one setOperand() call per
    /// operand index.
    void setOperands(ArrayRef<ValueList> Operands) {
      for (unsigned I : seq<unsigned>(Operands.size()))
        setOperand(I, Operands[I]);
    }
4198
    /// Reorders operands of the node to the given mask \p Mask. The same mask
    /// is applied to every operand list.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry (mutable).
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry (read-only view).
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \return the single \p OpIdx operand, i.e. the lane-0 value of that
    /// operand list.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }
4226
    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    /// Returns the main or alternate operation matching instruction \p I, or
    /// null if \p I matches neither (forwards to the InstructionsState).
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    /// Chooses the correct key for scheduling data. If \p Op is an instruction
    /// matching this entry's main (or alternate) opcode, the key is \p Op
    /// itself; otherwise the key is the entry's main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && getMatchingMainOpOrAltOp(I))
        return Op;
      return S.getMainOp();
    }

    /// Sets the operations state of the node; \p S must be valid.
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }
4248
    /// Returns the main operation of the node's instruction state.
    Instruction *getMainOp() const { return S.getMainOp(); }

    /// Returns the alternate operation of the node's instruction state.
    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    /// Returns true if the instruction state of this node is valid.
    bool hasState() const { return S.valid(); }

    /// Add \p V to the list of copyable elements.
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    }

    /// Returns true if \p V is a copyable element.
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    }

    /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    /// Returns the state of the operations.
    const InstructionsState &getOperations() const { return S; }
4276
    /// When ReuseShuffleIndices is empty it just returns position of \p
    /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
    /// Results are cached per value in ValueToLane.
    unsigned findLaneForValue(Value *V) const {
      // Seed the cache with an out-of-range sentinel; a hit returns the
      // previously computed lane.
      auto Res = ValueToLane.try_emplace(V, getVectorFactor());
      if (!Res.second)
        return Res.first->second;
      unsigned &FoundLane = Res.first->getSecond();
      // Walk all occurrences of V in Scalars, starting at the first one.
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        // Remap through the reorder indices first, ...
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        // ... then through the reuse mask; keep scanning occurrences until one
        // maps into the reuse mask.
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
4304
4305 /// Build a shuffle mask for graph entry which represents a merge of main
4306 /// and alternate operations.
4307 void
4308 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4309 SmallVectorImpl<int> &Mask,
4310 SmallVectorImpl<Value *> *OpScalars = nullptr,
4311 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4312
4313 /// Return true if this is a non-power-of-2 node.
4314 bool isNonPowOf2Vec() const {
4315 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4316 return IsNonPowerOf2;
4317 }
4318
    /// Return true if the number of scalars neither forms whole vector
    /// registers nor is a power of 2 (per \p TTI).
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }
4329
    /// Returns the scalar at position \p Idx after applying the reorder
    /// indices (identity if ReorderIndices is empty).
    /// Note: builds the inverse permutation mask on every call.
    Value *getOrdered(unsigned Idx) const {
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }
4337
#ifndef NDEBUG
    /// Debug printer. Dumps the entry index, operands, scalars, state,
    /// main/alt ops, vectorized value, masks and user edge to dbgs().
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      // S may be invalid (e.g. gathers of non-matching instructions).
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
4419 };
4420
#ifndef NDEBUG
  /// Dumps entry \p E together with its cost breakdown (\p ReuseShuffleCost,
  /// \p VecCost, \p ScalarCost) under the given \p Banner, for debugging the
  /// cost model.
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
4435
  /// Create a new gather TreeEntry. Gather nodes are never scheduled, so an
  /// invalid ScheduleBundle is passed to newTreeEntry.
  TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
                                const InstructionsState &S,
                                const EdgeInfo &UserTreeIdx,
                                ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
  }
4444
4445 /// Create a new VectorizableTree entry.
4446 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4447 const InstructionsState &S,
4448 const EdgeInfo &UserTreeIdx,
4449 ArrayRef<int> ReuseShuffleIndices = {},
4450 ArrayRef<unsigned> ReorderIndices = {},
4451 unsigned InterleaveFactor = 0) {
4452 TreeEntry::EntryState EntryState =
4453 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4454 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4455 ReuseShuffleIndices, ReorderIndices);
4456 if (E && InterleaveFactor > 0)
4457 E->setInterleave(InterleaveFactor);
4458 return E;
4459 }
4460
  /// Create a new VectorizableTree entry in state \p EntryState for scalars
  /// \p VL, hooked to its user via \p UserTreeIdx. Registers the scalars in
  /// the scalar-to-entry maps (regular, split or gather ones, depending on
  /// the state) and ties the scheduling \p Bundle to the new entry.
  /// \returns the new entry, or nullptr if a gathered-loads node would be
  /// re-gathered (the original entry is reused in that case).
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    // Only gather/split nodes may come without a bundle; all other states
    // require one.
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  // Out-of-range reorder indices become undef padding.
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      // Recompute the state for the reordered scalars.
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      // Associate each instruction with this split node, at most once per
      // value (Processed guards against duplicates in VL).
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      // Vectorized node: decide whether it needs scheduling at all.
      if (isa<PHINode>(S.getMainOp()) ||
          isVectorLikeInstWithConstOps(S.getMainOp()) ||
          (!S.areInstructionsWithCopyableElements() &&
           doesNotNeedToSchedule(VL)) ||
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        // Copyable elements are tracked separately, not in the scalar map.
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      // Update the scheduler bundle to point to this TreeEntry.
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        // Expensive check: the bundle must contain exactly one member per
        // unique schedulable value in VL.
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      }
      // All-constant/cast gathers may participate in min-bitwidth analysis.
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
4596
4597 /// -- Vectorization State --
4598 /// Holds all of the tree entries.
4599 TreeEntry::VecTreeTy VectorizableTree;
4600
4601#ifndef NDEBUG
4602 /// Debug printer.
4603 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4604 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4605 VectorizableTree[Id]->dump();
4606 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4607 dbgs() << "[[TRANSFORMED TO GATHER]]";
4608 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4609 dbgs() << "[[DELETED NODE]]";
4610 dbgs() << "\n";
4611 }
4612 }
4613#endif
4614
4615 /// Get list of vector entries, associated with the value \p V.
4616 ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
4617 assert(V && "V cannot be nullptr.");
4618 auto It = ScalarToTreeEntries.find(V);
4619 if (It == ScalarToTreeEntries.end())
4620 return {};
4621 return It->getSecond();
4622 }
4623
4624 /// Get list of split vector entries, associated with the value \p V.
4625 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4626 assert(V && "V cannot be nullptr.");
4627 auto It = ScalarsInSplitNodes.find(V);
4628 if (It == ScalarsInSplitNodes.end())
4629 return {};
4630 return It->getSecond();
4631 }
4632
4633 /// Returns first vector node for value \p V, matching values \p VL.
4634 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4635 bool SameVF = false) const {
4636 assert(V && "V cannot be nullptr.");
4637 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4638 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4639 return TE;
4640 return nullptr;
4641 }
4642
  /// Contains all the outputs of legality analysis for a list of values to
  /// vectorize.
  class ScalarsVectorizationLegality {
    // Common instruction state of the analyzed values.
    InstructionsState S;
    // True if the list may be vectorized as-is.
    bool IsLegal;
    // True if the caller should try deduplicating the values before giving up.
    bool TryToFindDuplicates;
    // True if the caller should attempt split (alternate) vectorization.
    bool TrySplitVectorize;

  public:
    /// A legal result requires a valid state and duplicate search enabled.
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    }
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  };
4665
4666 /// Checks if the specified list of the instructions/values can be vectorized
4667 /// in general.
4668 ScalarsVectorizationLegality
4669 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4670 const EdgeInfo &UserTreeIdx) const;
4671
4672 /// Checks if the specified list of the instructions/values can be vectorized
4673 /// and fills required data before actual scheduling of the instructions.
4674 TreeEntry::EntryState getScalarsVectorizationState(
4675 const InstructionsState &S, ArrayRef<Value *> VL,
4676 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4677 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4678
4679 /// Maps a specific scalar to its tree entry(ies).
4680 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4681
4682 /// List of deleted non-profitable nodes.
4683 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4684
4685 /// List of nodes, transformed to gathered, with their conservative
4686 /// gather/buildvector cost estimation.
4687 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4688
4689 /// Maps the operand index and entry to the corresponding tree entry.
4690 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4691 OperandsToTreeEntry;
4692
4693 /// Scalars, used in split vectorize nodes.
4694 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4695
4696 /// Maps a value to the proposed vectorizable size.
4697 SmallDenseMap<Value *, unsigned> InstrElementSize;
4698
4699 /// A list of scalars that we found that we need to keep as scalars.
4700 ValueSet MustGather;
4701
4702 /// A set of first non-schedulable values.
4703 ValueSet NonScheduledFirst;
4704
4705 /// A map between the vectorized entries and the last instructions in the
4706 /// bundles. The bundles are built in use order, not in the def order of the
4707 /// instructions. So, we cannot rely directly on the last instruction in the
4708 /// bundle being the last instruction in the program order during
4709 /// vectorization process since the basic blocks are affected, need to
4710 /// pre-gather them before.
4711 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4712
4713 /// Keeps the mapping between the last instructions and their insertion
4714 /// points, which is an instruction-after-the-last-instruction.
4715 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4716
4717 /// List of gather nodes, depending on other gather/vector nodes, which should
4718 /// be emitted after the vector instruction emission process to correctly
4719 /// handle order of the vector instructions and shuffles.
4720 SetVector<const TreeEntry *> PostponedGathers;
4721
4722 using ValueToGatherNodesMap =
4723 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4724 ValueToGatherNodesMap ValueToGatherNodes;
4725
4726 /// A list of the load entries (node indices), which can be vectorized using
4727 /// strided or masked gather approach, but attempted to be represented as
4728 /// contiguous loads.
4729 SetVector<unsigned> LoadEntriesToVectorize;
4730
4731 /// true if graph nodes transforming mode is on.
4732 bool IsGraphTransformMode = false;
4733
4734 /// The index of the first gathered load entry in the VectorizeTree.
4735 std::optional<unsigned> GatheredLoadsEntriesFirst;
4736
4737 /// Maps compress entries to their mask data for the final codegen.
4738 SmallDenseMap<const TreeEntry *,
4739 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4740 CompressEntryToData;
4741
4742 /// The loop nest, used to check if only a single loop nest is vectorized, not
4743 /// multiple, to avoid side-effects from the loop-aware cost model.
4744 SmallVector<const Loop *> CurrentLoopNest;
4745
4746 /// Maps the loops to their loop nests.
4747 SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
4748
4749 /// Maps the loops to their scale factor, which is built as a multiplication
4750 /// of the tripcounts of the loops in the loop nest.
4751 SmallDenseMap<const Loop *, unsigned> LoopToScaleFactor;
4752
  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    /// \p S is the scalar, \p U its user (may be null for unknown users),
    /// \p E the vector node containing the scalar and \p L its lane.
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar = nullptr;

    /// Which user that uses the scalar.
    llvm::User *User = nullptr;

    /// Vector node, the value is part of.
    const TreeEntry &E;

    /// Which lane does the scalar belong to.
    unsigned Lane;
  };
4770 using UserList = SmallVector<ExternalUser, 16>;
4771
4772 /// Checks if two instructions may access the same memory.
4773 ///
4774 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4775 /// is invariant in the calling loop.
4776 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4777 Instruction *Inst2) {
4778 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4779 // First check if the result is already in the cache.
4780 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4781 auto Res = AliasCache.try_emplace(Key);
4782 if (!Res.second)
4783 return Res.first->second;
4784 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4785 // Store the result in the cache.
4786 Res.first->getSecond() = Aliased;
4787 return Aliased;
4788 }
4789
4790 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4791
4792 /// Cache for alias results.
4793 /// TODO: consider moving this to the AliasAnalysis itself.
4794 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4795
4796 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4797 // globally through SLP because we don't perform any action which
4798 // invalidates capture results.
4799 BatchAAResults BatchAA;
4800
4801 /// Temporary store for deleted instructions. Instructions will be deleted
4802 /// eventually when the BoUpSLP is destructed. The deferral is required to
4803 /// ensure that there are no incorrect collisions in the AliasCache, which
4804 /// can happen if a new instruction is allocated at the same address as a
4805 /// previously deleted instruction.
4806 DenseSet<Instruction *> DeletedInstructions;
4807
  /// Set of the instructions already analyzed for reductions.
4809 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4810
4811 /// Set of hashes for the list of reduction values already being analyzed.
4812 DenseSet<size_t> AnalyzedReductionVals;
4813
  /// Values that have already been analyzed for minimal bitwidth and found
  /// to be non-profitable.
4816 DenseSet<Value *> AnalyzedMinBWVals;
4817
4818 /// A list of values that need to extracted out of the tree.
4819 /// This list holds pairs of (Internal Scalar : External User). External User
4820 /// can be nullptr, it means that this Internal Scalar will be used later,
4821 /// after vectorization.
4822 UserList ExternalUses;
4823
  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
4826 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4827
  /// A list of scalars to be extracted without a specific user, because of
  /// too many uses.
4830 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4831
4832 /// Values used only by @llvm.assume calls.
4833 SmallPtrSet<const Value *, 32> EphValues;
4834
4835 /// Holds all of the instructions that we gathered, shuffle instructions and
4836 /// extractelements.
4837 SetVector<Instruction *> GatherShuffleExtractSeq;
4838
4839 /// A list of blocks that we are going to CSE.
4840 DenseSet<BasicBlock *> CSEBlocks;
4841
4842 /// List of hashes of vector of loads, which are known to be non vectorizable.
4843 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4844
  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
  /// single instruction, while ScheduleBundle represents a batch of
  /// instructions that are going to be grouped together.
  /// ScheduleCopyableData models an extra user for "copyable" instructions.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.).
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled (or considered as
    /// scheduled in the dry-run).
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    /// Gets/sets the scheduling priority.
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    /// Dispatches to the concrete subclass's isReady().
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->hasValidDependencies();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(this)->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getUnscheduledDeps();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->getUnscheduledDeps();
      return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
    }
    /// Increments the number of unscheduled dependencies.
    /// Only valid for ScheduleData and ScheduleCopyableData, not bundles.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the number of dependencies.
    /// Only valid for ScheduleData and ScheduleCopyableData, not bundles.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(this)->getDependencies();
    }
    /// Gets the instruction.
    /// Only valid for ScheduleData and ScheduleCopyableData, not bundles.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(this)->getInst();
    }

    /// Gets/sets if the bundle is scheduled.
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    // Every ScheduleEntity subclass is a ScheduleEntity.
    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Dispatches to the concrete subclass's dump().
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      return cast<ScheduleBundle>(this)->dump(OS);
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  };
4939
4940#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4942 const BoUpSLP::ScheduleEntity &SE) {
4943 SE.dump(OS);
4944 return OS;
4945 }
4946#endif
4947
4948 /// Contains all scheduling relevant data for an instruction.
4949 /// A ScheduleData either represents a single instruction or a member of an
4950 /// instruction bundle (= a group of instructions which is combined into a
4951 /// vector instruction).
4952 class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    /// Default-constructs a node tagged with the ScheduleData RTTI kind.
    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    /// LLVM-style RTTI support.
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }
4962
    /// (Re)initializes this node for instruction \p I in the scheduling
    /// region \p BlockSchedulingRegionID, resetting all dependency state.
    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }
4970
4971 /// Verify basic self consistency properties
4972 void verify() {
4973 if (hasValidDependencies()) {
4974 assert(UnscheduledDeps <= Dependencies && "invariant");
4975 } else {
4976 assert(UnscheduledDeps == Dependencies && "invariant");
4977 }
4978
4979 if (IsScheduled) {
4980 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4981 "unexpected scheduled state");
4982 }
4983 }
4984
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4993
    /// Modifies the number of unscheduled dependencies for this instruction
    /// by \p Incr (may be negative), and returns this instruction's remaining
    /// number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 &&
             "Expected valid number of unscheduled deps");
      return UnscheduledDeps;
    }
5005
5006 /// Sets the number of unscheduled dependencies to the number of
5007 /// dependencies.
5008 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5009
5010 /// Clears all dependency information.
5011 void clearDependencies() {
5012 clearDirectDependencies();
5013 MemoryDependencies.clear();
5014 ControlDependencies.clear();
5015 }
5016
5017 /// Clears all direct dependencies only, except for control and memory
5018 /// dependencies.
5019 /// Required for copyable elements to correctly handle control/memory deps
5020 /// and avoid extra reclaculation of such deps.
5021 void clearDirectDependencies() {
5022 Dependencies = InvalidDeps;
5023 resetUnscheduledDeps();
5024 IsScheduled = false;
5025 }
5026
5027 /// Gets the number of unscheduled dependencies.
5028 int getUnscheduledDeps() const { return UnscheduledDeps; }
5029 /// Gets the number of dependencies.
5030 int getDependencies() const { return Dependencies; }
5031 /// Initializes the number of dependencies.
5032 void initDependencies() { Dependencies = 0; }
5033 /// Increments the number of dependencies.
5034 void incDependencies() { Dependencies++; }
5035
5036 /// Gets scheduling region ID.
5037 int getSchedulingRegionID() const { return SchedulingRegionID; }
5038
5039 /// Gets the instruction.
5040 Instruction *getInst() const { return Inst; }
5041
5042 /// Gets the list of memory dependencies.
5043 ArrayRef<ScheduleData *> getMemoryDependencies() const {
5044 return MemoryDependencies;
5045 }
5046 /// Adds a memory dependency.
5047 void addMemoryDependency(ScheduleData *Dep) {
5048 MemoryDependencies.push_back(Dep);
5049 }
5050 /// Gets the list of control dependencies.
5051 ArrayRef<ScheduleData *> getControlDependencies() const {
5052 return ControlDependencies;
5053 }
5054 /// Adds a control dependency.
5055 void addControlDependency(ScheduleData *Dep) {
5056 ControlDependencies.push_back(Dep);
5057 }
5058 /// Gets/sets the next load/store instruction in the block.
5059 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
5060 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
5061
5062 void dump(raw_ostream &OS) const { OS << *Inst; }
5063
5064 LLVM_DUMP_METHOD void dump() const {
5065 dump(dbgs());
5066 dbgs() << '\n';
5067 }
5068
5069 private:
5070 Instruction *Inst = nullptr;
5071
5072 /// Single linked list of all memory instructions (e.g. load, store, call)
5073 /// in the block - until the end of the scheduling region.
5074 ScheduleData *NextLoadStore = nullptr;
5075
5076 /// The dependent memory instructions.
5077 /// This list is derived on demand in calculateDependencies().
5078 SmallVector<ScheduleData *> MemoryDependencies;
5079
5080 /// List of instructions which this instruction could be control dependent
5081 /// on. Allowing such nodes to be scheduled below this one could introduce
5082 /// a runtime fault which didn't exist in the original program.
5083 /// ex: this is a load or udiv following a readonly call which inf loops
5084 SmallVector<ScheduleData *> ControlDependencies;
5085
5086 /// This ScheduleData is in the current scheduling region if this matches
5087 /// the current SchedulingRegionID of BlockScheduling.
5088 int SchedulingRegionID = 0;
5089
5090 /// The number of dependencies. Constitutes of the number of users of the
5091 /// instruction plus the number of dependent memory instructions (if any).
5092 /// This value is calculated on demand.
5093 /// If InvalidDeps, the number of dependencies is not calculated yet.
5094 int Dependencies = InvalidDeps;
5095
5096 /// The number of dependencies minus the number of dependencies of scheduled
5097 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5098 /// for scheduling.
5099 /// Note that this is negative as long as Dependencies is not calculated.
5100 int UnscheduledDeps = InvalidDeps;
5101 };
5102
5103#ifndef NDEBUG
5105 const BoUpSLP::ScheduleData &SD) {
5106 SD.dump(OS);
5107 return OS;
5108 }
5109#endif
5110
5111 class ScheduleBundle final : public ScheduleEntity {
5112 /// The schedule data for the instructions in the bundle.
5114 /// True if this bundle is valid.
5115 bool IsValid = true;
5116 /// The TreeEntry that this instruction corresponds to.
5117 TreeEntry *TE = nullptr;
5118 ScheduleBundle(bool IsValid)
5119 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5120
5121 public:
5122 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5123 static bool classof(const ScheduleEntity *Entity) {
5124 return Entity->getKind() == Kind::ScheduleBundle;
5125 }
5126
5127 /// Verify basic self consistency properties
5128 void verify() const {
5129 for (const ScheduleEntity *SD : Bundle) {
5130 if (SD->hasValidDependencies()) {
5131 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5132 "invariant");
5133 } else {
5134 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5135 "invariant");
5136 }
5137
5138 if (isScheduled()) {
5139 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5140 "unexpected scheduled state");
5141 }
5142 }
5143 }
5144
5145 /// Returns the number of unscheduled dependencies in the bundle.
5146 int unscheduledDepsInBundle() const {
5147 assert(*this && "bundle must not be empty");
5148 int Sum = 0;
5149 for (const ScheduleEntity *BundleMember : Bundle) {
5150 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5151 return ScheduleData::InvalidDeps;
5152 Sum += BundleMember->getUnscheduledDeps();
5153 }
5154 return Sum;
5155 }
5156
5157 /// Returns true if the dependency information has been calculated.
5158 /// Note that depenendency validity can vary between instructions within
5159 /// a single bundle.
5160 bool hasValidDependencies() const {
5161 return all_of(Bundle, [](const ScheduleEntity *SD) {
5162 return SD->hasValidDependencies();
5163 });
5164 }
5165
5166 /// Returns true if it is ready for scheduling, i.e. it has no more
5167 /// unscheduled depending instructions/bundles.
5168 bool isReady() const {
5169 assert(*this && "bundle must not be empty");
5170 return unscheduledDepsInBundle() == 0 && !isScheduled();
5171 }
5172
5173 /// Returns the bundle of scheduling data, associated with the current
5174 /// instruction.
5175 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5176 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5177 /// Adds an instruction to the bundle.
5178 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5179
5180 /// Gets/sets the associated tree entry.
5181 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5182 TreeEntry *getTreeEntry() const { return TE; }
5183
5184 static ScheduleBundle invalid() { return {false}; }
5185
5186 operator bool() const { return IsValid; }
5187
5188#ifndef NDEBUG
5189 void dump(raw_ostream &OS) const {
5190 if (!*this) {
5191 OS << "[]";
5192 return;
5193 }
5194 OS << '[';
5195 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5197 OS << "<Copyable>";
5198 OS << *SD->getInst();
5199 });
5200 OS << ']';
5201 }
5202
5203 LLVM_DUMP_METHOD void dump() const {
5204 dump(dbgs());
5205 dbgs() << '\n';
5206 }
5207#endif // NDEBUG
5208 };
5209
5210#ifndef NDEBUG
5212 const BoUpSLP::ScheduleBundle &Bundle) {
5213 Bundle.dump(OS);
5214 return OS;
5215 }
5216#endif
5217
  /// Contains all scheduling relevant data for the copyable instruction.
  /// It models the virtual instructions, supposed to replace the original
  /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
  /// %1], where %1 = add, then the ScheduleCopyableData models virtual
  /// instruction %virt = add %0, 0.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The source schedule data for the instruction.
    Instruction *Inst = nullptr;
    /// The edge information for the instruction.
    const EdgeInfo EI;
    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// Bundle, this data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    /// Gets the edge information.
    const EdgeInfo &getEdgeInfo() const { return EI; }

    /// Gets the bundle.
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

  private:
    /// The number of dependencies. Stays ScheduleData::InvalidDeps until the
    /// dependencies are calculated. These nodes always have
    /// only single dependency.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
5332
#ifndef NDEBUG
  /// Debug-only stream operator; forwards to ScheduleCopyableData::dump().
  friend inline raw_ostream &
  operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
5340
5341 friend struct GraphTraits<BoUpSLP *>;
5342 friend struct DOTGraphTraits<BoUpSLP *>;
5343
5344 /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions, which are not memory read/write
  /// instructions and their operands are either constants, or arguments, or
  /// phis, or instructions from other blocks, or their users are phis or from
  /// the other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if operands do not need
  /// to be scheduled) or at the end of the block (if users are outside of the
  /// block). This saves some compile time and memory used by the
5352 /// compiler.
5353 /// ScheduleData is assigned for each instruction in between the boundaries of
5354 /// the tree entry, even for those, which are not part of the graph. It is
5355 /// required to correctly follow the dependencies between the instructions and
5356 /// their correct scheduling. The ScheduleData is not allocated for the
5357 /// instructions, which do not require scheduling, like phis, nodes with
5358 /// extractelements/insertelements only or nodes with instructions, with
5359 /// uses/operands outside of the block.
5360 struct BlockScheduling {
    /// Creates a scheduler for basic block \p BB. ChunkSize/ChunkPos are
    /// seeded from the block's instruction count.
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5363
    /// Resets all per-region state and opens a new (empty) scheduling region
    /// by bumping SchedulingRegionID.
    void clear() {
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
5389
5390 ScheduleData *getScheduleData(Instruction *I) {
5391 if (!I)
5392 return nullptr;
5393 if (BB != I->getParent())
5394 // Avoid lookup if can't possibly be in map.
5395 return nullptr;
5396 ScheduleData *SD = ScheduleDataMap.lookup(I);
5397 if (SD && isInSchedulingRegion(*SD))
5398 return SD;
5399 return nullptr;
5400 }
5401
5402 ScheduleData *getScheduleData(Value *V) {
5403 return getScheduleData(dyn_cast<Instruction>(V));
5404 }
5405
5406 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5407 /// operand number) and value.
5408 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5409 const Value *V) const {
5410 if (ScheduleCopyableDataMap.empty())
5411 return nullptr;
5412 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5413 if (It == ScheduleCopyableDataMap.end())
5414 return nullptr;
5415 ScheduleCopyableData *SD = It->getSecond().get();
5416 if (!isInSchedulingRegion(*SD))
5417 return nullptr;
5418 return SD;
5419 }
5420
5421 /// Returns the ScheduleCopyableData for the given user \p User, operand
5422 /// number and operand \p V.
5424 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5425 const Value *V) {
5426 if (ScheduleCopyableDataMapByInstUser.empty())
5427 return {};
5428 const auto It = ScheduleCopyableDataMapByInstUser.find(
5429 std::make_pair(std::make_pair(User, OperandIdx), V));
5430 if (It == ScheduleCopyableDataMapByInstUser.end())
5431 return {};
5433 for (ScheduleCopyableData *SD : It->getSecond()) {
5434 if (isInSchedulingRegion(*SD))
5435 Res.push_back(SD);
5436 }
5437 return Res;
5438 }
5439
    /// Returns true if all operands of the given instruction \p User are
    /// replaced by copyable data.
    /// \param User The user instruction.
    /// \param Op The operand, which might be replaced by the copyable data.
    /// \param SLP The SLP tree.
    /// \param NumOps The number of operands used. If the instruction uses the
    /// same operand several times, check for the first use, then the second,
    /// etc.
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              Instruction *Op, BoUpSLP &SLP,
                                              unsigned NumOps) const {
      assert(NumOps > 0 && "No operands");
      if (ScheduleCopyableDataMap.empty())
        return false;
      // Per-entry count of Op-uses that still need to be matched against
      // copyable data for commutative/cmp users (checked in the second loop).
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
      if (Entries.empty())
        return false;
      unsigned CurNumOps = 0;
      for (const Use &U : User->operands()) {
        if (U.get() != Op)
          continue;
        ++CurNumOps;
        // Check all tree entries, if they have operands replaced by copyable
        // data.
        for (TreeEntry *TE : Entries) {
          unsigned Inc = 0;
          bool IsNonSchedulableWithParentPhiNode =
              TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->hasState() &&
              TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
              TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
            for (Value *V : ParentTE->Scalars) {
              auto *PHI = dyn_cast<PHINode>(V);
              if (!PHI)
                continue;
              if (ParentsUniqueUsers.insert(PHI).second &&
                  is_contained(PHI->incoming_values(), User))
                ++Inc;
            }
          } else {
            Inc = count(TE->Scalars, User);
          }

          // Check if the user is commutative.
          // The commutatives are handled later, as their operands can be
          // reordered.
          // Same applies even for non-commutative cmps, because we can invert
          // their predicate potentially and, thus, reorder the operands.
          bool IsCommutativeUser =
              ::isCommutative(User) &&
              ::isCommutableOperand(User, User, U.getOperandNo());
          if (!IsCommutativeUser) {
            // Retry against the entry's main/alternate opcode.
            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
            IsCommutativeUser =
                ::isCommutative(MainOp, User) &&
                ::isCommutableOperand(MainOp, User, U.getOperandNo());
          }
          // The commutative user with the same operands can be safely
          // considered as non-commutative, operands reordering does not change
          // the semantics.
          assert(
              (!IsCommutativeUser ||
               (((::isCommutative(User) &&
                  ::isCommutableOperand(User, User, 0) &&
                  ::isCommutableOperand(User, User, 1)) ||
                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 0) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 1))))) &&
              "Expected commutative user with 2 first commutable operands");
          bool IsCommutativeWithSameOps =
              IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
              !isa<CmpInst>(User)) {
            // Non-reorderable user: the copyable data must exist for exactly
            // this operand index.
            EdgeInfo EI(TE, U.getOperandNo());
            if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
              continue;
            return false;
          }
          PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
              .first->getSecond() += Inc;
        }
      }
      if (PotentiallyReorderedEntriesCount.empty())
        return true;
      // Check the commutative/cmp entries.
      for (auto &P : PotentiallyReorderedEntriesCount) {
        SmallPtrSet<Value *, 4> ParentsUniqueUsers;
        bool IsNonSchedulableWithParentPhiNode =
            P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
            P.first->UserTreeIndex.UserTE->hasState() &&
            P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
            P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
        auto *It = find(P.first->Scalars, User);
        do {
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(P.first->Scalars.begin(), It);
          assert(Lane >= 0 && "Lane is not found");
          if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
            Value *User = ParentTE->Scalars[Lane];
            if (!ParentsUniqueUsers.insert(User).second) {
              It =
                  find(make_range(std::next(It), P.first->Scalars.end()), User);
              continue;
            }
          }
          for (unsigned OpIdx :
               // NOTE(review): the range expression of this loop (a
               // seq<unsigned>(...) call whose argument involves
               // P.first->getMainOp()) was dropped from this copy of the
               // file; restore it from upstream before compiling.
               P.first->getMainOp()))) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
              --P.getSecond();
          }
          // If parent node is schedulable, it will be handled correctly.
          It = find(make_range(std::next(It), P.first->Scalars.end()), User);
        } while (It != P.first->Scalars.end());
      }
      return all_of(PotentiallyReorderedEntriesCount,
                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
                      return P.second == NumOps - 1;
                    });
    }
5577
5579 getScheduleCopyableData(const Instruction *I) const {
5580 if (ScheduleCopyableDataMapByInst.empty())
5581 return {};
5582 const auto It = ScheduleCopyableDataMapByInst.find(I);
5583 if (It == ScheduleCopyableDataMapByInst.end())
5584 return {};
5586 for (ScheduleCopyableData *SD : It->getSecond()) {
5587 if (isInSchedulingRegion(*SD))
5588 Res.push_back(SD);
5589 }
5590 return Res;
5591 }
5592
5594 getScheduleCopyableDataUsers(const Instruction *User) const {
5595 if (ScheduleCopyableDataMapByUsers.empty())
5596 return {};
5597 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5598 if (It == ScheduleCopyableDataMapByUsers.end())
5599 return {};
5601 for (ScheduleCopyableData *SD : It->getSecond()) {
5602 if (isInSchedulingRegion(*SD))
5603 Res.push_back(SD);
5604 }
5605 return Res;
5606 }
5607
    /// Creates and registers a new ScheduleCopyableData for instruction \p I
    /// used along edge \p EI, indexing it in all lookup maps: by (edge,
    /// instruction), by instruction, and by user instruction/operand.
    /// Returns a reference to the newly created node.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(std::make_pair(EI, I),
                           std::make_unique<ScheduleCopyableData>(
                               SchedulingRegionID, I, EI, Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(CD);
      if (EI.UserTE) {
        // Walk every lane of the user entry's operand that contains I and
        // register CD under the corresponding user instruction.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          // Stores may carry a reordering; map the lane back through
          // ReorderIndices to find the actual user scalar.
          if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          // Process each distinct user instruction only once, even if I
          // appears in several lanes.
          if (!Visited.insert(In).second) {
            It = find(make_range(std::next(It), Op.end()), I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
              .first->getSecond()
              .push_back(CD);
          ScheduleCopyableDataMapByUsers.try_emplace(I)
              .first->getSecond()
              .insert(CD);
          // Remove extra deps for users, becoming non-immediate users of the
          // instruction. It may happen, if the chain of same copyable elements
          // appears in the tree.
          if (In == I) {
            EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
            if (ScheduleCopyableData *UserCD =
                    getScheduleCopyableData(UserEI, In))
              ScheduleCopyableDataMapByUsers[I].remove(UserCD);
          }
          It = find(make_range(std::next(It), Op.end()), I);
        } while (It != Op.end());
      } else {
        // No user tree entry - just register the node by its instruction.
        ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
            CD);
      }
      return *CD;
    }
5663
5664 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5665 auto *I = dyn_cast<Instruction>(V);
5666 if (!I)
5667 return {};
5668 auto It = ScheduledBundles.find(I);
5669 if (It == ScheduledBundles.end())
5670 return {};
5671 return It->getSecond();
5672 }
5673
5674 /// Returns true if the entity is in the scheduling region.
5675 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5676 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5677 return Data->getSchedulingRegionID() == SchedulingRegionID;
5678 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5679 return CD->getSchedulingRegionID() == SchedulingRegionID;
5680 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5681 [&](const ScheduleEntity *BundleMember) {
5682 return isInSchedulingRegion(*BundleMember);
5683 });
5684 }
5685
5686 /// Marks an instruction as scheduled and puts all dependent ready
5687 /// instructions into the ready-list.
5688 template <typename ReadyListType>
5689 void schedule(const BoUpSLP &R, const InstructionsState &S,
5690 const EdgeInfo &EI, ScheduleEntity *Data,
5691 ReadyListType &ReadyList) {
5692 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5694 // Handle the def-use chain dependencies.
5695
5696 // Decrement the unscheduled counter and insert to ready list if ready.
5697 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5698 if ((IsControl || Data->hasValidDependencies()) &&
5699 Data->incrementUnscheduledDeps(-1) == 0) {
5700 // There are no more unscheduled dependencies after
5701 // decrementing, so we can put the dependent instruction
5702 // into the ready list.
5703 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5705 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5706 CopyableBundle.push_back(&CD->getBundle());
5707 Bundles = CopyableBundle;
5708 } else {
5709 Bundles = getScheduleBundles(Data->getInst());
5710 }
5711 if (!Bundles.empty()) {
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (Bundle->unscheduledDepsInBundle() == 0) {
5714 assert(!Bundle->isScheduled() &&
5715 "already scheduled bundle gets ready");
5716 ReadyList.insert(Bundle);
5718 << "SLP: gets ready: " << *Bundle << "\n");
5719 }
5720 }
5721 return;
5722 }
5723 assert(!Data->isScheduled() &&
5724 "already scheduled bundle gets ready");
5726 "Expected non-copyable data");
5727 ReadyList.insert(Data);
5728 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5729 }
5730 };
5731
5732 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5733 Instruction *I) {
5734 if (!ScheduleCopyableDataMap.empty()) {
5736 getScheduleCopyableData(User, OpIdx, I);
5737 for (ScheduleCopyableData *CD : CopyableData)
5738 DecrUnsched(CD, /*IsControl=*/false);
5739 if (!CopyableData.empty())
5740 return;
5741 }
5742 if (ScheduleData *OpSD = getScheduleData(I))
5743 DecrUnsched(OpSD, /*IsControl=*/false);
5744 };
5745
5746 // If BundleMember is a vector bundle, its operands may have been
5747 // reordered during buildTree(). We therefore need to get its operands
5748 // through the TreeEntry.
5749 if (!Bundles.empty()) {
5750 auto *In = BundleMember->getInst();
5751 // Count uses of each instruction operand.
5752 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5753 unsigned TotalOpCount = 0;
5754 if (isa<ScheduleCopyableData>(BundleMember)) {
5755 // Copyable data is used only once (uses itself).
5756 TotalOpCount = OperandsUses[In] = 1;
5757 } else {
5758 for (const Use &U : In->operands()) {
5759 if (auto *I = dyn_cast<Instruction>(U.get())) {
5760 auto Res = OperandsUses.try_emplace(I, 0);
5761 ++Res.first->getSecond();
5762 ++TotalOpCount;
5763 }
5764 }
5765 }
5766 // Decrement the unscheduled counter and insert to ready list if
5767 // ready.
5768 auto DecrUnschedForInst =
5769 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5770 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5771 &Checked) {
5772 if (!ScheduleCopyableDataMap.empty()) {
5773 const EdgeInfo EI = {UserTE, OpIdx};
5774 if (ScheduleCopyableData *CD =
5775 getScheduleCopyableData(EI, I)) {
5776 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5777 return;
5778 DecrUnsched(CD, /*IsControl=*/false);
5779 return;
5780 }
5781 }
5782 auto It = OperandsUses.find(I);
5783 assert(It != OperandsUses.end() && "Operand not found");
5784 if (It->second > 0) {
5785 if (ScheduleData *OpSD = getScheduleData(I)) {
5786 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5787 return;
5788 --It->getSecond();
5789 assert(TotalOpCount > 0 && "No more operands to decrement");
5790 --TotalOpCount;
5791 DecrUnsched(OpSD, /*IsControl=*/false);
5792 } else {
5793 --It->getSecond();
5794 assert(TotalOpCount > 0 && "No more operands to decrement");
5795 --TotalOpCount;
5796 }
5797 }
5798 };
5799
5800 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5801 for (ScheduleBundle *Bundle : Bundles) {
5802 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5803 break;
5804 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5805 // Need to search for the lane since the tree entry can be
5806 // reordered.
5807 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5808 bool IsNonSchedulableWithParentPhiNode =
5809 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5810 Bundle->getTreeEntry()->UserTreeIndex &&
5811 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5812 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5813 TreeEntry::SplitVectorize &&
5814 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5815 Instruction::PHI;
5816 do {
5817 int Lane =
5818 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5819 assert(Lane >= 0 && "Lane not set");
5820 if (isa<StoreInst>(In) &&
5821 !Bundle->getTreeEntry()->ReorderIndices.empty())
5822 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5823 assert(Lane < static_cast<int>(
5824 Bundle->getTreeEntry()->Scalars.size()) &&
5825 "Couldn't find extract lane");
5826
5827 // Since vectorization tree is being built recursively this
5828 // assertion ensures that the tree entry has all operands set
5829 // before reaching this code. Couple of exceptions known at the
5830 // moment are extracts where their second (immediate) operand is
5831 // not added. Since immediates do not affect scheduler behavior
5832 // this is considered okay.
5833 assert(
5834 In &&
5836 In->getNumOperands() ==
5837 Bundle->getTreeEntry()->getNumOperands() ||
5838 (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
5839 Instruction::Select) ||
5840 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5841 "Missed TreeEntry operands?");
5842
5843 // Count the number of unique phi nodes, which are the parent for
5844 // parent entry, and exit, if all the unique phis are processed.
5845 if (IsNonSchedulableWithParentPhiNode) {
5846 const TreeEntry *ParentTE =
5847 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5848 Value *User = ParentTE->Scalars[Lane];
5849 if (!ParentsUniqueUsers.insert(User).second) {
5850 It = std::find(std::next(It),
5851 Bundle->getTreeEntry()->Scalars.end(), In);
5852 continue;
5853 }
5854 }
5855
5856 for (unsigned OpIdx :
5857 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5858 if (auto *I = dyn_cast<Instruction>(
5859 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5860 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5861 << *I << "\n");
5862 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5863 }
5864 // If parent node is schedulable, it will be handled correctly.
5865 if (Bundle->getTreeEntry()->isCopyableElement(In))
5866 break;
5867 It = std::find(std::next(It),
5868 Bundle->getTreeEntry()->Scalars.end(), In);
5869 } while (It != Bundle->getTreeEntry()->Scalars.end());
5870 }
5871 } else {
5872 // If BundleMember is a stand-alone instruction, no operand reordering
5873 // has taken place, so we directly access its operands.
5874 for (Use &U : BundleMember->getInst()->operands()) {
5875 if (auto *I = dyn_cast<Instruction>(U.get())) {
5877 << "SLP: check for readiness (def): " << *I << "\n");
5878 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5879 }
5880 }
5881 }
5882 // Handle the memory dependencies.
5883 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5884 if (!SD)
5885 return;
5886 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5887 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5888 if (!VisitedMemory.insert(MemoryDep).second)
5889 continue;
5890 // There are no more unscheduled dependencies after decrementing,
5891 // so we can put the dependent instruction into the ready list.
5892 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5893 << *MemoryDep << "\n");
5894 DecrUnsched(MemoryDep);
5895 }
5896 // Handle the control dependencies.
5897 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5898 for (ScheduleData *Dep : SD->getControlDependencies()) {
5899 if (!VisitedControl.insert(Dep).second)
5900 continue;
5901 // There are no more unscheduled dependencies after decrementing,
5902 // so we can put the dependent instruction into the ready list.
5904 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5905 DecrUnsched(Dep, /*IsControl=*/true);
5906 }
5907 };
5908 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5909 SD->setScheduled(/*Scheduled=*/true);
5910 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5913 Instruction *In = SD->getInst();
5914 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5915 if (!Entries.empty()) {
5916 for (TreeEntry *TE : Entries) {
5918 In->getNumOperands() != TE->getNumOperands())
5919 continue;
5920 auto &BundlePtr =
5921 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5922 BundlePtr->setTreeEntry(TE);
5923 BundlePtr->add(SD);
5924 Bundles.push_back(BundlePtr.get());
5925 }
5926 }
5927 ProcessBundleMember(SD, Bundles);
5928 } else {
5929 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5930 Bundle.setScheduled(/*Scheduled=*/true);
5931 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5932 auto AreAllBundlesScheduled =
5933 [&](const ScheduleEntity *SD,
5934 ArrayRef<ScheduleBundle *> SDBundles) {
5936 return true;
5937 return !SDBundles.empty() &&
5938 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5939 return SDBundle->isScheduled();
5940 });
5941 };
5942 for (ScheduleEntity *SD : Bundle.getBundle()) {
5945 SDBundles = getScheduleBundles(SD->getInst());
5946 if (AreAllBundlesScheduled(SD, SDBundles)) {
5947 SD->setScheduled(/*Scheduled=*/true);
5948 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5949 : SDBundles);
5950 }
5951 }
5952 }
5953 }
5954
5955 /// Verify basic self consistency properties of the data structure.
5956 void verify() {
5957 if (!ScheduleStart)
5958 return;
5959
5960 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5961 ScheduleStart->comesBefore(ScheduleEnd) &&
5962 "Not a valid scheduling region?");
5963
5964 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5965 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5966 if (!Bundles.empty()) {
5967 for (ScheduleBundle *Bundle : Bundles) {
5968 assert(isInSchedulingRegion(*Bundle) &&
5969 "primary schedule data not in window?");
5970 Bundle->verify();
5971 }
5972 continue;
5973 }
5974 auto *SD = getScheduleData(I);
5975 if (!SD)
5976 continue;
5977 assert(isInSchedulingRegion(*SD) &&
5978 "primary schedule data not in window?");
5979 SD->verify();
5980 }
5981
5982 assert(all_of(ReadyInsts,
5983 [](const ScheduleEntity *Bundle) {
5984 return Bundle->isReady();
5985 }) &&
5986 "item in ready list not ready?");
5987 }
5988
5989 /// Put all instructions into the ReadyList which are ready for scheduling.
5990 template <typename ReadyListType>
5991 void initialFillReadyList(ReadyListType &ReadyList) {
5992 SmallPtrSet<ScheduleBundle *, 16> Visited;
5993 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5994 ScheduleData *SD = getScheduleData(I);
5995 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5996 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5997 !Bundles.empty()) {
5998 for (ScheduleBundle *Bundle : Bundles) {
5999 if (!Visited.insert(Bundle).second)
6000 continue;
6001 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
6002 ReadyList.insert(Bundle);
6003 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
6004 << *Bundle << "\n");
6005 }
6006 }
6007 continue;
6008 }
6009 ReadyList.insert(SD);
6011 << "SLP: initially in ready list: " << *SD << "\n");
6012 }
6013 }
6014 }
6015
6016 /// Build a bundle from the ScheduleData nodes corresponding to the
6017 /// scalar instruction for each lane.
6018 /// \param VL The list of scalar instructions.
6019 /// \param S The state of the instructions.
6020 /// \param EI The edge in the SLP graph or the user node/operand number.
6021 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
6022 const InstructionsState &S, const EdgeInfo &EI);
6023
6024 /// Checks if a bundle of instructions can be scheduled, i.e. has no
6025 /// cyclic dependencies. This is only a dry-run, no instructions are
6026 /// actually moved at this stage.
6027 /// \returns the scheduling bundle. The returned Optional value is not
6028 /// std::nullopt if \p VL is allowed to be scheduled.
6029 std::optional<ScheduleBundle *>
6030 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
6031 const InstructionsState &S, const EdgeInfo &EI);
6032
6033 /// Allocates schedule data chunk.
6034 ScheduleData *allocateScheduleDataChunks();
6035
6036 /// Extends the scheduling region so that V is inside the region.
6037 /// \returns true if the region size is within the limit.
6038 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
6039
6040 /// Initialize the ScheduleData structures for new instructions in the
6041 /// scheduling region.
6042 void initScheduleData(Instruction *FromI, Instruction *ToI,
6043 ScheduleData *PrevLoadStore,
6044 ScheduleData *NextLoadStore);
6045
6046 /// Updates the dependency information of a bundle and of all instructions/
6047 /// bundles which depend on the original bundle.
6048 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
6049 BoUpSLP *SLP,
6050 ArrayRef<ScheduleData *> ControlDeps = {});
6051
6052 /// Sets all instruction in the scheduling region to un-scheduled.
6053 void resetSchedule();
6054
6055 BasicBlock *BB;
6056
6057 /// Simple memory allocation for ScheduleData.
6059
6060 /// The size of a ScheduleData array in ScheduleDataChunks.
6061 int ChunkSize;
6062
6063 /// The allocator position in the current chunk, which is the last entry
6064 /// of ScheduleDataChunks.
6065 int ChunkPos;
6066
6067 /// Attaches ScheduleData to Instruction.
6068 /// Note that the mapping survives during all vectorization iterations, i.e.
6069 /// ScheduleData structures are recycled.
6070 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6071
6072 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
6073 /// number) and the operand instruction, represented as copyable element.
6074 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6075 std::unique_ptr<ScheduleCopyableData>>
6076 ScheduleCopyableDataMap;
6077
6078 /// Represents mapping between instruction and all related
  /// ScheduleCopyableData (for all uses in the tree, represented as copyable
6080 /// element). The SLP tree may contain several representations of the same
6081 /// instruction.
6082 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6083 ScheduleCopyableDataMapByInst;
6084
6085 /// Represents mapping between user value and operand number, the operand
6086 /// value and all related ScheduleCopyableData. The relation is 1:n, because
  /// the same user may reference the same operand in different tree entries
6088 /// and the operand may be modelled by the different copyable data element.
6089 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
6091 ScheduleCopyableDataMapByInstUser;
6092
6093 /// Represents mapping between instruction and all related
6094 /// ScheduleCopyableData. It represents the mapping between the actual
6095 /// instruction and the last copyable data element in the chain. E.g., if
6096 /// the graph models the following instructions:
6097 /// %0 = non-add instruction ...
6098 /// ...
6099 /// %4 = add %3, 1
6100 /// %5 = add %4, 1
6101 /// %6 = insertelement poison, %0, 0
6102 /// %7 = insertelement %6, %5, 1
6103 /// And the graph is modeled as:
6104 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6105 /// -> [1, 0] -> [%1, 0]
6106 ///
6107 /// this map will map %0 only to the copyable element <1>, which is the last
6108 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6109 /// keep the map to <0>, not the %0.
6110 SmallDenseMap<const Instruction *,
6111 SmallSetVector<ScheduleCopyableData *, 4>>
6112 ScheduleCopyableDataMapByUsers;
6113
6114 /// Attaches ScheduleBundle to Instruction.
6115 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6116 ScheduledBundles;
6117 /// The list of ScheduleBundles.
6118 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6119
6120 /// The ready-list for scheduling (only used for the dry-run).
6121 SetVector<ScheduleEntity *> ReadyInsts;
6122
6123 /// The first instruction of the scheduling region.
6124 Instruction *ScheduleStart = nullptr;
6125
6126 /// The first instruction _after_ the scheduling region.
6127 Instruction *ScheduleEnd = nullptr;
6128
6129 /// The first memory accessing instruction in the scheduling region
6130 /// (can be null).
6131 ScheduleData *FirstLoadStoreInRegion = nullptr;
6132
6133 /// The last memory accessing instruction in the scheduling region
6134 /// (can be null).
6135 ScheduleData *LastLoadStoreInRegion = nullptr;
6136
6137 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6138 /// region? Used to optimize the dependence calculation for the
6139 /// common case where there isn't.
6140 bool RegionHasStackSave = false;
6141
6142 /// The current size of the scheduling region.
6143 int ScheduleRegionSize = 0;
6144
6145 /// The maximum size allowed for the scheduling region.
6146 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6147
6148 /// The ID of the scheduling region. For a new vectorization iteration this
6149 /// is incremented which "removes" all ScheduleData from the region.
6150 /// Make sure that the initial SchedulingRegionID is greater than the
6151 /// initial SchedulingRegionID in ScheduleData (which is 0).
6152 int SchedulingRegionID = 1;
6153 };
6154
6155 /// Attaches the BlockScheduling structures to basic blocks.
6156 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6157
6158 /// Performs the "real" scheduling. Done before vectorization is actually
6159 /// performed in a basic block.
6160 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6161
6162 /// List of users to ignore during scheduling and that don't need extracting.
6163 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6164
6165 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6166 /// sorted SmallVectors of unsigned.
6167 struct OrdersTypeDenseMapInfo {
6168 static OrdersType getEmptyKey() {
6169 OrdersType V;
6170 V.push_back(~1U);
6171 return V;
6172 }
6173
6174 static OrdersType getTombstoneKey() {
6175 OrdersType V;
6176 V.push_back(~2U);
6177 return V;
6178 }
6179
6180 static unsigned getHashValue(const OrdersType &V) {
6181 return static_cast<unsigned>(hash_combine_range(V));
6182 }
6183
6184 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6185 return LHS == RHS;
6186 }
6187 };
6188
6189 // Analysis and block reference.
6190 Function *F;
6191 ScalarEvolution *SE;
6192 TargetTransformInfo *TTI;
6193 TargetLibraryInfo *TLI;
6194 LoopInfo *LI;
6195 DominatorTree *DT;
6196 AssumptionCache *AC;
6197 DemandedBits *DB;
6198 const DataLayout *DL;
6199 OptimizationRemarkEmitter *ORE;
6200
6201 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6202 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6203
6204 /// Instruction builder to construct the vectorized tree.
6205 IRBuilder<TargetFolder> Builder;
6206
6207 /// A map of scalar integer values to the smallest bit width with which they
6208 /// can legally be represented. The values map to (width, signed) pairs,
6209 /// where "width" indicates the minimum bit width and "signed" is True if the
6210 /// value must be signed-extended, rather than zero-extended, back to its
6211 /// original width.
6212 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6213
6214 /// Final size of the reduced vector, if the current graph represents the
6215 /// input for the reduction and it was possible to narrow the size of the
6216 /// reduction.
6217 unsigned ReductionBitWidth = 0;
6218
6219 /// Canonical graph size before the transformations.
6220 unsigned BaseGraphSize = 1;
6221
6222 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6223 /// type sizes, used in the tree.
6224 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6225
6226 /// Indices of the vectorized nodes, which supposed to be the roots of the new
6227 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6228 DenseSet<unsigned> ExtraBitWidthNodes;
6229};
6230
// DenseMapInfo specialization so EdgeInfo (user tree entry + operand index)
// can be used as a DenseMap/DenseSet key. Delegates to the DenseMapInfo of
// its two components.
template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
  using SecondInfo = DenseMapInfo<unsigned>;

  static inline BoUpSLP::EdgeInfo getEmptyKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
                             SecondInfo::getEmptyKey());
  }

  static inline BoUpSLP::EdgeInfo getTombstoneKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
                             SecondInfo::getTombstoneKey());
  }

  static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
    return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
                                    SecondInfo::getHashValue(Val.EdgeIdx));
  }

  static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
                      const BoUpSLP::EdgeInfo &RHS) {
    return LHS == RHS;
  }
};
6254
// GraphTraits specialization that lets generic graph algorithms (and the
// GraphWriter / -view-slp-tree support below) walk the SLP tree bottom-up:
// children of a node are its user tree entries.
template <> struct llvm::GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // Reference to the tree so dereferencing can map indices to entries.
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    // A node has a single UserTreeIndex, so the child range spans one edge.
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
6315
// DOT printing support for the SLP tree (used by the GraphWriter machinery).
template <>
struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  /// Builds the node label: the entry index, an optional "<splat>" marker,
  /// and one line per scalar; scalars that also have external users are
  /// annotated with "<extract>".
  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  /// Gather nodes are drawn red; scatter/strided/compressed vectorized nodes
  /// blue; everything else uses the default color.
  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    return "";
  }
};
6350
// Destructor: erases all instructions queued for deletion during
// vectorization and cleans up any scalar code that became dead as a result.
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    // Collect operands that become trivially dead once this instruction is
    // erased, so they can be recursively deleted below.
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    // Drop all uses first so the erase loop below never sees dangling users.
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
6388
6389/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6390/// contains original mask for the scalars reused in the node. Procedure
6391/// transform this mask in accordance with the given \p Mask.
6393 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6394 "Expected non-empty mask.");
6395 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6396 Prev.swap(Reuses);
6397 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6398 if (Mask[I] != PoisonMaskElem)
6399 Reuses[Mask[I]] = Prev[I];
6400}
6401
/// Reorders the given \p Order according to the given \p Mask. \p Order - is
/// the original order of the scalars. Procedure transforms the provided order
/// in accordance with the given \p Mask. If the resulting \p Order is just an
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    // Compose the previous order (identity if empty) with the mask directly.
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    // Sz is used as the "undefined position" placeholder in the result.
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    // An order that is the identity (modulo placeholders) carries no
    // information - clear it.
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  // Top order: convert the order into a mask, apply \p Mask to it, then
  // convert the combined mask back into an order.
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
6449
// Tries to derive a scalar order for a gather node that would let it reuse
// previously vectorized entries or extractelement sources. Returns
// std::nullopt when no profitable order was found.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // If the full matched node in whole tree rotation - no need to consider the
    // matching order, rotating the whole tree.
    if (TopToBottom)
      return std::nullopt;
    // No need to keep the order for the same user node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // No need to keep the order for the matched root node, if it can be freely
    // reordered.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // If shuffling 2 elements only and the matching node has reverse reuses -
    // no need to count order, both work fine.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
               [](const auto &P) {
                 return P.value() % 2 != static_cast<int>(P.index()) % 2;
               }))
      return std::nullopt;

    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  // Returns true if the mask broadcasts a single source element.
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  // Converts the per-part shuffle mask into entries of CurrentOrder; parts
  // that would require shuffling two sources are marked in ShuffledSubMasks
  // and reset to "undefined".
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, not_equal_to(NumScalars))) {
        llvm::fill(Slice, NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as much elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        llvm::fill(Slice, NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        llvm::fill(Slice, NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          // Compute the widest source vector among the extractelements
          // feeding this part.
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  // Give up if everything was shuffled or too many positions stayed
  // undefined to make the order useful.
  unsigned NumUndefs = count(CurrentOrder, NumScalars);
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
6635
/// Returns true if \p Ptr1 and \p Ptr2 may form a compatible pointer pair for
/// vectorization: both resolve to the same underlying object and, when they
/// are GEPs, each has a single index that is either constant or shares the
/// same opcode pattern (unless \p CompareOpcodes is false).
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
      getUnderlyingObject(Ptr2, RecursionMaxDepth))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
6652
/// Calculates minimal alignment as a common alignment.
/// \tparam T the concrete memory-access instruction type providing
/// getAlign() - presumably LoadInst or StoreInst; confirm at call sites.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  // Seed with the first access's alignment and take the minimum of the rest.
  Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
  for (Value *V : VL)
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
6661
6662/// Check if \p Order represents reverse order.
6664 assert(!Order.empty() &&
6665 "Order is empty. Please check it before using isReverseOrder.");
6666 unsigned Sz = Order.size();
6667 return all_of(enumerate(Order), [&](const auto &Pair) {
6668 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6669 });
6670}
6671
6672/// Checks if the provided list of pointers \p Pointers represents the strided
6673/// pointers for type ElemTy. If they are not, nullptr is returned.
6674/// Otherwise, SCEV* of the stride value is returned.
/// If `PointerOps` can be rearranged into the following sequence:
6676/// ```
6677/// %x + c_0 * stride,
6678/// %x + c_1 * stride,
6679/// %x + c_2 * stride
6680/// ...
6681/// ```
6682/// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
6683/// and the SCEV of the `stride` will be returned.
static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                     const DataLayout &DL, ScalarEvolution &SE,
                                     SmallVectorImpl<unsigned> &SortedIndices,
                                     SmallVectorImpl<int64_t> &Coeffs) {
  assert(Coeffs.size() == PointerOps.size() &&
         "Coeffs vector needs to be of correct size");
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
  // addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return nullptr;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return nullptr;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return nullptr;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return nullptr;
  int Size = DL.getTypeStoreSize(ElemTy);
  // Tries to factor \p Multiplier out of \p Dist, returning the remaining
  // factor (the candidate stride), or nullptr if it does not divide evenly.
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return SE.getUDivExactExpr(Dist, Multiplier);
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return nullptr;
  }
  // Only a non-constant (runtime) stride is interesting here; constant
  // strides are handled elsewhere.
  if (!Stride || isa<SCEVConstant>(Stride))
    return nullptr;
  // Iterate through all pointers and check if all distances are
  // unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return nullptr;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return nullptr;
      Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
      // Verify PtrSCEV == PtrSCEVLowest + Stride * Coeff exactly.
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
    } else {
      Coeffs[Idx] = 0;
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return nullptr;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return nullptr;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return nullptr;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return Stride;
}
6797
6798static std::pair<InstructionCost, InstructionCost>
6800 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6801 Type *ScalarTy, VectorType *VecTy);
6802
6803 /// Returns the cost of the shuffle instructions with the given \p Kind, vector
6804 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6805 /// subvector pattern.
6806 static InstructionCost
6808                VectorType *Tp, ArrayRef<int> Mask = {},
6810                int Index = 0, VectorType *SubTp = nullptr,
  // When a mask is provided it defines the destination width, which may differ
  // from the source type (widening/narrowing shuffles).
6812   VectorType *DstTy = Tp;
6813   if (!Mask.empty())
6814     DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6815 
  // Only two-source permutes are re-classified below; every other kind is
  // forwarded to TTI unchanged.
6816   if (Kind != TTI::SK_PermuteTwoSrc)
6817     return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6818                               Args);
6819   int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6820   int NumSubElts;
  // If the mask is recognized as an insert-subvector pattern whose subvector
  // spills past the source width but still fits the mask, cost it as
  // SK_InsertSubvector, which targets usually model more cheaply.
6822           Mask, NumSrcElts, NumSubElts, Index)) {
6823     if (Index + NumSubElts > NumSrcElts &&
6824         Index + NumSrcElts <= static_cast<int>(Mask.size()))
6825       return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6826                                 TTI::TCK_RecipThroughput, Index, Tp);
6827   }
  // Not an insert-subvector pattern: fall back to the generic query.
6828   return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6829                             Args);
6830 }
6831
6832 /// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6833 /// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6834 /// instead of a scalar.
6835 static InstructionCost
6837                          VectorType *Ty, const APInt &DemandedElts, bool Insert,
6838                          bool Extract, TTI::TargetCostKind CostKind,
6839                          bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6841          "ScalableVectorType is not supported.");
  // Each demanded bit stands for one "scalar" (possibly itself a small vector
  // under REVEC), so the widened type must hold exactly bits * scalar-width
  // elements.
6842   assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6843              getNumElements(Ty) &&
6844          "Incorrect usage.");
6845   if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6846     assert(SLPReVec && "Only supported by REVEC.");
6847     // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6848     // of CreateInsertElement.
6849     unsigned ScalarTyNumElements = VecTy->getNumElements();
6850     InstructionCost Cost = 0;
  // Sum subvector insert/extract cost for each demanded lane; the subvector
  // for lane I starts at element I * ScalarTyNumElements of Ty.
6851     for (unsigned I : seq(DemandedElts.getBitWidth())) {
6852       if (!DemandedElts[I])
6853         continue;
6854       if (Insert)
6856                                   I * ScalarTyNumElements, VecTy);
6857       if (Extract)
6859                                   I * ScalarTyNumElements, VecTy);
6860     }
6861     return Cost;
6862   }
  // Plain scalar element type: defer to the generic TTI query.
6863   return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6864                                       CostKind, ForPoisonSrc, VL);
6865 }
6866
6867 /// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6868 /// is a FixedVectorType, a vector will be extracted instead of a scalar.
6870     const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6871     TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6872     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  // Under REVEC an "element" is itself a small vector, so an extractelement of
  // one logical element is modeled as a subvector extract starting at
  // Index * VecTy->getNumElements().
6873   if (Opcode == Instruction::ExtractElement) {
6874     if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6875       assert(SLPReVec && "Only supported by REVEC.");
6876       assert(isa<VectorType>(Val) && "Val must be a vector type.");
6878                                 cast<VectorType>(Val), {}, CostKind,
6879                                 Index * VecTy->getNumElements(), VecTy);
6880     }
6881   }
  // Default: plain per-element insert/extract cost from TTI.
6882   return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6883                                 ScalarUserAndIdx);
6884 }
6885
6886 /// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6887 /// is a FixedVectorType, a vector will be extracted instead of a scalar.
6889     const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6890     VectorType *VecTy, unsigned Index,
  // REVEC path: model the operation as a subvector extract (of Dst's element
  // count, starting at Index * element count) followed by a cast of that
  // subvector to Dst.
6892   if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6893     assert(SLPReVec && "Only supported by REVEC.");
6894     auto *SubTp =
6895         getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6897                             Index * ScalarTy->getNumElements(), SubTp) +
6898            TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6899                                 CostKind);
6900   }
  // Scalar destination: use the combined extract+extend TTI query.
6901   return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6902 }
6903
6904 /// Creates subvector insert. Generates shuffle using \p Generator or
6905 /// using default shuffle.
6907     IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6908     function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  // Inserting poison into poison is a no-op.
6909   if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6910     return Vec;
6911   const unsigned SubVecVF = getNumElements(V->getType());
6912   // Create shuffle, insertvector requires that index is multiple of
6913   // the subvector length.
6914   const unsigned VecVF = getNumElements(Vec->getType());
  // Destination is poison: a single one-source shuffle placing V's lanes at
  // [Index, Index + SubVecVF) suffices.
6916   if (isa<PoisonValue>(Vec)) {
6917     auto *Begin = std::next(Mask.begin(), Index);
6918     std::iota(Begin, std::next(Begin, SubVecVF), 0);
6919     Vec = Builder.CreateShuffleVector(V, Mask);
6920     return Vec;
6921   }
  // Identity mask over Vec, with the window [Index, Index + SubVecVF)
  // redirected to V's lanes (offset by VecVF in two-source mask numbering).
6922   std::iota(Mask.begin(), Mask.end(), 0);
6923   std::iota(std::next(Mask.begin(), Index),
6924             std::next(Mask.begin(), Index + SubVecVF), VecVF);
6925   if (Generator)
6926     return Generator(Vec, V, Mask);
6927   // 1. Resize V to the size of Vec.
6928   SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6929   std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6930   V = Builder.CreateShuffleVector(V, ResizeMask);
6931   // 2. Insert V into Vec.
6932   return Builder.CreateShuffleVector(Vec, V, Mask);
6933 }
6934
6935 /// Generates subvector extract using \p Generator or using default shuffle.
6937                                    unsigned SubVecVF, unsigned Index) {
  // One-source shuffle selecting SubVecVF consecutive lanes starting at
  // Index, i.e. mask = [Index, Index + 1, ..., Index + SubVecVF - 1].
6938   SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6939   std::iota(Mask.begin(), Mask.end(), Index);
6940   return Builder.CreateShuffleVector(Vec, Mask);
6941 }
6942
6943 /// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6944 /// with \p Order.
6945 /// \return true if the mask represents strided access, false - otherwise.
6947                               ArrayRef<unsigned> Order, Type *ScalarTy,
6948                               const DataLayout &DL, ScalarEvolution &SE,
6949                               SmallVectorImpl<int> &CompressMask) {
6950   const unsigned Sz = PointerOps.size();
6951   CompressMask.assign(Sz, PoisonMaskElem);
6952   // The first element always set.
6953   CompressMask[0] = 0;
6954   // Check if the mask represents strided access.
  // Stride encoding: engaged with 0 = "stride not determined yet", engaged
  // non-zero = candidate stride, disengaged = proven non-strided.
6955   std::optional<unsigned> Stride = 0;
6956   Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6957   for (unsigned I : seq<unsigned>(1, Sz)) {
6958     Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    // Distance from the first pointer, in units of ScalarTy elements.
6959     std::optional<int64_t> OptPos =
6960         getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6961     if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6962       return false;
6963     unsigned Pos = static_cast<unsigned>(*OptPos);
6964     CompressMask[I] = Pos;
6965     if (!Stride)
6966       continue;
6967     if (*Stride == 0) {
      // First non-zero distance fixes the candidate stride.
6968       *Stride = Pos;
6969       continue;
6970     }
    // All subsequent elements must sit at multiples of the candidate stride.
6971     if (Pos != *Stride * I)
6972       Stride.reset();
6973   }
6974   return Stride.has_value();
6975 }
6976
6977 /// Checks if the \p VL can be transformed to a (masked)load + compress or
6978 /// (masked) interleaved load.
6980     ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6983     const DominatorTree &DT, const TargetLibraryInfo &TLI,
6984     const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6985     unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6986     VectorType *&LoadVecTy) {
6987   InterleaveFactor = 0;
6988   Type *ScalarTy = VL.front()->getType();
6989   const size_t Sz = VL.size();
6990   auto *VecTy = getWidenedType(ScalarTy, Sz);
6992   SmallVector<int> Mask;
6993   if (!Order.empty())
6994     inversePermutation(Order, Mask);
6995   // Check external uses.
  // Any scalar load with users outside the tree would need an extractelement;
  // bail out when that extract is not cheaper than keeping the scalar load.
6996   for (const auto [I, V] : enumerate(VL)) {
6997     if (AreAllUsersVectorized(V))
6998       continue;
6999     InstructionCost ExtractCost =
7000         TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
7001                                Mask.empty() ? I : Mask[I]);
7002     InstructionCost ScalarCost =
7003         TTI.getInstructionCost(cast<Instruction>(V), CostKind);
7004     if (ExtractCost <= ScalarCost)
7005       return false;
7006   }
  // First/last pointers in memory order (Order permutes into sorted order).
7007   Value *Ptr0;
7008   Value *PtrN;
7009   if (Order.empty()) {
7010     Ptr0 = PointerOps.front();
7011     PtrN = PointerOps.back();
7012   } else {
7013     Ptr0 = PointerOps[Order.front()];
7014     PtrN = PointerOps[Order.back()];
7015   }
7016   std::optional<int64_t> Diff =
7017       getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
7018   if (!Diff)
7019     return false;
7020   const size_t MaxRegSize =
7022           .getFixedValue();
7023   // Check for very large distances between elements.
7024   if (*Diff / Sz >= MaxRegSize / 8)
7025     return false;
  // The wide load must span the whole range [Ptr0, PtrN], i.e. Diff + 1
  // elements; the compress shuffle then picks the Sz demanded lanes.
7026   LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
7027   auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
7028   Align CommonAlignment = LI->getAlign();
  // If the widened range cannot be proven dereferenceable, a masked load is
  // required to avoid faulting on the padding lanes.
7029   IsMasked = !isSafeToLoadUnconditionally(
7030       Ptr0, LoadVecTy, CommonAlignment, DL,
7031       cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
7032       &TLI);
7033   if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
7034                                          LI->getPointerAddressSpace()))
7035     return false;
7036   // TODO: perform the analysis of each scalar load for better
7037   // safe-load-unconditionally analysis.
7038   bool IsStrided =
7039       buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
7040   assert(CompressMask.size() >= 2 && "At least two elements are required");
7041   SmallVector<Value *> OrderedPointerOps(PointerOps);
7042   if (!Order.empty())
7043     reorderScalars(OrderedPointerOps, Mask);
7044   auto [ScalarGEPCost, VectorGEPCost] =
7045       getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
7046                   Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
7047   // The cost of scalar loads.
7048   InstructionCost ScalarLoadsCost =
7049       std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7050                       [&](InstructionCost C, Value *V) {
7051                         return C + TTI.getInstructionCost(cast<Instruction>(V),
7052                                                           CostKind);
7053                       }) +
7054       ScalarGEPCost;
  // Baseline to beat: keep the scalar loads and build the vector by inserts.
7055   APInt DemandedElts = APInt::getAllOnes(Sz);
7056   InstructionCost GatherCost =
7057       getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7058                                /*Insert=*/true,
7059                                /*Extract=*/false, CostKind) +
7060       ScalarLoadsCost;
7061   InstructionCost LoadCost = 0;
7062   if (IsMasked) {
7063     LoadCost = TTI.getMemIntrinsicInstrCost(
7064         MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
7065                                    CommonAlignment,
7066                                    LI->getPointerAddressSpace()),
7067         CostKind);
7068   } else {
7069     LoadCost =
7070         TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7071                             LI->getPointerAddressSpace(), CostKind);
7072   }
  // Strided, unmasked, already-ordered access may map to a target interleaved
  // load with factor CompressMask[1] (the constant gap between elements).
7073   if (IsStrided && !IsMasked && Order.empty()) {
7074     // Check for potential segmented(interleaved) loads.
7075     VectorType *AlignedLoadVecTy = getWidenedType(
7076         ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
7077     if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
7078                                      DL, cast<LoadInst>(VL.back()), &AC, &DT,
7079                                      &TLI))
7080       AlignedLoadVecTy = LoadVecTy;
7081     if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7082                                          CommonAlignment,
7083                                          LI->getPointerAddressSpace())) {
7084       InstructionCost InterleavedCost =
7085           VectorGEPCost + TTI.getInterleavedMemoryOpCost(
7086                               Instruction::Load, AlignedLoadVecTy,
7087                               CompressMask[1], {}, CommonAlignment,
7088                               LI->getPointerAddressSpace(), CostKind, IsMasked);
7089       if (InterleavedCost < GatherCost) {
7090         InterleaveFactor = CompressMask[1];
7091         LoadVecTy = AlignedLoadVecTy;
7092         return true;
7093       }
7094     }
7095   }
7096   InstructionCost CompressCost = ::getShuffleCost(
7097       TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
  // Remap the compress mask back to the original (unsorted) element order so
  // the emitted shuffle produces lanes in VL order.
7098   if (!Order.empty()) {
7099     SmallVector<int> NewMask(Sz, PoisonMaskElem);
7100     for (unsigned I : seq<unsigned>(Sz)) {
7101       NewMask[I] = CompressMask[Mask[I]];
7102     }
7103     CompressMask.swap(NewMask);
7104   }
7105   InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7106   return TotalVecCost < GatherCost;
7107 }
7108
7109 /// Checks if the \p VL can be transformed to a (masked)load + compress or
7110 /// (masked) interleaved load.
7111 static bool
7114                      const DataLayout &DL, ScalarEvolution &SE,
7115                      AssumptionCache &AC, const DominatorTree &DT,
7116                      const TargetLibraryInfo &TLI,
7117                      const function_ref<bool(Value *)> AreAllUsersVectorized) {
  // Convenience overload: callers that only need a yes/no answer discard the
  // mask/type/interleave-factor results produced by the full analysis.
7118   bool IsMasked;
7119   unsigned InterleaveFactor;
7120   SmallVector<int> CompressMask;
7121   VectorType *LoadVecTy;
7122   return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7123                               AreAllUsersVectorized, IsMasked, InterleaveFactor,
7124                               CompressMask, LoadVecTy);
7125 }
7126
7127 /// Checks if strided loads can be generated out of \p VL loads with pointers \p
7128 /// PointerOps:
7129 /// 1. Target with strided load support is detected.
7130 /// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7131 /// potential stride <= MaxProfitableLoadStride and the potential stride is
7132 /// power-of-2 (to avoid perf regressions for the very small number of loads)
7133 /// and max distance > number of loads, or potential stride is -1.
7134 /// 3. The loads are ordered, or number of unordered loads <=
7135 /// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7136 /// to avoid extra costs for very expensive shuffles).
7137 /// 4. Any pointer operand is an instruction with the users outside of the
7138 /// current graph (for masked gathers extra extractelement instructions
7139 /// might be required).
7141                             Align Alignment, const int64_t Diff,
7142                             const size_t Sz) const {
  // NOTE(review): assumes Sz >= 2 — Sz == 1 would divide by zero here. Callers
  // appear to pass multi-element bundles; confirm at call sites.
7143   if (Diff % (Sz - 1) != 0)
7144     return false;
7145 
7146   // Try to generate strided load node.
  // A pointer operand used outside the graph makes a masked gather costlier
  // (extra extracts), which biases the decision towards a strided load.
7147   auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7148     return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7149              return !isVectorized(U) && !MustGather.contains(U);
7150            });
7151   });
7152 
7153   const uint64_t AbsoluteDiff = std::abs(Diff);
7154   auto *VecTy = getWidenedType(ScalarTy, Sz);
7155   if (IsAnyPointerUsedOutGraph ||
7156       (AbsoluteDiff > Sz &&
7158        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7159         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7160       Diff == -(static_cast<int64_t>(Sz) - 1)) {
    // Stride in elements between consecutive loads; the multiplication check
    // guards against truncation in the signed division.
7161     int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7162     if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7163       return false;
7164     if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7165       return false;
7166     return true;
7167   }
7168   return false;
7169 }
7170
7172 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7173 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7174 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7175 const size_t Sz = PointerOps.size();
7176 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7177 // Go through `PointerOps` in sorted order and record offsets from
7178 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7179 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7180 // PointerOps[0]. This is safe since only offset differences are used below.
7181 for (unsigned I : seq<unsigned>(Sz)) {
7182 Value *Ptr =
7183 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7184 std::optional<int64_t> Offset =
7185 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr, *DL, *SE);
7186 assert(Offset && "sortPtrAccesses should have validated this pointer");
7187 SortedOffsetsFromBase[I] = *Offset;
7188 }
7189
7190 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7191 // ```
7192 // [
7193 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7194 // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // secon group
7195 // ...
7196 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7197 // GroupSize - 1}), // last group
7198 // ]
7199 // ```
7200 // The distance between consecutive elements within each group should all be
7201 // the same `StrideWithinGroup`. The distance between the first elements of
7202 // consecutive groups should all be the same `StrideBetweenGroups`.
7203
7204 int64_t StrideWithinGroup =
7205 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7206 // Determine size of the first group. Later we will check that all other
7207 // groups have the same size.
7208 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7209 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7210 StrideWithinGroup;
7211 };
7212 auto Indices = seq<unsigned>(1, Sz);
7213 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7214 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7215
7216 unsigned VecSz = Sz;
7217 Type *NewScalarTy = ScalarTy;
7218
7219 // Quick detour: at this point we can say what the type of strided load would
7220 // be if all the checks pass. Check if this type is legal for the target.
7221 bool NeedsWidening = Sz != GroupSize;
7222 if (NeedsWidening) {
7223 if (Sz % GroupSize != 0)
7224 return false;
7225
7226 if (StrideWithinGroup != 1)
7227 return false;
7228 VecSz = Sz / GroupSize;
7229 NewScalarTy = Type::getIntNTy(
7230 SE->getContext(),
7231 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7232 }
7233
7234 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7235 return false;
7236
7237 int64_t StrideIntVal = StrideWithinGroup;
7238 if (NeedsWidening) {
7239 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7240 // Check that the strides between groups are all the same.
7241 unsigned CurrentGroupStartIdx = GroupSize;
7242 int64_t StrideBetweenGroups =
7243 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7244 StrideIntVal = StrideBetweenGroups;
7245 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7246 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7247 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7248 StrideBetweenGroups)
7249 return false;
7250 }
7251
7252 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7253 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7254 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7255 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7256 return GroupEndIdx - StartIdx == GroupSize;
7257 };
7258 for (unsigned I = 0; I < Sz; I += GroupSize) {
7259 if (!CheckGroup(I))
7260 return false;
7261 }
7262 }
7263
7264 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7265 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7266 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7267 return true;
7268}
7269
7271 Type *ScalarTy, Align CommonAlignment,
7272 SmallVectorImpl<unsigned> &SortedIndices,
7273 StridedPtrInfo &SPtrInfo) const {
7274 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
7275 // is constant, we partition `PointerOps` sequence into subsequences of
7276 // pointers with the same offset. For each offset we record values from
7277 // `PointerOps` and their indicies in `PointerOps`.
7279 OffsetToPointerOpIdxMap;
7280 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7281 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7282 if (!PtrSCEV)
7283 return false;
7284
7285 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7286 int64_t Offset = 0;
7287 if (Add) {
7288 // `Offset` is non-zero.
7289 for (int I : seq<int>(Add->getNumOperands())) {
7290 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7291 if (!SC)
7292 continue;
7293 Offset = SC->getAPInt().getSExtValue();
7294 if (Offset >= std::numeric_limits<int64_t>::max() - 1) {
7295 Offset = 0;
7296 continue;
7297 }
7298 break;
7299 }
7300 }
7301 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7302 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7303 }
7304 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7305
7306 // Quick detour: at this point we can say what the type of strided load would
7307 // be if all the checks pass. Check if this type is legal for the target.
7308 const unsigned Sz = PointerOps.size();
7309 unsigned VecSz = Sz;
7310 Type *NewScalarTy = ScalarTy;
7311 if (NumOffsets > 1) {
7312 if (Sz % NumOffsets != 0)
7313 return false;
7314 VecSz = Sz / NumOffsets;
7315 NewScalarTy = Type::getIntNTy(
7316 SE->getContext(),
7317 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7318 }
7319 FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
7320 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7321 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7322 return false;
7323
7324 // Check if the offsets are contiguous and that each group has the required
7325 // size.
7326 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7327 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
7328 if (MapPair.second.first.size() != VecSz)
7329 return false;
7330 SortedOffsetsV[Idx] = MapPair.first;
7331 }
7332 sort(SortedOffsetsV);
7333
7334 if (NumOffsets > 1) {
7335 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7336 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
7337 return false;
7338 }
7339 }
7340
7341 // Introduce some notation for the explanations below. Let `PointerOps_j`
7342 // denote the subsequence of `PointerOps` with offsets equal to
7343 // `SortedOffsetsV[j]`. Let `SortedIndices_j` be a such that the sequence
7344 // ```
7345 // PointerOps_j[SortedIndices_j[0]],
7346 // PointerOps_j[SortedIndices_j[1]],
7347 // PointerOps_j[SortedIndices_j[2]],
7348 // ...
7349 // ```
7350 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7351 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7352 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7353 // The entire sorted `PointerOps` looks like this:
7354 // ```
7355 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7356 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7357 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7358 // ...
7359 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7360 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7361 //
7362 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7363 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7364 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7365 // ...
7366 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7367 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7368 //
7369 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7370 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7371 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7372 // ...
7373 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7374 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7375 // ...
7376 // ...
7377 // ...
7378 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7379 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7380 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7381 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7382 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7383 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7384 // ...
7385 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7386 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7387 // ```
7388 // In order to be able to generate a strided load, we need the following
7389 // checks to pass:
7390 //
7391 // (1) for each `PointerOps_j` check that the distance
7392 // between adjacent pointers are all equal to the same value (stride).
7393 // (2) for each `PointerOps_j` check that coefficients calculated by
7394 // `calculateRtStride` are all the same.
7395 //
7396 // As we do that, also calculate SortedIndices. Since we should not modify
7397 // `SortedIndices` unless we know that all the checks succeed, record the
7398 // indicies into `SortedIndicesDraft`.
7399 SmallVector<unsigned> SortedIndicesDraft(Sz);
7400
7401 // Given sorted indices for a particular offset (as calculated by
7402 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7403 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7404 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7405 // \param `IndicesInAllPointerOps` vector of indices of the
7406 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7407 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7408 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7409 auto UpdateSortedIndices =
7410 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7411 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7412 if (SortedIndicesForOffset.empty()) {
7413 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7414 std::iota(SortedIndicesForOffset.begin(),
7415 SortedIndicesForOffset.end(), 0);
7416 }
7417 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7418 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7419 IndicesInAllPointerOps[Idx];
7420 }
7421 };
7422
7423 int64_t LowestOffset = SortedOffsetsV[0];
7424 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7425
7426 SmallVector<int64_t> Coeffs0(VecSz);
7427 SmallVector<unsigned> SortedIndicesForOffset0;
7428 const SCEV *Stride0 = calculateRtStride(PointerOps0, ScalarTy, *DL, *SE,
7429 SortedIndicesForOffset0, Coeffs0);
7430 if (!Stride0)
7431 return false;
7432 unsigned NumCoeffs0 = Coeffs0.size();
7433 if (NumCoeffs0 * NumOffsets != Sz)
7434 return false;
7435 sort(Coeffs0);
7436
7437 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7438 OffsetToPointerOpIdxMap[LowestOffset].second;
7439 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7440
7441 // Now that we know what the common stride and coefficients has to be check
7442 // the remaining `PointerOps_j`.
7443 SmallVector<int64_t> Coeffs;
7444 SmallVector<unsigned> SortedIndicesForOffset;
7445 for (int J : seq<int>(1, NumOffsets)) {
7446 Coeffs.clear();
7447 Coeffs.resize(VecSz);
7448 SortedIndicesForOffset.clear();
7449
7450 int64_t Offset = SortedOffsetsV[J];
7451 ArrayRef<Value *> PointerOpsForOffset =
7452 OffsetToPointerOpIdxMap[Offset].first;
7453 ArrayRef<unsigned> IndicesInAllPointerOps =
7454 OffsetToPointerOpIdxMap[Offset].second;
7455 const SCEV *StrideWithinGroup =
7456 calculateRtStride(PointerOpsForOffset, ScalarTy, *DL, *SE,
7457 SortedIndicesForOffset, Coeffs);
7458
7459 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7460 return false;
7461 if (Coeffs.size() != NumCoeffs0)
7462 return false;
7463 sort(Coeffs);
7464 if (Coeffs != Coeffs0)
7465 return false;
7466
7467 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7468 }
7469
7470 SortedIndices.clear();
7471 SortedIndices = std::move(SortedIndicesDraft);
7472 SPtrInfo.StrideSCEV = Stride0;
7473 SPtrInfo.Ty = StridedLoadTy;
7474 return true;
7475}
7476
7478 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7479 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7480 unsigned *BestVF, bool TryRecursiveCheck) const {
7481 // Check that a vectorized load would load the same memory as a scalar
7482 // load. For example, we don't want to vectorize loads that are smaller
7483 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7484 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7485 // from such a struct, we read/write packed bits disagreeing with the
7486 // unvectorized version.
7487 if (BestVF)
7488 *BestVF = 0;
7490 return LoadsState::Gather;
7491 Type *ScalarTy = VL0->getType();
7492
7493 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7494 return LoadsState::Gather;
7495
7496 // Make sure all loads in the bundle are simple - we can't vectorize
7497 // atomic or volatile loads.
7498 PointerOps.clear();
7499 const size_t Sz = VL.size();
7500 PointerOps.resize(Sz);
7501 auto *POIter = PointerOps.begin();
7502 for (Value *V : VL) {
7503 auto *L = dyn_cast<LoadInst>(V);
7504 if (!L || !L->isSimple())
7505 return LoadsState::Gather;
7506 *POIter = L->getPointerOperand();
7507 ++POIter;
7508 }
7509
7510 Order.clear();
7511 // Check the order of pointer operands or that all pointers are the same.
7512 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7513
7514 auto *VecTy = getWidenedType(ScalarTy, Sz);
7515 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7516 if (!IsSorted) {
7517 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7518 SPtrInfo))
7520
7521 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7522 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7523 return LoadsState::Gather;
7524
7525 if (!all_of(PointerOps, [&](Value *P) {
7526 return arePointersCompatible(P, PointerOps.front(), *TLI);
7527 }))
7528 return LoadsState::Gather;
7529
7530 } else {
7531 Value *Ptr0;
7532 Value *PtrN;
7533 if (Order.empty()) {
7534 Ptr0 = PointerOps.front();
7535 PtrN = PointerOps.back();
7536 } else {
7537 Ptr0 = PointerOps[Order.front()];
7538 PtrN = PointerOps[Order.back()];
7539 }
7540 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7541 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7542 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7543 std::optional<int64_t> Diff0 =
7544 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr0, *DL, *SE);
7545 std::optional<int64_t> DiffN =
7546 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, PtrN, *DL, *SE);
7547 assert(Diff0 && DiffN &&
7548 "sortPtrAccesses should have validated these pointers");
7549 int64_t Diff = *DiffN - *Diff0;
7550 // Check that the sorted loads are consecutive.
7551 if (static_cast<uint64_t>(Diff) == Sz - 1)
7552 return LoadsState::Vectorize;
7553 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7554 *TLI, [&](Value *V) {
7555 return areAllUsersVectorized(
7556 cast<Instruction>(V), UserIgnoreList);
7557 }))
7559 Align Alignment =
7560 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7561 ->getAlign();
7562 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7563 Diff, Ptr0, PtrN, SPtrInfo))
7565 }
7566 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7567 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7568 return LoadsState::Gather;
7569 // Correctly identify compare the cost of loads + shuffles rather than
7570 // strided/masked gather loads. Returns true if vectorized + shuffles
7571 // representation is better than just gather.
7572 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7573 unsigned *BestVF,
7574 bool ProfitableGatherPointers) {
7575 if (BestVF)
7576 *BestVF = 0;
7577 // Compare masked gather cost and loads + insert subvector costs.
7579 auto [ScalarGEPCost, VectorGEPCost] =
7580 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7581 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7582 // Estimate the cost of masked gather GEP. If not a splat, roughly
7583 // estimate as a buildvector, otherwise estimate as splat.
7584 APInt DemandedElts = APInt::getAllOnes(Sz);
7585 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7586 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7587 if (static_cast<unsigned>(count_if(
7588 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7589 any_of(PointerOps, [&](Value *V) {
7590 return getUnderlyingObject(V) !=
7591 getUnderlyingObject(PointerOps.front());
7592 }))
7593 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7594 DemandedElts, /*Insert=*/true,
7595 /*Extract=*/false, CostKind);
7596 else
7597 VectorGEPCost +=
7599 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7600 /*Insert=*/true, /*Extract=*/false, CostKind) +
7601 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7602 // The cost of scalar loads.
7603 InstructionCost ScalarLoadsCost =
7604 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7605 [&](InstructionCost C, Value *V) {
7606 return C + TTI.getInstructionCost(
7608 }) +
7609 ScalarGEPCost;
7610 // The cost of masked gather.
7611 InstructionCost MaskedGatherCost =
7612 TTI.getMemIntrinsicInstrCost(
7613 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7615 /*VariableMask=*/false, CommonAlignment),
7616 CostKind) +
7617 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7618 InstructionCost GatherCost =
7619 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7620 /*Insert=*/true,
7621 /*Extract=*/false, CostKind) +
7622 ScalarLoadsCost;
7623 // The list of loads is small or perform partial check already - directly
7624 // compare masked gather cost and gather cost.
7625 constexpr unsigned ListLimit = 4;
7626 if (!TryRecursiveCheck || VL.size() < ListLimit)
7627 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7628
7629 // FIXME: The following code has not been updated for non-power-of-2
7630 // vectors (and not whole registers). The splitting logic here does not
7631 // cover the original vector if the vector factor is not a power of two.
7632 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7633 return false;
7634
7635 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7636 unsigned MinVF = getMinVF(2 * Sz);
7637 DemandedElts.clearAllBits();
7638 // Iterate through possible vectorization factors and check if vectorized +
7639 // shuffles is better than just gather.
7640 for (unsigned VF =
7641 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7642 VF >= MinVF;
7643 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7645 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7646 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7648 SmallVector<Value *> PointerOps;
7649 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7650 PointerOps, SPtrInfo, BestVF,
7651 /*TryRecursiveCheck=*/false);
7652 // Check that the sorted loads are consecutive.
7653 if (LS == LoadsState::Gather) {
7654 if (BestVF) {
7655 DemandedElts.setAllBits();
7656 break;
7657 }
7658 DemandedElts.setBits(Cnt, Cnt + VF);
7659 continue;
7660 }
7661 // If need the reorder - consider as high-cost masked gather for now.
7662 if ((LS == LoadsState::Vectorize ||
7665 !Order.empty() && !isReverseOrder(Order))
7667 States.push_back(LS);
7668 }
7669 if (DemandedElts.isAllOnes())
7670 // All loads gathered - try smaller VF.
7671 continue;
7672 // Can be vectorized later as a serie of loads/insertelements.
7673 InstructionCost VecLdCost = 0;
7674 if (!DemandedElts.isZero()) {
7675 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7676 /*Insert=*/true,
7677 /*Extract=*/false, CostKind) +
7678 ScalarGEPCost;
7679 for (unsigned Idx : seq<unsigned>(VL.size()))
7680 if (DemandedElts[Idx])
7681 VecLdCost +=
7682 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7683 }
7684 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7685 for (auto [I, LS] : enumerate(States)) {
7686 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7687 InstructionCost VectorGEPCost =
7688 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7689 ? 0
7690 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7691 LI0->getPointerOperand(),
7692 Instruction::GetElementPtr, CostKind, ScalarTy,
7693 SubVecTy)
7694 .second;
7695 if (LS == LoadsState::ScatterVectorize) {
7696 if (static_cast<unsigned>(
7697 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7698 PointerOps.size() - 1 ||
7699 any_of(PointerOps, [&](Value *V) {
7700 return getUnderlyingObject(V) !=
7701 getUnderlyingObject(PointerOps.front());
7702 }))
7703 VectorGEPCost += getScalarizationOverhead(
7704 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7705 /*Insert=*/true, /*Extract=*/false, CostKind);
7706 else
7707 VectorGEPCost +=
7709 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7710 /*Insert=*/true, /*Extract=*/false, CostKind) +
7711 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7712 CostKind);
7713 }
7714 switch (LS) {
7716 VecLdCost +=
7717 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7718 LI0->getPointerAddressSpace(), CostKind,
7720 VectorGEPCost;
7721 break;
7723 VecLdCost += TTI.getMemIntrinsicInstrCost(
7725 Intrinsic::experimental_vp_strided_load,
7726 SubVecTy, LI0->getPointerOperand(),
7727 /*VariableMask=*/false, CommonAlignment),
7728 CostKind) +
7729 VectorGEPCost;
7730 break;
7732 VecLdCost += TTI.getMemIntrinsicInstrCost(
7734 Intrinsic::masked_load, SubVecTy,
7735 CommonAlignment, LI0->getPointerAddressSpace()),
7736 CostKind) +
7738 {}, CostKind);
7739 break;
7741 VecLdCost += TTI.getMemIntrinsicInstrCost(
7743 Intrinsic::masked_gather, SubVecTy,
7744 LI0->getPointerOperand(),
7745 /*VariableMask=*/false, CommonAlignment),
7746 CostKind) +
7747 VectorGEPCost;
7748 break;
7749 case LoadsState::Gather:
7750 // Gathers are already calculated - ignore.
7751 continue;
7752 }
7753 SmallVector<int> ShuffleMask(VL.size());
7754 for (int Idx : seq<int>(0, VL.size()))
7755 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7756 if (I > 0)
7757 VecLdCost +=
7758 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7759 CostKind, I * VF, SubVecTy);
7760 }
7761 // If masked gather cost is higher - better to vectorize, so
7762 // consider it as a gather node. It will be better estimated
7763 // later.
7764 if (MaskedGatherCost >= VecLdCost &&
7765 VecLdCost - GatherCost < -SLPCostThreshold) {
7766 if (BestVF)
7767 *BestVF = VF;
7768 return true;
7769 }
7770 }
7771 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7772 };
7773 // TODO: need to improve analysis of the pointers, if not all of them are
7774 // GEPs or have > 2 operands, we end up with a gather node, which just
7775 // increases the cost.
7776 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7777 bool ProfitableGatherPointers =
7778 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7779 return L->isLoopInvariant(V);
7780 })) <= Sz / 2;
7781 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7783 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7784 (GEP && GEP->getNumOperands() == 2 &&
7785 isa<Constant, Instruction>(GEP->getOperand(1)));
7786 })) {
7787 // Check if potential masked gather can be represented as series
7788 // of loads + insertsubvectors.
7789 // If masked gather cost is higher - better to vectorize, so
7790 // consider it as a gather node. It will be better estimated
7791 // later.
7792 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7793 ProfitableGatherPointers))
7795 }
7796
7797 return LoadsState::Gather;
7798}
7799
// clusterSortPtrAccesses (tail of the definition; the opening signature line,
// original line 7800, is not visible in this excerpt).
// Groups the pointer operands in VL into clusters keyed per basic block, puts
// pointers with a computable constant distance (getPointersDiff) into the same
// cluster, sorts clusters, and emits the original indices into SortedIndices.
// Returns true only when a useful (fully consecutive per-cluster) ordering was
// produced.
// NOTE(review): this excerpt has numbering gaps (e.g. 7810, 7812, 7816, 7822,
// 7877) — some declarations/lines are elided; code below kept byte-identical.
7801 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7802 const DataLayout &DL, ScalarEvolution &SE,
7803 SmallVectorImpl<unsigned> &SortedIndices) {
7804 assert(
7805 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7806 "Expected list of pointer operands.");
7807 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7808 // Ptr into, sort and return the sorted indices with values next to one
7809 // another.
7811 std::pair<BasicBlock *, Value *>,
7813 Bases;
// Seed the map with the first pointer at offset 0, original index 0.
7814 Bases
7815 .try_emplace(std::make_pair(
7817 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7818
7819 SortedIndices.clear();
// Try to attach every remaining pointer to an existing cluster by computing a
// strict constant pointer difference against the cluster's first element.
7820 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7821 auto Key = std::make_pair(BBs[Cnt + 1],
7823 bool Found = any_of(Bases.try_emplace(Key).first->second,
7824 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7825 std::optional<int64_t> Diff =
7826 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7827 ElemTy, Ptr, DL, SE,
7828 /*StrictCheck=*/true);
7829 if (!Diff)
7830 return false;
7831
7832 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7833 return true;
7834 });
7835
7836 if (!Found) {
7837 // If we haven't found enough to usefully cluster, return early.
7838 if (Bases.size() > VL.size() / 2 - 1)
7839 return false;
7840
7841 // Not found already - add a new Base
7842 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7843 }
7844 }
7845
// Degenerate clusterings are not worth reordering: every pointer is its own
// base, or there is a single cluster that is trivial or already covers VL.
7846 if (Bases.size() == VL.size())
7847 return false;
7848
7849 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7850 Bases.front().second.size() == VL.size()))
7851 return false;
7852
7853 // For each of the bases sort the pointers by Offset and check if any of the
7854 // base become consecutively allocated.
// Orders two pointers by walking one level of getUnderlyingObject at a time
// until one chain reaches a value already seen on the other chain; bounded by
// RecursionMaxDepth.
7855 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7856 SmallPtrSet<Value *, 13> FirstPointers;
7857 SmallPtrSet<Value *, 13> SecondPointers;
7858 Value *P1 = Ptr1;
7859 Value *P2 = Ptr2;
7860 unsigned Depth = 0;
7861 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7862 if (P1 == P2 || Depth > RecursionMaxDepth)
7863 return false;
7864 FirstPointers.insert(P1);
7865 SecondPointers.insert(P2);
7866 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7867 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7868 ++Depth;
7869 }
7870 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7871 "Unable to find matching root.");
7872 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7873 };
// Require every multi-element cluster to be fully consecutive
// (offset == index + InitialOffset); otherwise the sort is not worthwhile.
7874 for (auto &Base : Bases) {
7875 for (auto &Vec : Base.second) {
7876 if (Vec.size() > 1) {
7878 int64_t InitialOffset = std::get<1>(Vec[0]);
7879 bool AnyConsecutive =
7880 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7881 return std::get<1>(P.value()) ==
7882 int64_t(P.index()) + InitialOffset;
7883 });
7884 // Fill SortedIndices array only if it looks worth-while to sort the
7885 // ptrs.
7886 if (!AnyConsecutive)
7887 return false;
7888 }
7889 }
7890 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7891 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7892 });
7893 }
7894
// Emit the original indices cluster by cluster.
7895 for (auto &T : Bases)
7896 for (const auto &Vec : T.second)
7897 for (const auto &P : Vec)
7898 SortedIndices.push_back(std::get<2>(P));
7899
7900 assert(SortedIndices.size() == VL.size() &&
7901 "Expected SortedIndices to be the size of VL");
7902 return true;
7903}
7904
// Returns a partial ordering for a gather node made entirely of simple loads,
// computed by clustering/sorting their pointer operands; std::nullopt when the
// node contains anything but simple loads, when the entry is queued in
// LoadEntriesToVectorize, or when no useful order is found.
// NOTE(review): declaration lines 7910/7912 (presumably the Ptrs/BBs vector
// declarations) are elided in this excerpt; code kept byte-identical.
7905std::optional<BoUpSLP::OrdersType>
7906BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7907 assert(TE.isGather() && "Expected gather node only.");
7908 Type *ScalarTy = TE.Scalars[0]->getType();
7909
7911 Ptrs.reserve(TE.Scalars.size());
7913 BBs.reserve(TE.Scalars.size());
// Collect pointer operand and parent block of each load; bail out on any
// scalar that is not a simple (non-volatile, non-atomic) load.
7914 for (Value *V : TE.Scalars) {
7915 auto *L = dyn_cast<LoadInst>(V);
7916 if (!L || !L->isSimple())
7917 return std::nullopt;
7918 Ptrs.push_back(L->getPointerOperand());
7919 BBs.push_back(L->getParent());
7920 }
7921
7922 BoUpSLP::OrdersType Order;
7923 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7924 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7925 return std::move(Order);
7926 return std::nullopt;
7927}
7928
7929/// Check if two insertelement instructions are from the same buildvector.
// NOTE(review): the signature lines (orig. 7930-7931) are elided in this
// excerpt — the parameters appear to be the two insertelement instructions
// (VU, V) plus the GetBaseOperand callback visible below; confirm against the
// full source. Code kept byte-identical.
7932 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7933 // Instructions must be from the same basic blocks.
7934 if (VU->getParent() != V->getParent())
7935 return false;
7936 // Checks if 2 insertelements are from the same buildvector.
7937 if (VU->getType() != V->getType())
7938 return false;
7939 // Multiple used inserts are separate nodes.
7940 if (!VU->hasOneUse() && !V->hasOneUse())
7941 return false;
7942 auto *IE1 = VU;
7943 auto *IE2 = V;
7944 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7945 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7946 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7947 return false;
7948 // Go through the vector operand of insertelement instructions trying to find
7949 // either VU as the original vector for IE2 or V as the original vector for
7950 // IE1.
// ReusedIdx tracks which lanes have already been written along either chain;
// re-inserting into a used lane (IsReusedIdx) terminates the walk negatively.
7951 SmallBitVector ReusedIdx(
7952 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7953 bool IsReusedIdx = false;
7954 do {
7955 if (IE2 == VU && !IE1)
7956 return VU->hasOneUse();
7957 if (IE1 == V && !IE2)
7958 return V->hasOneUse();
// Advance the IE1 chain one buildvector step via GetBaseOperand, giving up on
// multi-use intermediates or reused lanes.
7959 if (IE1 && IE1 != V) {
7960 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7961 IsReusedIdx |= ReusedIdx.test(Idx1);
7962 ReusedIdx.set(Idx1);
7963 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7964 IE1 = nullptr;
7965 else
7966 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7967 }
// Symmetric step for the IE2 chain.
7968 if (IE2 && IE2 != VU) {
7969 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7970 IsReusedIdx |= ReusedIdx.test(Idx2);
7971 ReusedIdx.set(Idx2);
7972 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7973 IE2 = nullptr;
7974 else
7975 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7976 }
7977 } while (!IsReusedIdx && (IE1 || IE2));
7978 return false;
7979}
7980
7981/// Checks if the specified instruction \p I is an alternate operation for
7982/// the given \p MainOp and \p AltOp instructions.
7983static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7984 Instruction *AltOp,
7985 const TargetLibraryInfo &TLI);
7986
// Computes the preferred element order for a tree entry (or std::nullopt when
// no reordering is needed/possible). Handles, in turn: nodes with reuse
// shuffle indices, strided/split/vectorized nodes with precomputed
// ReorderIndices, alternate-opcode nodes, vectorized PHI nodes (ordered by
// their users), and gather nodes (extractelement reuse, near-splat rotation,
// partially ordered loads, vectorizable loads, reused ordered scalars).
// NOTE(review): this excerpt has several numbering gaps (e.g. 8122, 8268,
// 8332-8333) — some conditions/lines are elided; code kept byte-identical.
7987std::optional<BoUpSLP::OrdersType>
7988BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7989 bool IgnoreReorder) {
7990 // No need to reorder if need to shuffle reuses, still need to shuffle the
7991 // node.
7992 if (!TE.ReuseShuffleIndices.empty()) {
7993 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7994 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7995 "Reshuffling scalars not yet supported for nodes with padding");
7996
7997 if (isSplat(TE.Scalars))
7998 return std::nullopt;
7999 // Check if reuse shuffle indices can be improved by reordering.
8000 // For this, check that reuse mask is "clustered", i.e. each scalar values
8001 // is used once in each submask of size <number_of_scalars>.
8002 // Example: 4 scalar values.
8003 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
8004 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
8005 // element 3 is used twice in the second submask.
8006 unsigned Sz = TE.Scalars.size();
// Gather node with reuses: fold an order found among the reused scalars into
// the reuse mask and return the combined order.
8007 if (TE.isGather()) {
8008 if (std::optional<OrdersType> CurrentOrder =
8009 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
8010 SmallVector<int> Mask;
8011 fixupOrderingIndices(*CurrentOrder);
8012 inversePermutation(*CurrentOrder, Mask);
8013 ::addMask(Mask, TE.ReuseShuffleIndices);
8014 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
8015 unsigned Sz = TE.Scalars.size();
8016 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
8017 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
8018 if (Idx != PoisonMaskElem)
8019 Res[Idx + K * Sz] = I + K * Sz;
8020 }
8021 return std::move(Res);
8022 }
8023 }
8024 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8025 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
8026 2 * TE.getVectorFactor())) == 1)
8027 return std::nullopt;
8028 if (TE.ReuseShuffleIndices.size() % Sz != 0)
8029 return std::nullopt;
// Non-single-source reuse mask: derive a per-part order if every Sz-sized
// chunk of the combined mask references a single, distinct part.
8030 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8031 Sz)) {
8032 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8033 if (TE.ReorderIndices.empty())
8034 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8035 else
8036 inversePermutation(TE.ReorderIndices, ReorderMask);
8037 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
8038 unsigned VF = ReorderMask.size();
8039 OrdersType ResOrder(VF, VF);
8040 unsigned NumParts = divideCeil(VF, Sz);
8041 SmallBitVector UsedVals(NumParts);
8042 for (unsigned I = 0; I < VF; I += Sz) {
8043 int Val = PoisonMaskElem;
8044 unsigned UndefCnt = 0;
8045 unsigned Limit = std::min(Sz, VF - I);
8046 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
8047 [&](int Idx) {
8048 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
8049 Val = Idx;
8050 if (Idx == PoisonMaskElem)
8051 ++UndefCnt;
8052 return Idx != PoisonMaskElem && Idx != Val;
8053 }) ||
8054 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
8055 UndefCnt > Sz / 2)
8056 return std::nullopt;
8057 UsedVals.set(Val);
8058 for (unsigned K = 0; K < NumParts; ++K) {
8059 unsigned Idx = Val + Sz * K;
8060 if (Idx < VF && I + K < VF)
8061 ResOrder[Idx] = I + K;
8062 }
8063 }
8064 return std::move(ResOrder);
8065 }
8066 unsigned VF = TE.getVectorFactor();
8067 // Try build correct order for extractelement instructions.
8068 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
8069 TE.ReuseShuffleIndices.end());
8070 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8071 all_of(TE.Scalars, [Sz](Value *V) {
8072 if (isa<PoisonValue>(V))
8073 return true;
8074 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8075 return Idx && *Idx < Sz;
8076 })) {
8077 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
8078 "by BinaryOperator and CastInst.");
8079 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8080 if (TE.ReorderIndices.empty())
8081 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8082 else
8083 inversePermutation(TE.ReorderIndices, ReorderMask);
// Remap each reuse-mask entry to the position of its extract index so the
// reuses follow the source vector's lane layout.
8084 for (unsigned I = 0; I < VF; ++I) {
8085 int &Idx = ReusedMask[I];
8086 if (Idx == PoisonMaskElem)
8087 continue;
8088 Value *V = TE.Scalars[ReorderMask[Idx]];
8089 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
8090 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
8091 }
8092 }
8093 // Build the order of the VF size, need to reorder reuses shuffles, they are
8094 // always of VF size.
8095 OrdersType ResOrder(VF);
8096 std::iota(ResOrder.begin(), ResOrder.end(), 0);
8097 auto *It = ResOrder.begin();
8098 for (unsigned K = 0; K < VF; K += Sz) {
8099 OrdersType CurrentOrder(TE.ReorderIndices);
8100 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
8101 if (SubMask.front() == PoisonMaskElem)
8102 std::iota(SubMask.begin(), SubMask.end(), 0);
8103 reorderOrder(CurrentOrder, SubMask);
8104 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
8105 std::advance(It, Sz);
8106 }
8107 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
8108 return Data.index() == Data.value();
8109 }))
8110 return std::nullopt; // No need to reorder.
8111 return std::move(ResOrder);
8112 }
// Strided nodes visited bottom-up: an empty or reversed order needs no extra
// reordering unless the user is a binary operation.
8113 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8114 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8115 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8116 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8117 return std::nullopt;
// Vectorized/split/strided/compress nodes: use the precomputed ReorderIndices.
// NOTE(review): part of this condition (orig. line 8122) is elided here.
8118 if (TE.State == TreeEntry::SplitVectorize ||
8119 ((TE.State == TreeEntry::Vectorize ||
8120 TE.State == TreeEntry::StridedVectorize ||
8121 TE.State == TreeEntry::CompressVectorize) &&
8123 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
8124 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8125 "Alternate instructions are only supported by "
8126 "BinaryOperator and CastInst.");
8127 return TE.ReorderIndices;
8128 }
// Alternate-opcode nodes when rotating the whole graph: order lanes by the
// main/alt shuffle mask.
8129 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8130 TE.isAltShuffle()) {
8131 assert(TE.ReuseShuffleIndices.empty() &&
8132 "ReuseShuffleIndices should be "
8133 "empty for alternate instructions.");
8134 SmallVector<int> Mask;
8135 TE.buildAltOpShuffleMask(
8136 [&](Instruction *I) {
8137 assert(TE.getMatchingMainOpOrAltOp(I) &&
8138 "Unexpected main/alternate opcode");
8139 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8140 },
8141 Mask);
8142 const int VF = TE.getVectorFactor();
8143 OrdersType ResOrder(VF, VF);
8144 for (unsigned I : seq<unsigned>(VF)) {
8145 if (Mask[I] == PoisonMaskElem)
8146 continue;
8147 ResOrder[Mask[I] % VF] = I;
8148 }
8149 return std::move(ResOrder);
8150 }
8151 if (!TE.ReorderIndices.empty())
8152 return TE.ReorderIndices;
// Vectorized PHIs: sort lanes by their first users (insert/extract elements,
// dominance order of user blocks) so the produced vector feeds users in order.
8153 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8154 if (!TE.ReorderIndices.empty())
8155 return TE.ReorderIndices;
8156
// For each PHI, find the head of the single-use insertelement chain (the
// buildvector start) within the same block, if any.
8157 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8158 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8159 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8160 continue;
8161 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8162 if (!II)
8163 continue;
8164 Instruction *BVHead = nullptr;
8165 BasicBlock *BB = II->getParent();
8166 while (II && II->hasOneUse() && II->getParent() == BB) {
8167 BVHead = II;
8168 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8169 }
8170 I = BVHead;
8171 }
8172
8173 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8174 assert(BB1 != BB2 && "Expected different basic blocks.");
8175 if (!DT->isReachableFromEntry(BB1))
8176 return false;
8177 if (!DT->isReachableFromEntry(BB2))
8178 return true;
8179 auto *NodeA = DT->getNode(BB1);
8180 auto *NodeB = DT->getNode(BB2);
8181 assert(NodeA && "Should only process reachable instructions");
8182 assert(NodeB && "Should only process reachable instructions");
8183 assert((NodeA == NodeB) ==
8184 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8185 "Different nodes should have different DFS numbers");
8186 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8187 };
8188 auto PHICompare = [&](unsigned I1, unsigned I2) {
8189 Value *V1 = TE.Scalars[I1];
8190 Value *V2 = TE.Scalars[I2];
8191 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8192 return false;
8193 if (isa<PoisonValue>(V1))
8194 return true;
8195 if (isa<PoisonValue>(V2))
8196 return false;
8197 if (V1->getNumUses() < V2->getNumUses())
8198 return true;
8199 if (V1->getNumUses() > V2->getNumUses())
8200 return false;
8201 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8202 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8203 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8204 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8205 FirstUserOfPhi2->getParent());
8206 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8207 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8208 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8209 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8210 if (IE1 && !IE2)
8211 return true;
8212 if (!IE1 && IE2)
8213 return false;
8214 if (IE1 && IE2) {
8215 if (UserBVHead[I1] && !UserBVHead[I2])
8216 return true;
8217 if (!UserBVHead[I1])
8218 return false;
8219 if (UserBVHead[I1] == UserBVHead[I2])
8220 return getElementIndex(IE1) < getElementIndex(IE2);
8221 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8222 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8223 UserBVHead[I2]->getParent());
8224 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8225 }
8226 if (EE1 && !EE2)
8227 return true;
8228 if (!EE1 && EE2)
8229 return false;
8230 if (EE1 && EE2) {
8231 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8232 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8233 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8234 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8235 if (!Inst2 && !P2)
8236 return Inst1 || P1;
8237 if (EE1->getOperand(0) == EE2->getOperand(0))
8238 return getElementIndex(EE1) < getElementIndex(EE2);
8239 if (!Inst1 && Inst2)
8240 return false;
8241 if (Inst1 && Inst2) {
8242 if (Inst1->getParent() != Inst2->getParent())
8243 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8244 return Inst1->comesBefore(Inst2);
8245 }
8246 if (!P1 && P2)
8247 return false;
8248 assert(P1 && P2 &&
8249 "Expected either instructions or arguments vector operands.");
8250 return P1->getArgNo() < P2->getArgNo();
8251 }
8252 return false;
8253 };
8254 OrdersType Phis(TE.Scalars.size());
8255 std::iota(Phis.begin(), Phis.end(), 0);
8256 stable_sort(Phis, PHICompare);
8257 if (isIdentityOrder(Phis))
8258 return std::nullopt; // No need to reorder.
8259 return std::move(Phis);
8260 }
// Gather nodes of a uniform type: several ordering heuristics in turn.
8261 if (TE.isGather() &&
8262 (!TE.hasState() || !TE.isAltShuffle() ||
8263 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8264 allSameType(TE.Scalars)) {
8265 // TODO: add analysis of other gather nodes with extractelement
8266 // instructions and other values/instructions, not only undefs.
8267 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8269 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8270 all_of(TE.Scalars, [](Value *V) {
8271 auto *EE = dyn_cast<ExtractElementInst>(V);
8272 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8273 })) {
8274 // Check that gather of extractelements can be represented as
8275 // just a shuffle of a single vector.
8276 OrdersType CurrentOrder;
8277 bool Reuse =
8278 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8279 if (Reuse || !CurrentOrder.empty())
8280 return std::move(CurrentOrder);
8281 }
8282 // If the gather node is <undef, v, .., poison> and
8283 // insertelement poison, v, 0 [+ permute]
8284 // is cheaper than
8285 // insertelement poison, v, n - try to reorder.
8286 // If rotating the whole graph, exclude the permute cost, the whole graph
8287 // might be transformed.
8288 int Sz = TE.Scalars.size();
8289 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8290 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8291 const auto *It = find_if_not(TE.Scalars, isConstant);
8292 if (It == TE.Scalars.begin())
8293 return OrdersType();
8294 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
8295 if (It != TE.Scalars.end()) {
8296 OrdersType Order(Sz, Sz);
8297 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8298 Order[Idx] = 0;
8299 fixupOrderingIndices(Order);
8300 SmallVector<int> Mask;
8301 inversePermutation(Order, Mask);
// Compare the TTI cost of inserting at lane 0 (+optional permute) against
// inserting directly at lane Idx; rotate only when it is cheaper.
8302 InstructionCost PermuteCost =
8303 TopToBottom
8304 ? 0
8305 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8306 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8307 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8308 PoisonValue::get(Ty), *It);
8309 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8310 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8311 PoisonValue::get(Ty), *It);
8312 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8313 OrdersType Order(Sz, Sz);
8314 Order[Idx] = 0;
8315 return std::move(Order);
8316 }
8317 }
8318 }
8319 if (isSplat(TE.Scalars))
8320 return std::nullopt;
8321 if (TE.Scalars.size() >= 3)
8322 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8323 return Order;
8324 // Check if can include the order of vectorized loads. For masked gathers do
8325 // extra analysis later, so include such nodes into a special list.
8326 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8327 SmallVector<Value *> PointerOps;
8328 StridedPtrInfo SPtrInfo;
8329 OrdersType CurrentOrder;
8330 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8331 CurrentOrder, PointerOps, SPtrInfo);
// NOTE(review): the condition guarding this return (orig. lines 8332-8333,
// presumably testing Res) is elided in this excerpt.
8334 return std::move(CurrentOrder);
8335 }
8336 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
8337 // has been auditted for correctness with non-power-of-two vectors.
8338 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
8339 if (std::optional<OrdersType> CurrentOrder =
8340 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8341 return CurrentOrder;
8342 }
8343 return std::nullopt;
8344}
8345
8346/// Checks if the given mask is a "clustered" mask with the same clusters of
8347/// size \p Sz, which are not identity submasks.
// NOTE(review): the signature line (orig. 8348, presumably taking the
// ArrayRef<int> Mask parameter used below) is elided in this excerpt.
// Returns true iff every Sz-sized cluster equals the first one and the first
// cluster is not an identity mask.
8349 unsigned Sz) {
8350 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8351 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8352 return false;
// Every subsequent cluster must match the first exactly.
8353 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8354 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8355 if (Cluster != FirstCluster)
8356 return false;
8357 }
8358 return true;
8359}
8360
// Applies \p Mask to the node's reuse shuffle indices; for gather nodes whose
// resulting reuse mask is a repeated non-identity cluster, additionally folds
// the reorder into the scalars themselves and resets the reuse mask to
// identity submasks.
// NOTE(review): one line of the guarding condition (orig. 8367) is elided in
// this excerpt; code kept byte-identical.
8361void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8362 // Reorder reuses mask.
8363 reorderReuses(TE.ReuseShuffleIndices, Mask);
8364 const unsigned Sz = TE.Scalars.size();
8365 // For vectorized and non-clustered reused no need to do anything else.
8366 if (!TE.isGather() ||
8368 Sz) ||
8369 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8370 return;
// Combine the node's reorder with the reuses into a single mask.
8371 SmallVector<int> NewMask;
8372 inversePermutation(TE.ReorderIndices, NewMask);
8373 addMask(NewMask, TE.ReuseShuffleIndices);
8374 // Clear reorder since it is going to be applied to the new mask.
8375 TE.ReorderIndices.clear();
8376 // Try to improve gathered nodes with clustered reuses, if possible.
8377 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8378 SmallVector<unsigned> NewOrder(Slice);
8379 inversePermutation(NewOrder, NewMask);
8380 reorderScalars(TE.Scalars, NewMask);
8381 // Fill the reuses mask with the identity submasks.
8382 for (auto *It = TE.ReuseShuffleIndices.begin(),
8383 *End = TE.ReuseShuffleIndices.end();
8384 It != End; std::advance(It, Sz))
8385 std::iota(It, std::next(It, Sz), 0);
8386}
8387
// Order-combining helper (the signature's first line, orig. 8388, is elided in
// this excerpt, so the function's name and the declaration of Order are not
// visible here — presumably a MutableArrayRef<unsigned> Order parameter;
// confirm against the full source).
// Fills the "unset" slots of Order (entries equal to Order.size()): with the
// identity index when no SecondaryOrder is given, otherwise with the
// corresponding SecondaryOrder entry, skipping targets already used by Order.
8389 ArrayRef<unsigned> SecondaryOrder) {
8390 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8391 "Expected same size of orders");
8392 size_t Sz = Order.size();
// Record which target positions Order already claims.
8393 SmallBitVector UsedIndices(Sz);
8394 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8395 if (Order[Idx] != Sz)
8396 UsedIndices.set(Order[Idx]);
8397 }
8398 if (SecondaryOrder.empty()) {
// No secondary order: complete unset slots with the identity, avoiding
// positions already used elsewhere.
8399 for (unsigned Idx : seq<unsigned>(0, Sz))
8400 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8401 Order[Idx] = Idx;
8402 } else {
// Otherwise borrow the secondary order's entry when it is set and its target
// is still free.
8403 for (unsigned Idx : seq<unsigned>(0, Sz))
8404 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8405 !UsedIndices.test(SecondaryOrder[Idx]))
8406 Order[Idx] = SecondaryOrder[Idx];
8407 }
8408}
8409
// NOTE(review): the signature of this member function (presumably
// `bool BoUpSLP::isProfitableToReorder() const {`) and its first guard
// condition (original lines 8410-8411) were dropped during extraction --
// confirm against the upstream source.
8412 return false;
8413
// Heuristic thresholds: trees no larger than TinyTree are always considered
// worth reordering; PhiOpsLimit / GatherLoadsLimit bound compile time.
8414 constexpr unsigned TinyVF = 2;
8415 constexpr unsigned TinyTree = 10;
8416 constexpr unsigned PhiOpsLimit = 12;
8417 constexpr unsigned GatherLoadsLimit = 2;
8418 if (VectorizableTree.size() <= TinyTree)
8419 return true;
// Special-case roots: a store/PHI root (or a tiny ptrtoint/icmp root) with
// no own reordering requirement gets a deeper structural analysis below.
8420 if (VectorizableTree.front()->hasState() &&
8421 !VectorizableTree.front()->isGather() &&
8422 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8423 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8424 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8425 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8426 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8427 VectorizableTree.front()->ReorderIndices.empty()) {
8428 // Check if the tree has only single store and single (unordered) load node,
8429 // other nodes are phis or geps/binops, combined with phis, and/or single
8430 // gather load node
// A tiny PHI root with too many incoming operands is too costly to analyze.
8431 if (VectorizableTree.front()->hasState() &&
8432 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8433 VectorizableTree.front()->Scalars.size() == TinyVF &&
8434 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8435 return false;
8436 // Single node, which require reorder - skip.
8437 if (VectorizableTree.front()->hasState() &&
8438 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8439 VectorizableTree.front()->ReorderIndices.empty()) {
// Count split-vectorize nodes whose reordering can be absorbed by a
// commutative vectorized user; at most one keeps reordering cheap.
8440 const unsigned ReorderedSplitsCnt =
8441 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8442 return TE->State == TreeEntry::SplitVectorize &&
8443 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8444 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8445 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8446 });
// If (almost) every remaining node is already order-free, reordering buys
// nothing for a store-rooted tree.
8447 if (ReorderedSplitsCnt <= 1 &&
8448 static_cast<unsigned>(count_if(
8449 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8450 return ((!TE->isGather() &&
8451 (TE->ReorderIndices.empty() ||
8452 (TE->UserTreeIndex.UserTE &&
8453 TE->UserTreeIndex.UserTE->State ==
8454 TreeEntry::Vectorize &&
8455 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8456 .empty()))) ||
8457 (TE->isGather() && TE->ReorderIndices.empty() &&
8458 (!TE->hasState() || TE->isAltShuffle() ||
8459 TE->getOpcode() == Instruction::Load ||
8460 TE->getOpcode() == Instruction::ZExt ||
8461 TE->getOpcode() == Instruction::SExt))) &&
8462 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8463 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8464 return !isConstant(V) && isVectorized(V);
8465 }));
8466 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8467 return false;
8468 }
// Walk the remaining nodes: reordering stays profitable only while the tree
// looks like a phi/gep/binop tree with at most one vectorized load and a
// bounded number of gathered loads.
8469 bool HasPhis = false;
8470 bool HasLoad = true;
8471 unsigned GatherLoads = 0;
8472 for (const std::unique_ptr<TreeEntry> &TE :
8473 ArrayRef(VectorizableTree).drop_front()) {
8474 if (TE->State == TreeEntry::SplitVectorize)
8475 continue;
// NOTE(review): one operand of each of the next two conditions was dropped
// during extraction (original lines 8478 and 8481) -- confirm upstream.
8476 if (!TE->hasState()) {
8477 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8479 continue;
8480 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8482 continue;
8483 return true;
8484 }
8485 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8486 if (!TE->isGather()) {
8487 HasLoad = false;
8488 continue;
8489 }
8490 if (HasLoad)
8491 return true;
8492 ++GatherLoads;
8493 if (GatherLoads >= GatherLoadsLimit)
8494 return true;
8495 }
8496 if (TE->getOpcode() == Instruction::GetElementPtr ||
8497 Instruction::isBinaryOp(TE->getOpcode()))
8498 continue;
8499 if (TE->getOpcode() != Instruction::PHI &&
8500 (!TE->hasCopyableElements() ||
8501 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8502 TE->Scalars.size() / 2))
8503 return true;
8504 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8505 TE->getNumOperands() > PhiOpsLimit)
8506 return false;
8507 HasPhis = true;
8508 }
8509 return !HasPhis;
8510 }
8511 return true;
8512}
8513
8514void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8515 ArrayRef<int> MaskOrder) {
8516 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8517 SmallVector<int> NewMask(getVectorFactor());
8518 SmallVector<int> NewMaskOrder(getVectorFactor());
8519 std::iota(NewMask.begin(), NewMask.end(), 0);
8520 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8521 if (Idx == 0) {
8522 copy(Mask, NewMask.begin());
8523 copy(MaskOrder, NewMaskOrder.begin());
8524 } else {
8525 assert(Idx == 1 && "Expected either 0 or 1 index.");
8526 unsigned Offset = CombinedEntriesWithIndices.back().second;
8527 for (unsigned I : seq<unsigned>(Mask.size())) {
8528 NewMask[I + Offset] = Mask[I] + Offset;
8529 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8530 }
8531 }
8532 reorderScalars(Scalars, NewMask);
8533 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8534 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8535 ReorderIndices.clear();
8536}
8537
// Top-to-bottom reordering stage: collects, for every vectorization factor,
// the nodes that prefer a non-identity lane order, votes for the most used
// order per VF and applies the winner to all same-VF nodes of the graph.
// NOTE(review): the function header (presumably
// `void BoUpSLP::reorderTopToBottom() {`) and several declaration lines
// (originals 8538, 8540, 8543, 8546, 8553) holding the VFToOrderedEntries /
// GathersToOrders / PhisToOrders map declarations were dropped during
// extraction -- confirm against the upstream source.
8539 // Maps VF to the graph nodes.
8541 // ExtractElement gather nodes which can be vectorized and need to handle
8542 // their ordering.
8544
8545 // Phi nodes can have preferred ordering based on their result users
8547
8548 // AltShuffles can also have a preferred ordering that leads to fewer
8549 // instructions, e.g., the addsub instruction in x86.
8550 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8551
8552 // Maps a TreeEntry to the reorder indices of external users.
8554 ExternalUserReorderMap;
8555 // Find all reorderable nodes with the given VF.
8556 // Currently the are vectorized stores,loads,extracts + some gathering of
8557 // extracts.
8558 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8559 const std::unique_ptr<TreeEntry> &TE) {
8560 // Look for external users that will probably be vectorized.
8561 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8562 findExternalStoreUsersReorderIndices(TE.get());
8563 if (!ExternalUserReorderIndices.empty()) {
8564 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8565 ExternalUserReorderMap.try_emplace(TE.get(),
8566 std::move(ExternalUserReorderIndices));
8567 }
8568
8569 // Patterns like [fadd,fsub] can be combined into a single instruction in
8570 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8571 // to take into account their order when looking for the most used order.
8572 if (TE->hasState() && TE->isAltShuffle() &&
8573 TE->State != TreeEntry::SplitVectorize) {
8574 Type *ScalarTy = TE->Scalars[0]->getType();
8575 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8576 unsigned Opcode0 = TE->getOpcode();
8577 unsigned Opcode1 = TE->getAltOpcode();
8578 SmallBitVector OpcodeMask(
8579 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8580 // If this pattern is supported by the target then we consider the order.
8581 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8582 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8583 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8584 }
8585 // TODO: Check the reverse order too.
8586 }
8587
8588 bool IgnoreReorder =
8589 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8590 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8591 VectorizableTree.front()->getOpcode() == Instruction::Store);
8592 if (std::optional<OrdersType> CurrentOrder =
8593 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
// Do not include ordering for nodes used in the alt opcode vectorization --
// cheaper to reorder those subgraphs in the bottom-to-top stage (the
// shuffle already present there can absorb the order change).
8601 unsigned Cnt = 0;
8602 const TreeEntry *UserTE = TE.get();
8603 while (UserTE && Cnt < RecursionMaxDepth) {
8604 if (!UserTE->UserTreeIndex)
8605 break;
8606 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8607 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8608 UserTE->UserTreeIndex.UserTE->Idx != 0)
8609 return;
8610 UserTE = UserTE->UserTreeIndex.UserTE;
8611 ++Cnt;
8612 }
8613 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8614 if (!(TE->State == TreeEntry::Vectorize ||
8615 TE->State == TreeEntry::StridedVectorize ||
8616 TE->State == TreeEntry::SplitVectorize ||
8617 TE->State == TreeEntry::CompressVectorize) ||
8618 !TE->ReuseShuffleIndices.empty())
8619 GathersToOrders.try_emplace(TE.get(), *CurrentOrder)
8620 ;
8621 if (TE->State == TreeEntry::Vectorize &&
8622 TE->getOpcode() == Instruction::PHI)
8623 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8624 }
8625 });
8626
8627 // Reorder the graph nodes according to their vectorization factor.
// Iterate VF downwards (VF -= 2 - (VF & 1) steps odd VFs by 1, even by 2).
8628 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8629 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8630 auto It = VFToOrderedEntries.find(VF);
8631 if (It == VFToOrderedEntries.end())
8632 continue;
8633 // Try to find the most profitable order. We just are looking for the most
8634 // used order and reorder scalar elements in the nodes according to this
8635 // mostly used order.
8636 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8637 // Delete VF entry upon exit.
8638 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8639
8640 // All operands are reordered and used only in this node - propagate the
8641 // most used order to the user node.
// NOTE(review): the declaration of OrdersUses (a map from OrdersType to a
// use count; originals 8642-8643) was partially dropped during extraction.
8644 OrdersUses;
8645 for (const TreeEntry *OpTE : OrderedEntries) {
8646 // No need to reorder this nodes, still need to extend and to use shuffle,
8647 // just need to merge reordering shuffle and the reuse shuffle.
8648 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8649 OpTE->State != TreeEntry::SplitVectorize)
8650 continue;
8651 // Count number of orders uses.
8652 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8653 &PhisToOrders]() -> const OrdersType & {
8654 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8655 auto It = GathersToOrders.find(OpTE);
8656 if (It != GathersToOrders.end())
8657 return It->second;
8658 }
8659 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8660 auto It = AltShufflesToOrders.find(OpTE);
8661 if (It != AltShufflesToOrders.end())
8662 return It->second;
8663 }
8664 if (OpTE->State == TreeEntry::Vectorize &&
8665 OpTE->getOpcode() == Instruction::PHI) {
8666 auto It = PhisToOrders.find(OpTE);
8667 if (It != PhisToOrders.end())
8668 return It->second;
8669 }
8670 return OpTE->ReorderIndices;
8671 }();
8672 // First consider the order of the external scalar users.
8673 auto It = ExternalUserReorderMap.find(OpTE);
8674 if (It != ExternalUserReorderMap.end()) {
8675 const auto &ExternalUserReorderIndices = It->second;
8676 // If the OpTE vector factor != number of scalars - use natural order,
8677 // it is an attempt to reorder node with reused scalars but with
8678 // external uses.
8679 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8680 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8681 ExternalUserReorderIndices.size();
8682 } else {
8683 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8684 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8685 }
8686 // No other useful reorder data in this entry.
8687 if (Order.empty())
8688 continue;
8689 }
8690 // Stores actually store the mask, not the order, need to invert.
8691 if (OpTE->State == TreeEntry::Vectorize &&
8692 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8693 assert(!OpTE->isAltShuffle() &&
8694 "Alternate instructions are only supported by BinaryOperator "
8695 "and CastInst.");
8696 SmallVector<int> Mask;
8697 inversePermutation(Order, Mask);
8698 unsigned E = Order.size();
8699 OrdersType CurrentOrder(E, E);
8700 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8701 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8702 });
8703 fixupOrderingIndices(CurrentOrder);
8704 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8705 } else {
8706 ++OrdersUses.try_emplace(Order, 0).first->second;
8707 }
8708 }
8709 if (OrdersUses.empty())
8710 continue;
8711 // Choose the most used order.
8712 unsigned IdentityCnt = 0;
8713 unsigned FilledIdentityCnt = 0;
8714 OrdersType IdentityOrder(VF, VF);
8715 for (auto &Pair : OrdersUses) {
8716 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8717 if (!Pair.first.empty())
8718 FilledIdentityCnt += Pair.second;
8719 IdentityCnt += Pair.second;
8720 combineOrders(IdentityOrder, Pair.first);
8721 }
8722 }
8723 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8724 unsigned Cnt = IdentityCnt;
8725 for (auto &Pair : OrdersUses) {
8726 // Prefer identity order. But, if filled identity found (non-empty order)
8727 // with same number of uses, as the new candidate order, we can choose
8728 // this candidate order.
8729 if (Cnt < Pair.second ||
8730 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8731 Cnt == Pair.second && !BestOrder.empty() &&
8732 isIdentityOrder(BestOrder))) {
8733 combineOrders(Pair.first, BestOrder);
8734 BestOrder = Pair.first;
8735 Cnt = Pair.second;
8736 } else {
8737 combineOrders(BestOrder, Pair.first);
8738 }
8739 }
8740 // Set order of the user node.
8741 if (isIdentityOrder(BestOrder))
8742 continue;
8743 fixupOrderingIndices(BestOrder);
8744 SmallVector<int> Mask;
8745 inversePermutation(BestOrder, Mask);
8746 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8747 unsigned E = BestOrder.size();
8748 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8749 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8750 });
8751 // Do an actual reordering, if profitable.
8752 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8753 // Just do the reordering for the nodes with the given VF.
8754 if (TE->Scalars.size() != VF) {
8755 if (TE->ReuseShuffleIndices.size() == VF) {
8756 assert(TE->State != TreeEntry::SplitVectorize &&
8757 "Split vectorized not expected.");
8758 // Need to reorder the reuses masks of the operands with smaller VF to
8759 // be able to find the match between the graph nodes and scalar
8760 // operands of the given node during vectorization/cost estimation.
8761 assert(
8762 (!TE->UserTreeIndex ||
8763 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8764 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8765 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8766 "All users must be of VF size.");
8767 if (SLPReVec) {
8768 assert(SLPReVec && "Only supported by REVEC.");
8769 // ShuffleVectorInst does not do reorderOperands (and it should not
8770 // because ShuffleVectorInst supports only a limited set of
8771 // patterns). Only do reorderNodeWithReuses if the user is not
8772 // ShuffleVectorInst.
8773 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8774 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8775 continue;
8776 }
8777 // Update ordering of the operands with the smaller VF than the given
8778 // one.
8779 reorderNodeWithReuses(*TE, Mask);
8780 // Update orders in user split vectorize nodes.
8781 if (TE->UserTreeIndex &&
8782 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8783 TE->UserTreeIndex.UserTE->reorderSplitNode(
8784 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8785 }
8786 continue;
8787 }
// NOTE(review): the first line of the isa<...> condition below (original
// 8792, presumably `(isa<ExtractElementInst, ExtractValueInst, LoadInst,
// StoreInst,`) was dropped during extraction -- confirm upstream.
8788 if ((TE->State == TreeEntry::SplitVectorize &&
8789 TE->ReuseShuffleIndices.empty()) ||
8790 ((TE->State == TreeEntry::Vectorize ||
8791 TE->State == TreeEntry::StridedVectorize ||
8792 TE->State == TreeEntry::CompressVectorize) &&
8793 InsertElementInst>(TE->getMainOp()) ||
8794 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8795 assert(
8796 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8797 TE->ReuseShuffleIndices.empty())) &&
8798 "Alternate instructions are only supported by BinaryOperator "
8799 "and CastInst.");
8800 // Build correct orders for extract{element,value}, loads,
8801 // stores and alternate (split) nodes.
8802 reorderOrder(TE->ReorderIndices, Mask);
8803 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8804 TE->reorderOperands(Mask);
8805 } else {
8806 // Reorder the node and its operands.
8807 TE->reorderOperands(Mask);
8808 assert(TE->ReorderIndices.empty() &&
8809 "Expected empty reorder sequence.");
8810 reorderScalars(TE->Scalars, Mask);
8811 }
8812 if (!TE->ReuseShuffleIndices.empty()) {
8813 // Apply reversed order to keep the original ordering of the reused
8814 // elements to avoid extra reorder indices shuffling.
8815 OrdersType CurrentOrder;
8816 reorderOrder(CurrentOrder, MaskOrder);
8817 SmallVector<int> NewReuses;
8818 inversePermutation(CurrentOrder, NewReuses);
8819 addMask(NewReuses, TE->ReuseShuffleIndices);
8820 TE->ReuseShuffleIndices.swap(NewReuses);
8821 } else if (TE->UserTreeIndex &&
8822 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8823 // Update orders in user split vectorize nodes.
8824 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8825 Mask, MaskOrder);
8826 }
8827 }
8828}
8829
8830void BoUpSLP::buildReorderableOperands(
8831 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8832 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8833 SmallVectorImpl<TreeEntry *> &GatherOps) {
8834 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8835 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8836 return OpData.first == I &&
8837 (OpData.second->State == TreeEntry::Vectorize ||
8838 OpData.second->State == TreeEntry::StridedVectorize ||
8839 OpData.second->State == TreeEntry::CompressVectorize ||
8840 OpData.second->State == TreeEntry::SplitVectorize);
8841 }))
8842 continue;
8843 // Do not request operands, if they do not exist.
8844 if (UserTE->hasState()) {
8845 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8846 UserTE->getOpcode() == Instruction::ExtractValue)
8847 continue;
8848 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8849 continue;
8850 if (UserTE->getOpcode() == Instruction::Store &&
8851 UserTE->State == TreeEntry::Vectorize && I == 1)
8852 continue;
8853 if (UserTE->getOpcode() == Instruction::Load &&
8854 (UserTE->State == TreeEntry::Vectorize ||
8855 UserTE->State == TreeEntry::StridedVectorize ||
8856 UserTE->State == TreeEntry::CompressVectorize))
8857 continue;
8858 }
8859 TreeEntry *TE = getOperandEntry(UserTE, I);
8860 assert(TE && "Expected operand entry.");
8861 if (!TE->isGather()) {
8862 // Add the node to the list of the ordered nodes with the identity
8863 // order.
8864 Edges.emplace_back(I, TE);
8865 // Add ScatterVectorize nodes to the list of operands, where just
8866 // reordering of the scalars is required. Similar to the gathers, so
8867 // simply add to the list of gathered ops.
8868 // If there are reused scalars, process this node as a regular vectorize
8869 // node, just reorder reuses mask.
8870 if (TE->State == TreeEntry::ScatterVectorize &&
8871 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8872 GatherOps.push_back(TE);
8873 continue;
8874 }
8875 if (ReorderableGathers.contains(TE))
8876 GatherOps.push_back(TE);
8877 }
8878}
8879
8880void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8881 struct TreeEntryCompare {
8882 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8883 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8884 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8885 return LHS->Idx < RHS->Idx;
8886 }
8887 };
8889 DenseSet<const TreeEntry *> GathersToOrders;
8890 // Find all reorderable leaf nodes with the given VF.
8891 // Currently the are vectorized loads,extracts without alternate operands +
8892 // some gathering of extracts.
8894 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8895 if (TE->State != TreeEntry::Vectorize &&
8896 TE->State != TreeEntry::StridedVectorize &&
8897 TE->State != TreeEntry::CompressVectorize &&
8898 TE->State != TreeEntry::SplitVectorize)
8899 NonVectorized.insert(TE.get());
8900 if (std::optional<OrdersType> CurrentOrder =
8901 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8902 Queue.push(TE.get());
8903 if (!(TE->State == TreeEntry::Vectorize ||
8904 TE->State == TreeEntry::StridedVectorize ||
8905 TE->State == TreeEntry::CompressVectorize ||
8906 TE->State == TreeEntry::SplitVectorize) ||
8907 !TE->ReuseShuffleIndices.empty())
8908 GathersToOrders.insert(TE.get());
8909 }
8910 }
8911
8912 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8913 // I.e., if the node has operands, that are reordered, try to make at least
8914 // one operand order in the natural order and reorder others + reorder the
8915 // user node itself.
8916 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8917 while (!Queue.empty()) {
8918 // 1. Filter out only reordered nodes.
8919 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8920 TreeEntry *TE = Queue.top();
8921 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8922 Queue.pop();
8923 SmallVector<TreeEntry *> OrderedOps(1, TE);
8924 while (!Queue.empty()) {
8925 TE = Queue.top();
8926 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8927 break;
8928 Queue.pop();
8929 OrderedOps.push_back(TE);
8930 }
8931 for (TreeEntry *TE : OrderedOps) {
8932 if (!(TE->State == TreeEntry::Vectorize ||
8933 TE->State == TreeEntry::StridedVectorize ||
8934 TE->State == TreeEntry::CompressVectorize ||
8935 TE->State == TreeEntry::SplitVectorize ||
8936 (TE->isGather() && GathersToOrders.contains(TE))) ||
8937 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8938 !Visited.insert(TE).second)
8939 continue;
8940 // Build a map between user nodes and their operands order to speedup
8941 // search. The graph currently does not provide this dependency directly.
8942 Users.first = TE->UserTreeIndex.UserTE;
8943 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8944 }
8945 if (Users.first) {
8946 auto &Data = Users;
8947 if (Data.first->State == TreeEntry::SplitVectorize) {
8948 assert(
8949 Data.second.size() <= 2 &&
8950 "Expected not greater than 2 operands for split vectorize node.");
8951 if (any_of(Data.second,
8952 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8953 continue;
8954 // Update orders in user split vectorize nodes.
8955 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8956 "Expected exactly 2 entries.");
8957 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8958 TreeEntry &OpTE = *VectorizableTree[P.first];
8959 OrdersType Order = OpTE.ReorderIndices;
8960 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8961 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8962 continue;
8963 const auto BestOrder =
8964 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8965 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8966 continue;
8967 Order = *BestOrder;
8968 }
8969 fixupOrderingIndices(Order);
8970 SmallVector<int> Mask;
8971 inversePermutation(Order, Mask);
8972 const unsigned E = Order.size();
8973 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8974 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8975 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8976 });
8977 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8978 // Clear ordering of the operand.
8979 if (!OpTE.ReorderIndices.empty()) {
8980 OpTE.ReorderIndices.clear();
8981 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8982 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8983 } else {
8984 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8985 reorderScalars(OpTE.Scalars, Mask);
8986 }
8987 }
8988 if (Data.first->ReuseShuffleIndices.empty() &&
8989 !Data.first->ReorderIndices.empty()) {
8990 // Insert user node to the list to try to sink reordering deeper in
8991 // the graph.
8992 Queue.push(Data.first);
8993 }
8994 continue;
8995 }
8996 // Check that operands are used only in the User node.
8997 SmallVector<TreeEntry *> GatherOps;
8998 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8999 GatherOps);
9000 // All operands are reordered and used only in this node - propagate the
9001 // most used order to the user node.
9004 OrdersUses;
9005 // Do the analysis for each tree entry only once, otherwise the order of
9006 // the same node my be considered several times, though might be not
9007 // profitable.
9010 for (const auto &Op : Data.second) {
9011 TreeEntry *OpTE = Op.second;
9012 if (!VisitedOps.insert(OpTE).second)
9013 continue;
9014 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
9015 continue;
9016 const auto Order = [&]() -> const OrdersType {
9017 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
9018 return getReorderingData(*OpTE, /*TopToBottom=*/false,
9019 IgnoreReorder)
9020 .value_or(OrdersType(1));
9021 return OpTE->ReorderIndices;
9022 }();
9023 // The order is partially ordered, skip it in favor of fully non-ordered
9024 // orders.
9025 if (Order.size() == 1)
9026 continue;
9027
9028 // Check that the reordering does not increase number of shuffles, i.e.
9029 // same-values-nodes has same parents or their parents has same parents.
9030 if (!Order.empty() && !isIdentityOrder(Order)) {
9031 Value *Root = OpTE->hasState()
9032 ? OpTE->getMainOp()
9033 : *find_if_not(OpTE->Scalars, isConstant);
9034 auto GetSameNodesUsers = [&](Value *Root) {
9036 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
9037 if (TE != OpTE && TE->UserTreeIndex &&
9038 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9039 TE->Scalars.size() == OpTE->Scalars.size() &&
9040 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9041 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9042 Res.insert(TE->UserTreeIndex.UserTE);
9043 }
9044 for (const TreeEntry *TE : getTreeEntries(Root)) {
9045 if (TE != OpTE && TE->UserTreeIndex &&
9046 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9047 TE->Scalars.size() == OpTE->Scalars.size() &&
9048 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9049 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9050 Res.insert(TE->UserTreeIndex.UserTE);
9051 }
9052 return Res.takeVector();
9053 };
9054 auto GetNumOperands = [](const TreeEntry *TE) {
9055 if (TE->State == TreeEntry::SplitVectorize)
9056 return TE->getNumOperands();
9057 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9058 return CI->arg_size();
9059 return TE->getNumOperands();
9060 };
9061 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
9062 const TreeEntry *TE) {
9064 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9066 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
9069 continue;
9070 const TreeEntry *Op = getOperandEntry(TE, Idx);
9071 if (Op->isGather() && Op->hasState()) {
9072 const TreeEntry *VecOp =
9073 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
9074 if (VecOp)
9075 Op = VecOp;
9076 }
9077 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
9078 return false;
9079 }
9080 return true;
9081 };
9082 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
9083 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
9084 if (!RevisitedOps.insert(UTE).second)
9085 return false;
9086 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
9087 !UTE->ReuseShuffleIndices.empty() ||
9088 (UTE->UserTreeIndex &&
9089 UTE->UserTreeIndex.UserTE == Data.first) ||
9090 (Data.first->UserTreeIndex &&
9091 Data.first->UserTreeIndex.UserTE == UTE) ||
9092 (IgnoreReorder && UTE->UserTreeIndex &&
9093 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9094 NodeShouldBeReorderedWithOperands(UTE);
9095 }))
9096 continue;
9097 for (TreeEntry *UTE : Users) {
9099 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
9101 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
9104 continue;
9105 const TreeEntry *Op = getOperandEntry(UTE, Idx);
9106 Visited.erase(Op);
9107 Queue.push(const_cast<TreeEntry *>(Op));
9108 }
9109 }
9110 }
9111 unsigned NumOps = count_if(
9112 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9113 return P.second == OpTE;
9114 });
9115 // Stores actually store the mask, not the order, need to invert.
9116 if (OpTE->State == TreeEntry::Vectorize &&
9117 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9118 assert(!OpTE->isAltShuffle() &&
9119 "Alternate instructions are only supported by BinaryOperator "
9120 "and CastInst.");
9121 SmallVector<int> Mask;
9122 inversePermutation(Order, Mask);
9123 unsigned E = Order.size();
9124 OrdersType CurrentOrder(E, E);
9125 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9126 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9127 });
9128 fixupOrderingIndices(CurrentOrder);
9129 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9130 } else {
9131 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9132 }
9133 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9134 const auto AllowsReordering = [&](const TreeEntry *TE) {
9135 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9136 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9137 (IgnoreReorder && TE->Idx == 0))
9138 return true;
9139 if (TE->isGather()) {
9140 if (GathersToOrders.contains(TE))
9141 return !getReorderingData(*TE, /*TopToBottom=*/false,
9142 IgnoreReorder)
9143 .value_or(OrdersType(1))
9144 .empty();
9145 return true;
9146 }
9147 return false;
9148 };
9149 if (OpTE->UserTreeIndex) {
9150 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9151 if (!VisitedUsers.insert(UserTE).second)
9152 continue;
9153 // May reorder user node if it requires reordering, has reused
9154 // scalars, is an alternate op vectorize node or its op nodes require
9155 // reordering.
9156 if (AllowsReordering(UserTE))
9157 continue;
9158 // Check if users allow reordering.
9159 // Currently look up just 1 level of operands to avoid increase of
9160 // the compile time.
9161 // Profitable to reorder if definitely more operands allow
9162 // reordering rather than those with natural order.
9164 if (static_cast<unsigned>(count_if(
9165 Ops, [UserTE, &AllowsReordering](
9166 const std::pair<unsigned, TreeEntry *> &Op) {
9167 return AllowsReordering(Op.second) &&
9168 Op.second->UserTreeIndex.UserTE == UserTE;
9169 })) <= Ops.size() / 2)
9170 ++Res.first->second;
9171 }
9172 }
9173 if (OrdersUses.empty()) {
9174 Visited.insert_range(llvm::make_second_range(Data.second));
9175 continue;
9176 }
9177 // Choose the most used order.
9178 unsigned IdentityCnt = 0;
9179 unsigned VF = Data.second.front().second->getVectorFactor();
9180 OrdersType IdentityOrder(VF, VF);
9181 for (auto &Pair : OrdersUses) {
9182 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9183 IdentityCnt += Pair.second;
9184 combineOrders(IdentityOrder, Pair.first);
9185 }
9186 }
9187 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9188 unsigned Cnt = IdentityCnt;
9189 for (auto &Pair : OrdersUses) {
9190 // Prefer identity order. But, if filled identity found (non-empty
9191 // order) with same number of uses, as the new candidate order, we can
9192 // choose this candidate order.
9193 if (Cnt < Pair.second) {
9194 combineOrders(Pair.first, BestOrder);
9195 BestOrder = Pair.first;
9196 Cnt = Pair.second;
9197 } else {
9198 combineOrders(BestOrder, Pair.first);
9199 }
9200 }
9201 // Set order of the user node.
9202 if (isIdentityOrder(BestOrder)) {
9203 Visited.insert_range(llvm::make_second_range(Data.second));
9204 continue;
9205 }
9206 fixupOrderingIndices(BestOrder);
9207 // Erase operands from OrderedEntries list and adjust their orders.
9208 VisitedOps.clear();
9209 SmallVector<int> Mask;
9210 inversePermutation(BestOrder, Mask);
9211 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9212 unsigned E = BestOrder.size();
9213 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9214 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9215 });
9216 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9217 TreeEntry *TE = Op.second;
9218 if (!VisitedOps.insert(TE).second)
9219 continue;
9220 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9221 reorderNodeWithReuses(*TE, Mask);
9222 continue;
9223 }
9224 // Gathers are processed separately.
9225 if (TE->State != TreeEntry::Vectorize &&
9226 TE->State != TreeEntry::StridedVectorize &&
9227 TE->State != TreeEntry::CompressVectorize &&
9228 TE->State != TreeEntry::SplitVectorize &&
9229 (TE->State != TreeEntry::ScatterVectorize ||
9230 TE->ReorderIndices.empty()))
9231 continue;
9232 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9233 TE->ReorderIndices.empty()) &&
9234 "Non-matching sizes of user/operand entries.");
9235 reorderOrder(TE->ReorderIndices, Mask);
9236 if (IgnoreReorder && TE == VectorizableTree.front().get())
9237 IgnoreReorder = false;
9238 }
9239 // For gathers just need to reorder its scalars.
9240 for (TreeEntry *Gather : GatherOps) {
9241 assert(Gather->ReorderIndices.empty() &&
9242 "Unexpected reordering of gathers.");
9243 if (!Gather->ReuseShuffleIndices.empty()) {
9244 // Just reorder reuses indices.
9245 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9246 continue;
9247 }
9248 reorderScalars(Gather->Scalars, Mask);
9249 Visited.insert(Gather);
9250 }
9251 // Reorder operands of the user node and set the ordering for the user
9252 // node itself.
9253 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9254 return TE.isAltShuffle() &&
9255 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9256 TE.ReorderIndices.empty());
9257 };
9258 if (Data.first->State != TreeEntry::Vectorize ||
9260 Data.first->getMainOp()) ||
9261 IsNotProfitableAltCodeNode(*Data.first))
9262 Data.first->reorderOperands(Mask);
9263 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9264 IsNotProfitableAltCodeNode(*Data.first) ||
9265 Data.first->State == TreeEntry::StridedVectorize ||
9266 Data.first->State == TreeEntry::CompressVectorize) {
9267 reorderScalars(Data.first->Scalars, Mask);
9268 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9269 /*BottomOrder=*/true);
9270 if (Data.first->ReuseShuffleIndices.empty() &&
9271 !Data.first->ReorderIndices.empty() &&
9272 !IsNotProfitableAltCodeNode(*Data.first)) {
9273 // Insert user node to the list to try to sink reordering deeper in
9274 // the graph.
9275 Queue.push(Data.first);
9276 }
9277 } else {
9278 reorderOrder(Data.first->ReorderIndices, Mask);
9279 }
9280 }
9281 }
9282 // If the reordering is unnecessary, just remove the reorder.
9283 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9284 VectorizableTree.front()->ReuseShuffleIndices.empty())
9285 VectorizableTree.front()->ReorderIndices.clear();
9286}
9287
9288Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9289 if (Entry.hasState() &&
9290 (Entry.getOpcode() == Instruction::Store ||
9291 Entry.getOpcode() == Instruction::Load) &&
9292 Entry.State == TreeEntry::StridedVectorize &&
9293 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9294 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9295 return dyn_cast<Instruction>(Entry.Scalars.front());
9296}
9297
9299 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
9300 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9301 DenseMap<Value *, unsigned> ScalarToExtUses;
9302 // Collect the values that we need to extract from the tree.
9303 for (auto &TEPtr : VectorizableTree) {
9304 TreeEntry *Entry = TEPtr.get();
9305
9306 // No need to handle users of gathered values.
9307 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9308 DeletedNodes.contains(Entry) ||
9309 TransformedToGatherNodes.contains(Entry))
9310 continue;
9311
9312 // For each lane:
9313 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9314 Value *Scalar = Entry->Scalars[Lane];
9315 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9316 continue;
9317
9318 // All uses must be replaced already? No need to do it again.
9319 auto It = ScalarToExtUses.find(Scalar);
9320 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9321 continue;
9322
9323 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9324 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9325 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9326 << " from " << *Scalar << "for many users.\n");
9327 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9328 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9329 ExternalUsesWithNonUsers.insert(Scalar);
9330 continue;
9331 }
9332
9333 // Check if the scalar is externally used as an extra arg.
9334 const auto ExtI = ExternallyUsedValues.find(Scalar);
9335 if (ExtI != ExternallyUsedValues.end()) {
9336 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9337 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9338 << FoundLane << " from " << *Scalar << ".\n");
9339 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9340 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9341 continue;
9342 }
9343 for (User *U : Scalar->users()) {
9344 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9345
9346 Instruction *UserInst = dyn_cast<Instruction>(U);
9347 if (!UserInst || isDeleted(UserInst))
9348 continue;
9349
9350 // Ignore users in the user ignore list.
9351 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9352 continue;
9353
9354 // Skip in-tree scalars that become vectors
9355 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9356 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9357 return !DeletedNodes.contains(UseEntry) &&
9358 !TransformedToGatherNodes.contains(UseEntry);
9359 })) {
9360 // Some in-tree scalars will remain as scalar in vectorized
9361 // instructions. If that is the case, the one in FoundLane will
9362 // be used.
9363 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9364 isa<LoadInst, StoreInst>(UserInst)) ||
9365 isa<CallInst>(UserInst)) ||
9366 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9367 if (DeletedNodes.contains(UseEntry) ||
9368 TransformedToGatherNodes.contains(UseEntry))
9369 return true;
9370 return UseEntry->State == TreeEntry::ScatterVectorize ||
9372 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9373 TTI);
9374 })) {
9375 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9376 << ".\n");
9377 assert(none_of(UseEntries,
9378 [](TreeEntry *UseEntry) {
9379 return UseEntry->isGather();
9380 }) &&
9381 "Bad state");
9382 continue;
9383 }
9384 U = nullptr;
9385 if (It != ScalarToExtUses.end()) {
9386 ExternalUses[It->second].User = nullptr;
9387 break;
9388 }
9389 }
9390
9391 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9392 U = nullptr;
9393 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9394 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9395 << " from lane " << FoundLane << " from " << *Scalar
9396 << ".\n");
9397 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9398 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9399 ExternalUsesWithNonUsers.insert(Scalar);
9400 if (!U)
9401 break;
9402 }
9403 }
9404 }
9405}
9406
9408BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9411 PtrToStoresMap;
9412 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9413 Value *V = TE->Scalars[Lane];
9414 // Don't iterate over the users of constant data.
9415 if (!isa<Instruction>(V))
9416 continue;
9417 // To save compilation time we don't visit if we have too many users.
9418 if (V->hasNUsesOrMore(UsesLimit))
9419 break;
9420
9421 // Collect stores per pointer object.
9422 for (User *U : V->users()) {
9423 auto *SI = dyn_cast<StoreInst>(U);
9424 // Test whether we can handle the store. V might be a global, which could
9425 // be used in a different function.
9426 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9427 !isValidElementType(SI->getValueOperand()->getType()))
9428 continue;
9429 // Skip entry if already
9430 if (isVectorized(U))
9431 continue;
9432
9433 Value *Ptr =
9434 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9435 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9436 SI->getValueOperand()->getType(), Ptr}];
9437 // For now just keep one store per pointer object per lane.
9438 // TODO: Extend this to support multiple stores per pointer per lane
9439 if (StoresVec.size() > Lane)
9440 continue;
9441 if (!StoresVec.empty()) {
9442 std::optional<int64_t> Diff = getPointersDiff(
9443 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9444 SI->getValueOperand()->getType(),
9445 StoresVec.front()->getPointerOperand(), *DL, *SE,
9446 /*StrictCheck=*/true);
9447 // We failed to compare the pointers so just abandon this store.
9448 if (!Diff)
9449 continue;
9450 }
9451 StoresVec.push_back(SI);
9452 }
9453 }
9454 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9455 unsigned I = 0;
9456 for (auto &P : PtrToStoresMap) {
9457 Res[I].swap(P.second);
9458 ++I;
9459 }
9460 return Res;
9461}
9462
9463bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9464 OrdersType &ReorderIndices) const {
9465 // We check whether the stores in StoreVec can form a vector by sorting them
9466 // and checking whether they are consecutive.
9467
9468 // To avoid calling getPointersDiff() while sorting we create a vector of
9469 // pairs {store, offset from first} and sort this instead.
9471 StoreInst *S0 = StoresVec[0];
9472 StoreOffsetVec.emplace_back(0, 0);
9473 Type *S0Ty = S0->getValueOperand()->getType();
9474 Value *S0Ptr = S0->getPointerOperand();
9475 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9476 StoreInst *SI = StoresVec[Idx];
9477 std::optional<int64_t> Diff =
9478 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9479 SI->getPointerOperand(), *DL, *SE,
9480 /*StrictCheck=*/true);
9481 StoreOffsetVec.emplace_back(*Diff, Idx);
9482 }
9483
9484 // Check if the stores are consecutive by checking if their difference is 1.
9485 if (StoreOffsetVec.size() != StoresVec.size())
9486 return false;
9487 sort(StoreOffsetVec, llvm::less_first());
9488 unsigned Idx = 0;
9489 int64_t PrevDist = 0;
9490 for (const auto &P : StoreOffsetVec) {
9491 if (Idx > 0 && P.first != PrevDist + 1)
9492 return false;
9493 PrevDist = P.first;
9494 ++Idx;
9495 }
9496
9497 // Calculate the shuffle indices according to their offset against the sorted
9498 // StoreOffsetVec.
9499 ReorderIndices.assign(StoresVec.size(), 0);
9500 bool IsIdentity = true;
9501 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9502 ReorderIndices[P.second] = I;
9503 IsIdentity &= P.second == I;
9504 }
9505 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9506 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9507 // same convention here.
9508 if (IsIdentity)
9509 ReorderIndices.clear();
9510
9511 return true;
9512}
9513
#ifndef NDEBUG
// NOTE(review): the signature line of this debug-only dump helper is missing
// from this dump. From the body it takes an order vector named `Order` and
// prints its indices, comma-separated, to dbgs() - restore the exact
// signature from upstream.
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
9521
9523BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9524 unsigned NumLanes = TE->Scalars.size();
9525
9526 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9527
9528 // Holds the reorder indices for each candidate store vector that is a user of
9529 // the current TreeEntry.
9530 SmallVector<OrdersType, 1> ExternalReorderIndices;
9531
9532 // Now inspect the stores collected per pointer and look for vectorization
9533 // candidates. For each candidate calculate the reorder index vector and push
9534 // it into `ExternalReorderIndices`
9535 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9536 // If we have fewer than NumLanes stores, then we can't form a vector.
9537 if (StoresVec.size() != NumLanes)
9538 continue;
9539
9540 // If the stores are not consecutive then abandon this StoresVec.
9541 OrdersType ReorderIndices;
9542 if (!canFormVector(StoresVec, ReorderIndices))
9543 continue;
9544
9545 // We now know that the scalars in StoresVec can form a vector instruction,
9546 // so set the reorder indices.
9547 ExternalReorderIndices.push_back(ReorderIndices);
9548 }
9549 return ExternalReorderIndices;
9550}
9551
9553 const SmallDenseSet<Value *> &UserIgnoreLst) {
9554 deleteTree();
9555 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9556 "TreeEntryToStridedPtrInfoMap is not cleared");
9557 UserIgnoreList = &UserIgnoreLst;
9558 if (!allSameType(Roots))
9559 return;
9560 buildTreeRec(Roots, 0, EdgeInfo());
9561}
9562
9564 deleteTree();
9565 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9566 "TreeEntryToStridedPtrInfoMap is not cleared");
9567 if (!allSameType(Roots))
9568 return;
9569 buildTreeRec(Roots, 0, EdgeInfo());
9570}
9571
/// Tries to find subvector of loads and builds new vector of only loads if can
/// be profitable.
// NOTE(review): several lines of this function are missing from this dump;
// each gap is flagged inline below. The visible code is reproduced verbatim -
// restore the missing lines from upstream before building.
// NOTE(review): missing here: the start of the signature (a free function
// taking the BoUpSLP, the value list and analyses).
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
// NOTE(review): missing here: the remaining analysis parameters (SE and TTI
// are referenced in the body below).
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  Type *ScalarTy = getValueType(VL.front());
  if (!isValidElementType(ScalarTy))
    return;
// NOTE(review): missing here: the declaration of ClusteredLoads (per-cluster
// lists of (load, distance) pairs, zipped with ClusteredDistToLoad below).
  SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
  // Cluster the candidate loads by parent block / type / underlying object,
  // recording each load's pointer distance to its cluster's first load.
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
                 getUnderlyingObject(Data.front().first->getPointerOperand(),
// NOTE(review): missing here: the depth argument and the closing of this
// assert conjunct (presumably `RecursionMaxDepth) &&`).
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      // A different load already occupies this distance - not mergeable here.
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      // Start a new cluster seeded with this load at distance 0.
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  // Finds an existing gathered-loads list that the cluster `Loads` can be
  // merged into; on success sets Offset/Start and returns an iterator to it.
  auto FindMatchingLoads =
// NOTE(review): missing here: the lambda's leading parameters (the candidate
// loads list, the GatheredLoads reference, and the ToAdd/Repeated sets).
          &GatheredLoads,
          int64_t &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int64_t> Dist =
// NOTE(review): missing here: the `getPointersDiff(LI->getType(),
// LI->getPointerOperand(),` line of this call.
              Data.front().first->getType(),
              Data.front().first->getPointerOperand(), DL, SE,
              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int64_t, 4> DataDists;
// NOTE(review): missing here: the declaration of DataLoads (a set of the
// loads already present in this gathered list).
          for (std::pair<LoadInst *, int64_t> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (has_single_bit(Data.size() + NumUniques) ||
                 bit_ceil(Data.size()) <
                     bit_ceil(Data.size() + NumUniques))))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int64_t Offset = 0;
    // Merge the cluster into every matching gathered list, accumulating the
    // set of elements that found a home in ToAdd.
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert_range(LocalToAdd);
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    // Some cluster elements were neither merged nor already present.
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      // Appends the still-unplaced cluster elements to the given list.
      auto AddNewLoads =
// NOTE(review): missing here: the lambda's parameter (the destination
// (load, distance) list named Loads).
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        // Distribute the leftovers into every existing list with the same
        // parent block and type instead of creating new lists.
        It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
      }
      // NOTE(review): as dumped, both statements below execute on every path,
      // which would record the cluster twice (once in full, once filtered).
      // Verify against upstream whether a `continue;` or an else branch
      // separating the !AddNew path was lost in extraction.
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
9725
9726void BoUpSLP::tryToVectorizeGatheredLoads(
9727 const SmallMapVector<
9728 std::tuple<BasicBlock *, Value *, Type *>,
9729 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9730 &GatheredLoads) {
9731 GatheredLoadsEntriesFirst = VectorizableTree.size();
9732
9733 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9734 LoadEntriesToVectorize.size());
9735 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9736 Set.insert_range(VectorizableTree[Idx]->Scalars);
9737
9738 // Sort loads by distance.
9739 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9740 const std::pair<LoadInst *, int64_t> &L2) {
9741 return L1.second > L2.second;
9742 };
9743
9744 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9745 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9746 Loads.size());
9747 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9748 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9749 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9750 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9751 };
9752
9753 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9754 BoUpSLP::ValueSet &VectorizedLoads,
9755 SmallVectorImpl<LoadInst *> &NonVectorized,
9756 bool Final, unsigned MaxVF) {
9758 unsigned StartIdx = 0;
9759 SmallVector<int> CandidateVFs;
9760 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9761 CandidateVFs.push_back(MaxVF);
9762 for (int NumElts = getFloorFullVectorNumberOfElements(
9763 *TTI, Loads.front()->getType(), MaxVF);
9764 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9765 *TTI, Loads.front()->getType(), NumElts - 1)) {
9766 CandidateVFs.push_back(NumElts);
9767 if (VectorizeNonPowerOf2 && NumElts > 2)
9768 CandidateVFs.push_back(NumElts - 1);
9769 }
9770
9771 if (Final && CandidateVFs.empty())
9772 return Results;
9773
9774 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9775 for (unsigned NumElts : CandidateVFs) {
9776 if (Final && NumElts > BestVF)
9777 continue;
9778 SmallVector<unsigned> MaskedGatherVectorized;
9779 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9780 ++Cnt) {
9781 ArrayRef<LoadInst *> Slice =
9782 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9783 if (VectorizedLoads.count(Slice.front()) ||
9784 VectorizedLoads.count(Slice.back()) ||
9786 continue;
9787 // Check if it is profitable to try vectorizing gathered loads. It is
9788 // profitable if we have more than 3 consecutive loads or if we have
9789 // less but all users are vectorized or deleted.
9790 bool AllowToVectorize = false;
9791 // Check if it is profitable to vectorize 2-elements loads.
9792 if (NumElts == 2) {
9793 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9794 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9795 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9796 for (LoadInst *LI : Slice) {
9797 // If single use/user - allow to vectorize.
9798 if (LI->hasOneUse())
9799 continue;
9800 // 1. Check if number of uses equals number of users.
9801 // 2. All users are deleted.
9802 // 3. The load broadcasts are not allowed or the load is not
9803 // broadcasted.
9804 if (static_cast<unsigned int>(std::distance(
9805 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9806 return false;
9807 if (!IsLegalBroadcastLoad)
9808 continue;
9809 if (LI->hasNUsesOrMore(UsesLimit))
9810 return false;
9811 for (User *U : LI->users()) {
9812 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9813 continue;
9814 for (const TreeEntry *UTE : getTreeEntries(U)) {
9815 for (int I : seq<int>(UTE->getNumOperands())) {
9816 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9817 return V == LI || isa<PoisonValue>(V);
9818 }))
9819 // Found legal broadcast - do not vectorize.
9820 return false;
9821 }
9822 }
9823 }
9824 }
9825 return true;
9826 };
9827 AllowToVectorize = CheckIfAllowed(Slice);
9828 } else {
9829 AllowToVectorize =
9830 (NumElts >= 3 ||
9831 any_of(ValueToGatherNodes.at(Slice.front()),
9832 [=](const TreeEntry *TE) {
9833 return TE->Scalars.size() == 2 &&
9834 ((TE->Scalars.front() == Slice.front() &&
9835 TE->Scalars.back() == Slice.back()) ||
9836 (TE->Scalars.front() == Slice.back() &&
9837 TE->Scalars.back() == Slice.front()));
9838 })) &&
9839 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9840 Slice.size());
9841 }
9842 if (AllowToVectorize) {
9843 SmallVector<Value *> PointerOps;
9844 OrdersType CurrentOrder;
9845 // Try to build vector load.
9846 ArrayRef<Value *> Values(
9847 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9848 StridedPtrInfo SPtrInfo;
9849 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9850 PointerOps, SPtrInfo, &BestVF);
9851 if (LS != LoadsState::Gather ||
9852 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9853 if (LS == LoadsState::ScatterVectorize) {
9854 if (MaskedGatherVectorized.empty() ||
9855 Cnt >= MaskedGatherVectorized.back() + NumElts)
9856 MaskedGatherVectorized.push_back(Cnt);
9857 continue;
9858 }
9859 if (LS != LoadsState::Gather) {
9860 Results.emplace_back(Values, LS);
9861 VectorizedLoads.insert_range(Slice);
9862 // If we vectorized initial block, no need to try to vectorize it
9863 // again.
9864 if (Cnt == StartIdx)
9865 StartIdx += NumElts;
9866 }
9867 // Check if the whole array was vectorized already - exit.
9868 if (StartIdx >= Loads.size())
9869 break;
9870 // Erase last masked gather candidate, if another candidate within
9871 // the range is found to be better.
9872 if (!MaskedGatherVectorized.empty() &&
9873 Cnt < MaskedGatherVectorized.back() + NumElts)
9874 MaskedGatherVectorized.pop_back();
9875 Cnt += NumElts - 1;
9876 continue;
9877 }
9878 }
9879 if (!AllowToVectorize || BestVF == 0)
9881 }
9882 // Mark masked gathers candidates as vectorized, if any.
9883 for (unsigned Cnt : MaskedGatherVectorized) {
9884 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9885 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9886 ArrayRef<Value *> Values(
9887 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9888 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9889 VectorizedLoads.insert_range(Slice);
9890 // If we vectorized initial block, no need to try to vectorize it again.
9891 if (Cnt == StartIdx)
9892 StartIdx += NumElts;
9893 }
9894 }
9895 for (LoadInst *LI : Loads) {
9896 if (!VectorizedLoads.contains(LI))
9897 NonVectorized.push_back(LI);
9898 }
9899 return Results;
9900 };
9901 auto ProcessGatheredLoads =
9902 [&, &TTI = *TTI](
9904 bool Final = false) {
9905 SmallVector<LoadInst *> NonVectorized;
9906 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9907 GatheredLoads) {
9908 if (LoadsDists.size() <= 1) {
9909 NonVectorized.push_back(LoadsDists.back().first);
9910 continue;
9911 }
9913 LoadsDists);
9914 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9915 stable_sort(LocalLoadsDists, LoadSorter);
9917 unsigned MaxConsecutiveDistance = 0;
9918 unsigned CurrentConsecutiveDist = 1;
9919 int64_t LastDist = LocalLoadsDists.front().second;
9920 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9921 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9922 if (isVectorized(L.first))
9923 continue;
9924 assert(LastDist >= L.second &&
9925 "Expected first distance always not less than second");
9926 if (static_cast<uint64_t>(LastDist - L.second) ==
9927 CurrentConsecutiveDist) {
9928 ++CurrentConsecutiveDist;
9929 MaxConsecutiveDistance =
9930 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9931 Loads.push_back(L.first);
9932 continue;
9933 }
9934 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9935 !Loads.empty())
9936 Loads.pop_back();
9937 CurrentConsecutiveDist = 1;
9938 LastDist = L.second;
9939 Loads.push_back(L.first);
9940 }
9941 if (Loads.size() <= 1)
9942 continue;
9943 if (AllowMaskedGather)
9944 MaxConsecutiveDistance = Loads.size();
9945 else if (MaxConsecutiveDistance < 2)
9946 continue;
9947 BoUpSLP::ValueSet VectorizedLoads;
9948 SmallVector<LoadInst *> SortedNonVectorized;
9950 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9951 Final, MaxConsecutiveDistance);
9952 if (!Results.empty() && !SortedNonVectorized.empty() &&
9953 OriginalLoads.size() == Loads.size() &&
9954 MaxConsecutiveDistance == Loads.size() &&
9956 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9957 return P.second == LoadsState::ScatterVectorize;
9958 })) {
9959 VectorizedLoads.clear();
9960 SmallVector<LoadInst *> UnsortedNonVectorized;
9962 UnsortedResults =
9963 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9964 UnsortedNonVectorized, Final,
9965 OriginalLoads.size());
9966 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9967 SortedNonVectorized.swap(UnsortedNonVectorized);
9968 Results.swap(UnsortedResults);
9969 }
9970 }
9971 for (auto [Slice, _] : Results) {
9972 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9973 << Slice.size() << ")\n");
9974 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9975 for (Value *L : Slice)
9976 if (!isVectorized(L))
9977 SortedNonVectorized.push_back(cast<LoadInst>(L));
9978 continue;
9979 }
9980
9981 // Select maximum VF as a maximum of user gathered nodes and
9982 // distance between scalar loads in these nodes.
9983 unsigned MaxVF = Slice.size();
9984 unsigned UserMaxVF = 0;
9985 unsigned InterleaveFactor = 0;
9986 if (MaxVF == 2) {
9987 UserMaxVF = MaxVF;
9988 } else {
9989 // Found distance between segments of the interleaved loads.
9990 std::optional<unsigned> InterleavedLoadsDistance = 0;
9991 unsigned Order = 0;
9992 std::optional<unsigned> CommonVF = 0;
9993 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9994 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9995 for (auto [Idx, V] : enumerate(Slice)) {
9996 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9997 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9998 unsigned Pos =
9999 EntryToPosition.try_emplace(E, Idx).first->second;
10000 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
10001 if (CommonVF) {
10002 if (*CommonVF == 0) {
10003 CommonVF = E->Scalars.size();
10004 continue;
10005 }
10006 if (*CommonVF != E->Scalars.size())
10007 CommonVF.reset();
10008 }
10009 // Check if the load is the part of the interleaved load.
10010 if (Pos != Idx && InterleavedLoadsDistance) {
10011 if (!DeinterleavedNodes.contains(E) &&
10012 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
10013 if (isa<Constant>(V))
10014 return false;
10015 if (isVectorized(V))
10016 return true;
10017 const auto &Nodes = ValueToGatherNodes.at(V);
10018 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
10019 !is_contained(Slice, V);
10020 })) {
10021 InterleavedLoadsDistance.reset();
10022 continue;
10023 }
10024 DeinterleavedNodes.insert(E);
10025 if (*InterleavedLoadsDistance == 0) {
10026 InterleavedLoadsDistance = Idx - Pos;
10027 continue;
10028 }
10029 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10030 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10031 InterleavedLoadsDistance.reset();
10032 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
10033 }
10034 }
10035 }
10036 DeinterleavedNodes.clear();
10037 // Check if the large load represents interleaved load operation.
10038 if (InterleavedLoadsDistance.value_or(0) > 1 &&
10039 CommonVF.value_or(0) != 0) {
10040 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
10041 unsigned VF = *CommonVF;
10042 OrdersType Order;
10043 SmallVector<Value *> PointerOps;
10044 StridedPtrInfo SPtrInfo;
10045 // Segmented load detected - vectorize at maximum vector factor.
10046 if (InterleaveFactor <= Slice.size() &&
10047 TTI.isLegalInterleavedAccessType(
10048 getWidenedType(Slice.front()->getType(), VF),
10049 InterleaveFactor,
10050 cast<LoadInst>(Slice.front())->getAlign(),
10051 cast<LoadInst>(Slice.front())
10053 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
10054 SPtrInfo) == LoadsState::Vectorize) {
10055 UserMaxVF = InterleaveFactor * VF;
10056 } else {
10057 InterleaveFactor = 0;
10058 }
10059 }
10060 // Cannot represent the loads as consecutive vectorizable nodes -
10061 // just exit.
10062 unsigned ConsecutiveNodesSize = 0;
10063 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10064 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10065 [&, Slice = Slice](const auto &P) {
10066 const auto *It = find_if(Slice, [&](Value *V) {
10067 return std::get<1>(P).contains(V);
10068 });
10069 if (It == Slice.end())
10070 return false;
10071 const TreeEntry &TE =
10072 *VectorizableTree[std::get<0>(P)];
10073 ArrayRef<Value *> VL = TE.Scalars;
10074 OrdersType Order;
10075 SmallVector<Value *> PointerOps;
10076 StridedPtrInfo SPtrInfo;
10078 VL, VL.front(), Order, PointerOps, SPtrInfo);
10079 if (State == LoadsState::ScatterVectorize ||
10081 return false;
10082 ConsecutiveNodesSize += VL.size();
10083 size_t Start = std::distance(Slice.begin(), It);
10084 size_t Sz = Slice.size() - Start;
10085 return Sz < VL.size() ||
10086 Slice.slice(Start, VL.size()) != VL;
10087 }))
10088 continue;
10089 // Try to build long masked gather loads.
10090 UserMaxVF = bit_ceil(UserMaxVF);
10091 if (InterleaveFactor == 0 &&
10092 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
10093 [&, Slice = Slice](unsigned Idx) {
10094 OrdersType Order;
10095 SmallVector<Value *> PointerOps;
10096 StridedPtrInfo SPtrInfo;
10097 return canVectorizeLoads(
10098 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10099 Slice[Idx * UserMaxVF], Order, PointerOps,
10100 SPtrInfo) == LoadsState::ScatterVectorize;
10101 }))
10102 UserMaxVF = MaxVF;
10103 if (Slice.size() != ConsecutiveNodesSize)
10104 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10105 }
10106 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10107 bool IsVectorized = true;
10108 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10109 ArrayRef<Value *> SubSlice =
10110 Slice.slice(I, std::min(VF, E - I));
10111 if (isVectorized(SubSlice.front()))
10112 continue;
10113 // Check if the subslice is to be-vectorized entry, which is not
10114 // equal to entry.
10115 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10116 [&](const auto &P) {
10117 return !SubSlice.equals(
10118 VectorizableTree[std::get<0>(P)]
10119 ->Scalars) &&
10120 set_is_subset(SubSlice, std::get<1>(P));
10121 }))
10122 continue;
10123 unsigned Sz = VectorizableTree.size();
10124 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10125 if (Sz == VectorizableTree.size()) {
10126 IsVectorized = false;
10127 // Try non-interleaved vectorization with smaller vector
10128 // factor.
10129 if (InterleaveFactor > 0) {
10130 VF = 2 * (MaxVF / InterleaveFactor);
10131 InterleaveFactor = 0;
10132 }
10133 continue;
10134 }
10135 }
10136 if (IsVectorized)
10137 break;
10138 }
10139 }
10140 NonVectorized.append(SortedNonVectorized);
10141 }
10142 return NonVectorized;
10143 };
10144 for (const auto &GLs : GatheredLoads) {
10145 const auto &Ref = GLs.second;
10146 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10147 if (!Ref.empty() && !NonVectorized.empty() &&
10148 std::accumulate(
10149 Ref.begin(), Ref.end(), 0u,
10150 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10151 -> unsigned { return S + LoadsDists.size(); }) !=
10152 NonVectorized.size() &&
10153 IsMaskedGatherSupported(NonVectorized)) {
10155 FinalGatheredLoads;
10156 for (LoadInst *LI : NonVectorized) {
10157 // Reinsert non-vectorized loads to other list of loads with the same
10158 // base pointers.
10159 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10160 FinalGatheredLoads,
10161 /*AddNew=*/false);
10162 }
10163 // Final attempt to vectorize non-vectorized loads.
10164 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10165 }
10166 }
10167 // Try to vectorize postponed load entries, previously marked as gathered.
10168 for (unsigned Idx : LoadEntriesToVectorize) {
10169 const TreeEntry &E = *VectorizableTree[Idx];
10170 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10171 // Avoid reordering, if possible.
10172 if (!E.ReorderIndices.empty()) {
10173 // Build a mask out of the reorder indices and reorder scalars per this
10174 // mask.
10175 SmallVector<int> ReorderMask;
10176 inversePermutation(E.ReorderIndices, ReorderMask);
10177 reorderScalars(GatheredScalars, ReorderMask);
10178 }
10179 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10180 }
10181 // If no new entries created, consider it as no gathered loads entries must be
10182 // handled.
10183 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10184 VectorizableTree.size())
10185 GatheredLoadsEntriesFirst.reset();
10186}
10187
10188/// Generates key/subkey pair for the given value to provide effective sorting
10189/// of the values and better detection of the vectorizable values sequences. The
10190/// keys/subkeys can be used for better sorting of the values themselves (keys)
10191/// and in values subgroups (subkeys).
///
/// \param V value to be hashed.
/// \param TLI library info, used when classifying call instructions.
/// \param LoadsSubkeyGenerator callback producing the subkey for simple loads
///        (presumably based on pointer distance between loads - see the
///        comment below).
/// \param AllowAlternate if true, binary operators are hashed by category
///        rather than exact opcode, so alternate (e.g. add/sub) sequences can
///        share a key.
10192static std::pair<size_t, size_t> generateKeySubkey(
10193 Value *V, const TargetLibraryInfo *TLI,
10194 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10195 bool AllowAlternate) {
10196 hash_code Key = hash_value(V->getValueID() + 2);
10197 hash_code SubKey = hash_value(0);
10198 // Sort the loads by the distance between the pointers.
10199 if (auto *LI = dyn_cast<LoadInst>(V)) {
10200 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
10201 if (LI->isSimple())
10202 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10203 else
 // Non-simple (volatile/atomic) loads must stay unique: hash by identity.
10204 Key = SubKey = hash_value(LI);
10205 } else if (isVectorLikeInstWithConstOps(V)) {
10206 // Sort extracts by the vector operands.
10208 Key = hash_value(Value::UndefValueVal + 1);
10209 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
10210 if (!isUndefVector(EI->getVectorOperand()).all() &&
10211 !isa<UndefValue>(EI->getIndexOperand()))
10212 SubKey = hash_value(EI->getVectorOperand());
10213 }
10214 } else if (auto *I = dyn_cast<Instruction>(V)) {
10215 // Sort other instructions just by the opcodes except for CMPInst.
10216 // For CMP also sort by the predicate kind.
10218 isValidForAlternation(I->getOpcode())) {
10219 if (AllowAlternate)
10220 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10221 else
10222 Key = hash_combine(hash_value(I->getOpcode()), Key);
10223 SubKey = hash_combine(
10224 hash_value(I->getOpcode()), hash_value(I->getType()),
10226 ? I->getType()
10227 : cast<CastInst>(I)->getOperand(0)->getType()));
10228 // For casts, look through the only operand to improve compile time.
10229 if (isa<CastInst>(I)) {
 // Recurse with AllowAlternate=true: the source of a cast may itself be
 // part of an alternate-opcode sequence.
10230 std::pair<size_t, size_t> OpVals =
10231 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10232 /*AllowAlternate=*/true);
10233 Key = hash_combine(OpVals.first, Key);
10234 SubKey = hash_combine(OpVals.first, SubKey);
10235 }
10236 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10237 CmpInst::Predicate Pred = CI->getPredicate();
 // Canonicalize commutative predicates so equivalent compares collide.
10238 if (CI->isCommutative())
10239 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
10241 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10242 hash_value(SwapPred),
10243 hash_value(CI->getOperand(0)->getType()));
10244 } else if (auto *Call = dyn_cast<CallInst>(I)) {
10247 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10248 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
 // Calls with known vector-function mappings are grouped by callee.
10249 SubKey = hash_combine(hash_value(I->getOpcode()),
10250 hash_value(Call->getCalledFunction()));
10251 } else {
10253 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10254 }
 // Operand bundles must also match for calls to be interchangeable.
10255 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10256 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10257 hash_value(Op.Tag), SubKey);
10258 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
10259 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10260 SubKey = hash_value(Gep->getPointerOperand());
10261 else
10262 SubKey = hash_value(Gep);
10263 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10264 !isa<ConstantInt>(I->getOperand(1))) {
10265 // Do not try to vectorize instructions with potentially high cost.
10266 SubKey = hash_value(I);
10267 } else {
10268 SubKey = hash_value(I->getOpcode());
10269 }
 // Mix the parent block into the key so values from different blocks do
 // not end up in the same group.
10270 Key = hash_combine(hash_value(I->getParent()->getNumber()), Key);
10271 }
10272 return std::make_pair(Key, SubKey);
10273}
10274
10275/// Checks if the specified instruction \p I is a main operation for the given
10276/// \p MainOp and \p AltOp instructions.
10277static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10278 Instruction *AltOp, const TargetLibraryInfo &TLI);
10279
10280/// Builds the arguments types vector for the given call instruction with the
10281/// given \p ID for the specified vector factor.
///
/// \param VF vector factor used to widen non-scalar argument types.
/// \param MinBW if non-zero, vector arguments are assumed narrowed to
///        MinBW-bit integers and the widened type is built from that width
///        instead of the argument's original type.
/// \returns one type per call argument, widened where appropriate.
10284 const unsigned VF, unsigned MinBW,
10285 const TargetTransformInfo *TTI) {
10286 SmallVector<Type *> ArgTys;
10287 for (auto [Idx, Arg] : enumerate(CI->args())) {
 // Keep this argument's original scalar type as-is (not widened).
10290 ArgTys.push_back(Arg->getType());
10291 continue;
10292 }
 // Minimum-bitwidth analysis narrowed the operands: widen the narrowed
 // integer type instead of the original one.
10293 if (MinBW > 0) {
10294 ArgTys.push_back(
10295 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10296 continue;
10297 }
10298 }
 // Default: widen the argument's own type by the vector factor.
10299 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10300 }
10301 return ArgTys;
10302}
10303
10304/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10305/// function (if possible) calls. Returns invalid cost for the corresponding
10306/// calls, if they cannot be vectorized/will be scalarized.
///
/// \returns a pair {intrinsic cost, library-call cost}; either element may be
///          InstructionCost::getInvalid() when that form is unavailable or
///          more expensive than the alternative.
10307static std::pair<InstructionCost, InstructionCost>
10310 ArrayRef<Type *> ArgTys) {
10311 auto Shape = VFShape::get(CI->getFunctionType(),
10313 false /*HasGlobalPred*/);
10314 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10315 auto LibCost = InstructionCost::getInvalid();
10316 if (!CI->isNoBuiltin() && VecFunc) {
10317 // Calculate the cost of the vector library call.
10318 // If the corresponding vector call is cheaper, return its cost.
10319 LibCost =
10320 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10321 }
10323
10324 // Calculate the cost of the vector intrinsic call.
10325 FastMathFlags FMF;
10326 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10327 FMF = FPCI->getFastMathFlags();
 // Threshold used as the scalarization-cost cap when no valid library-call
 // cost is available.
10328 const InstructionCost ScalarLimit = 10000;
10329 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10330 LibCost.isValid() ? LibCost : ScalarLimit);
10331 auto IntrinsicCost =
10332 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
 // If the intrinsic is costlier than the library call (or the scalar
 // limit), report it as invalid so callers prefer the alternative.
10333 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10334 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10336
10337 return {IntrinsicCost, LibCost};
10338}
10339
10340/// Find the innermost loop starting from \p L, for which at least a single
10341/// value in \p VL is not invariant.
///
/// \returns nullptr when every enclosing loop (up to the top level) is
///          invariant with respect to all values in \p VL.
10343 ArrayRef<Value *> VL) {
10344 assert(L && "Expected valid loop");
 // A loop is "invariant" here if no value in VL is an instruction that
 // varies inside it; constants and non-instructions are always invariant.
10345 auto IsLoopInvariant = [&](const Loop *L, ArrayRef<Value *> VL) {
10346 return all_of(VL, [&](Value *V) {
10347 return isa<Constant>(V) || !isa<Instruction>(V) || L->isLoopInvariant(V);
10348 });
10349 };
 // Walk outward while the whole bundle stays invariant in the current loop.
10350 while (L && IsLoopInvariant(L, VL))
10351 L = L->getParentLoop();
10352 return L;
10353}
10354
10355/// Get the loop nest for the given loop.
10356ArrayRef<const Loop *> BoUpSLP::getLoopNest(const Loop *L) {
10357 assert(L && "Expected valid loop");
10358 if (LoopAwareTripCount == 0)
10359 return {};
10360 SmallVector<const Loop *> &Res =
10361 LoopToLoopNest.try_emplace(L).first->getSecond();
10362 if (!Res.empty())
10363 return Res;
10364 SmallVector<const Loop *> LoopNest;
10365 while (L) {
10366 LoopNest.push_back(L);
10367 L = L->getParentLoop();
10368 }
10369 Res.assign(LoopNest.rbegin(), LoopNest.rend());
10370 return Res;
10371}
10372
/// Decides, per opcode of the bundle \p VL, whether the scalars can be
/// vectorized as a single wide node or must be gathered.
///
/// \param S instruction state for the bundle; S.getMainOp() must be valid.
/// \param IsScatterVectorizeUserTE true when the user tree entry is a
///        scatter-vectorize node (relaxes the GEP constant-index rule).
/// \param CurrentOrder [out] reordering of scalars discovered for
///        extracts/loads/stores.
/// \param PointerOps [out] pointer operands collected for loads/stores.
/// \param SPtrInfo [out] strided-pointer info filled by canVectorizeLoads.
10373BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10374 const InstructionsState &S, ArrayRef<Value *> VL,
10375 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10376 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10377 assert(S.getMainOp() &&
10378 "Expected instructions with same/alternate opcodes only.");
10379
10380 unsigned ShuffleOrOp =
10381 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10382 Instruction *VL0 = S.getMainOp();
10383 switch (ShuffleOrOp) {
10384 case Instruction::PHI: {
10385 // Too many operands - gather, most probably won't be vectorized.
10386 if (VL0->getNumOperands() > MaxPHINumOperands)
10387 return TreeEntry::NeedToGather;
10388 // Check for terminator values (e.g. invoke).
10389 for (Value *V : VL) {
10390 auto *PHI = dyn_cast<PHINode>(V);
10391 if (!PHI)
10392 continue;
10393 for (Value *Incoming : PHI->incoming_values()) {
10395 if (Term && Term->isTerminator()) {
10397 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10398 return TreeEntry::NeedToGather;
10399 }
10400 }
10401 }
10402
10403 return TreeEntry::Vectorize;
10404 }
10405 case Instruction::ExtractElement:
 // Extracts from already-vectorized values must be gathered instead.
10406 if (any_of(VL, [&](Value *V) {
10407 auto *EI = dyn_cast<ExtractElementInst>(V);
10408 if (!EI)
10409 return true;
10410 return isVectorized(EI->getOperand(0));
10411 }))
10412 return TreeEntry::NeedToGather;
10413 [[fallthrough]];
10414 case Instruction::ExtractValue: {
10415 bool Reuse = canReuseExtract(VL, CurrentOrder);
10416 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10417 // non-full registers).
10418 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10419 return TreeEntry::NeedToGather;
10420 if (Reuse || !CurrentOrder.empty())
10421 return TreeEntry::Vectorize;
10422 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10423 return TreeEntry::NeedToGather;
10424 }
10425 case Instruction::InsertElement: {
10426 // Check that we have a buildvector and not a shuffle of 2 or more
10427 // different vectors.
10428 ValueSet SourceVectors;
10429 for (Value *V : VL) {
10430 if (isa<PoisonValue>(V)) {
10431 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10432 return TreeEntry::NeedToGather;
10433 }
10434 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10435 assert(getElementIndex(V) != std::nullopt &&
10436 "Non-constant or undef index?");
10437 }
10438
10439 if (count_if(VL, [&SourceVectors](Value *V) {
10440 return !SourceVectors.contains(V);
10441 }) >= 2) {
10442 // Found 2nd source vector - cancel.
10443 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10444 "different source vectors.\n");
10445 return TreeEntry::NeedToGather;
10446 }
10447
10448 if (any_of(VL, [&SourceVectors](Value *V) {
10449 // The last InsertElement can have multiple uses.
10450 return SourceVectors.contains(V) && !V->hasOneUse();
10451 })) {
10452 assert(SLPReVec && "Only supported by REVEC.");
10453 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10454 "multiple uses.\n");
10455 return TreeEntry::NeedToGather;
10456 }
10457
10458 return TreeEntry::Vectorize;
10459 }
10460 case Instruction::Load: {
10461 // Check that a vectorized load would load the same memory as a scalar
10462 // load. For example, we don't want to vectorize loads that are smaller
10463 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10464 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10465 // from such a struct, we read/write packed bits disagreeing with the
10466 // unvectorized version.
 // True when every non-poison scalar already belongs to a tree entry
 // created for gathered loads (index >= GatheredLoadsEntriesFirst).
10467 auto IsGatheredNode = [&]() {
10468 if (!GatheredLoadsEntriesFirst)
10469 return false;
10470 return all_of(VL, [&](Value *V) {
10471 if (isa<PoisonValue>(V))
10472 return true;
10473 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10474 return TE->Idx >= *GatheredLoadsEntriesFirst;
10475 });
10476 });
10477 };
10478 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10480 return TreeEntry::Vectorize;
10482 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10483 // Delay slow vectorized nodes for better vectorization attempts.
10484 LoadEntriesToVectorize.insert(VectorizableTree.size());
10485 return TreeEntry::NeedToGather;
10486 }
10487 return IsGatheredNode() ? TreeEntry::NeedToGather
10488 : TreeEntry::CompressVectorize;
10490 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10491 // Delay slow vectorized nodes for better vectorization attempts.
10492 LoadEntriesToVectorize.insert(VectorizableTree.size());
10493 return TreeEntry::NeedToGather;
10494 }
10495 return IsGatheredNode() ? TreeEntry::NeedToGather
10496 : TreeEntry::ScatterVectorize;
10498 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10499 // Delay slow vectorized nodes for better vectorization attempts.
10500 LoadEntriesToVectorize.insert(VectorizableTree.size());
10501 return TreeEntry::NeedToGather;
10502 }
10503 return IsGatheredNode() ? TreeEntry::NeedToGather
10504 : TreeEntry::StridedVectorize;
10505 case LoadsState::Gather:
10506#ifndef NDEBUG
10507 Type *ScalarTy = VL0->getType();
10508 if (DL->getTypeSizeInBits(ScalarTy) !=
10509 DL->getTypeAllocSizeInBits(ScalarTy))
10510 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10511 else if (any_of(VL, [](Value *V) {
10512 auto *LI = dyn_cast<LoadInst>(V);
10513 return !LI || !LI->isSimple();
10514 }))
10515 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10516 else
10517 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10518#endif // NDEBUG
10520 return TreeEntry::NeedToGather;
10521 }
10522 llvm_unreachable("Unexpected state of loads");
10523 }
10524 case Instruction::ZExt:
10525 case Instruction::SExt:
10526 case Instruction::FPToUI:
10527 case Instruction::FPToSI:
10528 case Instruction::FPExt:
10529 case Instruction::PtrToInt:
10530 case Instruction::IntToPtr:
10531 case Instruction::SIToFP:
10532 case Instruction::UIToFP:
10533 case Instruction::Trunc:
10534 case Instruction::FPTrunc:
10535 case Instruction::BitCast: {
 // All casts in the bundle must share the same (valid) source type.
10536 Type *SrcTy = VL0->getOperand(0)->getType();
10537 for (Value *V : VL) {
10538 if (isa<PoisonValue>(V))
10539 continue;
10540 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10541 if (Ty != SrcTy || !isValidElementType(Ty)) {
10542 LLVM_DEBUG(
10543 dbgs() << "SLP: Gathering casts with different src types.\n");
10544 return TreeEntry::NeedToGather;
10545 }
10546 }
10547 return TreeEntry::Vectorize;
10548 }
10549 case Instruction::ICmp:
10550 case Instruction::FCmp: {
10551 // Check that all of the compares have the same predicate.
10552 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10554 Type *ComparedTy = VL0->getOperand(0)->getType();
10555 for (Value *V : VL) {
10556 if (isa<PoisonValue>(V))
10557 continue;
10558 auto *Cmp = cast<CmpInst>(V);
 // The swapped predicate is also acceptable - operands can be
 // commuted during vectorization.
10559 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10560 Cmp->getOperand(0)->getType() != ComparedTy) {
10561 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10562 return TreeEntry::NeedToGather;
10563 }
10564 }
10565 return TreeEntry::Vectorize;
10566 }
10567 case Instruction::Select:
10568 if (SLPReVec) {
 // REVEC: all selects must agree on the condition type.
10569 SmallPtrSet<Type *, 4> CondTypes;
10570 for (Value *V : VL) {
10571 Value *Cond;
10572 if (!match(V, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
10573 !match(V, m_ZExt(m_Value(Cond))))
10574 continue;
10575 CondTypes.insert(Cond->getType());
10576 }
10577 if (CondTypes.size() > 1) {
10578 LLVM_DEBUG(
10579 dbgs()
10580 << "SLP: Gathering select with different condition types.\n");
10581 return TreeEntry::NeedToGather;
10582 }
10583 }
10584 [[fallthrough]];
10585 case Instruction::FNeg:
10586 case Instruction::Add:
10587 case Instruction::FAdd:
10588 case Instruction::Sub:
10589 case Instruction::FSub:
10590 case Instruction::Mul:
10591 case Instruction::FMul:
10592 case Instruction::UDiv:
10593 case Instruction::SDiv:
10594 case Instruction::FDiv:
10595 case Instruction::URem:
10596 case Instruction::SRem:
10597 case Instruction::FRem:
10598 case Instruction::Shl:
10599 case Instruction::LShr:
10600 case Instruction::AShr:
10601 case Instruction::And:
10602 case Instruction::Or:
10603 case Instruction::Xor:
10604 case Instruction::Freeze:
 // Avoid unsafe FP vectorization of non-fast binary FP ops on targets
 // that flag it as potentially unsafe.
10605 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10606 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10607 auto *I = dyn_cast<Instruction>(V);
10608 return I && I->isBinaryOp() && !I->isFast();
10609 }))
10610 return TreeEntry::NeedToGather;
10611 return TreeEntry::Vectorize;
10612 case Instruction::GetElementPtr: {
10613 // We don't combine GEPs with complicated (nested) indexing.
10614 for (Value *V : VL) {
10615 auto *I = dyn_cast<GetElementPtrInst>(V);
10616 if (!I)
10617 continue;
10618 if (I->getNumOperands() != 2) {
10619 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10620 return TreeEntry::NeedToGather;
10621 }
10622 }
10623
10624 // We can't combine several GEPs into one vector if they operate on
10625 // different types.
10626 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10627 for (Value *V : VL) {
10628 auto *GEP = dyn_cast<GEPOperator>(V);
10629 if (!GEP)
10630 continue;
10631 Type *CurTy = GEP->getSourceElementType();
10632 if (Ty0 != CurTy) {
10633 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10634 return TreeEntry::NeedToGather;
10635 }
10636 }
10637
10638 // We don't combine GEPs with non-constant indexes.
10639 Type *Ty1 = VL0->getOperand(1)->getType();
10640 for (Value *V : VL) {
10641 auto *I = dyn_cast<GetElementPtrInst>(V);
10642 if (!I)
10643 continue;
10644 auto *Op = I->getOperand(1);
10645 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10646 (Op->getType() != Ty1 &&
10647 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10648 Op->getType()->getScalarSizeInBits() >
10649 DL->getIndexSizeInBits(
10650 V->getType()->getPointerAddressSpace())))) {
10651 LLVM_DEBUG(
10652 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10653 return TreeEntry::NeedToGather;
10654 }
10655 }
10656
10657 return TreeEntry::Vectorize;
10658 }
10659 case Instruction::Store: {
10660 // Check if the stores are consecutive or if we need to swizzle them.
10661 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10662 // Avoid types that are padded when being allocated as scalars, while
10663 // being packed together in a vector (such as i1).
10664 if (DL->getTypeSizeInBits(ScalarTy) !=
10665 DL->getTypeAllocSizeInBits(ScalarTy)) {
10666 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10667 return TreeEntry::NeedToGather;
10668 }
10669 // Make sure all stores in the bundle are simple - we can't vectorize
10670 // atomic or volatile stores.
10671 for (Value *V : VL) {
10672 auto *SI = cast<StoreInst>(V);
10673 if (!SI->isSimple()) {
10674 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10675 return TreeEntry::NeedToGather;
10676 }
10677 PointerOps.push_back(SI->getPointerOperand());
10678 }
10679
10680 // Check the order of pointer operands.
10681 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10682 Value *Ptr0;
10683 Value *PtrN;
10684 if (CurrentOrder.empty()) {
10685 Ptr0 = PointerOps.front();
10686 PtrN = PointerOps.back();
10687 } else {
10688 Ptr0 = PointerOps[CurrentOrder.front()];
10689 PtrN = PointerOps[CurrentOrder.back()];
10690 }
10691 std::optional<int64_t> Dist =
10692 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10693 // Check that the sorted pointer operands are consecutive.
10694 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10695 return TreeEntry::Vectorize;
10696 }
10697
10698 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10699 return TreeEntry::NeedToGather;
10700 }
10701 case Instruction::Call: {
10702 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10703 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10704 auto *I = dyn_cast<Instruction>(V);
10705 return I && !I->isFast();
10706 }))
10707 return TreeEntry::NeedToGather;
10708 // Check if the calls are all to the same vectorizable intrinsic or
10709 // library function.
10710 CallInst *CI = cast<CallInst>(VL0);
10712
10713 VFShape Shape = VFShape::get(
10714 CI->getFunctionType(),
10715 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10716 false /*HasGlobalPred*/);
10717 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10718
10719 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10720 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10721 return TreeEntry::NeedToGather;
10722 }
10723 Function *F = CI->getCalledFunction();
10724 unsigned NumArgs = CI->arg_size();
10725 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
 // Remember the scalar operands of the first call; intrinsics with
 // scalar arguments require those to be identical across the bundle.
10726 for (unsigned J = 0; J != NumArgs; ++J)
10728 ScalarArgs[J] = CI->getArgOperand(J);
10729 for (Value *V : VL) {
10730 CallInst *CI2 = dyn_cast<CallInst>(V);
10731 if (!CI2 || CI2->getCalledFunction() != F ||
10732 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10733 (VecFunc &&
10734 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10736 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10737 << "\n");
10738 return TreeEntry::NeedToGather;
10739 }
10740 // Some intrinsics have scalar arguments and should be same in order for
10741 // them to be vectorized.
10742 for (unsigned J = 0; J != NumArgs; ++J) {
10744 Value *A1J = CI2->getArgOperand(J);
10745 if (ScalarArgs[J] != A1J) {
10747 << "SLP: mismatched arguments in call:" << *CI
10748 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10749 return TreeEntry::NeedToGather;
10750 }
10751 }
10752 }
10753 // Verify that the bundle operands are identical between the two calls.
10754 if (CI->hasOperandBundles() &&
10755 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10756 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10757 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10758 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10759 << "!=" << *V << '\n');
10760 return TreeEntry::NeedToGather;
10761 }
10762 }
 // Gather if neither the intrinsic nor the library call can be costed.
10763 SmallVector<Type *> ArgTys =
10764 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10765 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10766 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10767 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10768 return TreeEntry::NeedToGather;
10769
10770 return TreeEntry::Vectorize;
10771 }
10772 case Instruction::ShuffleVector: {
10773 if (!S.isAltShuffle()) {
10774 // REVEC can support non alternate shuffle.
10776 return TreeEntry::Vectorize;
10777 // If this is not an alternate sequence of opcode like add-sub
10778 // then do not vectorize this instruction.
10779 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10780 return TreeEntry::NeedToGather;
10781 }
10782
10783 return TreeEntry::Vectorize;
10784 }
10785 default:
10786 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10787 return TreeEntry::NeedToGather;
10788 }
10789}
10790
10791namespace {
10792/// Allows to correctly handle operands of the phi nodes based on the \p Main
10793/// PHINode order of incoming basic blocks/values.
///
/// Operands[I][Idx] holds the value flowing into PHI number Idx (from Phis)
/// along the Main PHI's incoming edge number I.
10794class PHIHandler {
10795 DominatorTree &DT;
10796 PHINode *Main = nullptr;
10799
10800public:
10801 PHIHandler() = delete;
10802 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10803 : DT(DT), Main(Main), Phis(Phis),
10804 Operands(Main->getNumIncomingValues(),
10805 SmallVector<Value *>(Phis.size(), nullptr)) {}
 /// Populates Operands. For few incoming edges a direct per-edge lookup is
 /// used; otherwise edges are grouped by incoming block first to avoid
 /// quadratic getIncomingValueForBlock lookups.
10806 void buildOperands() {
10807 constexpr unsigned FastLimit = 4;
10808 if (Main->getNumIncomingValues() <= FastLimit) {
10809 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10810 BasicBlock *InBB = Main->getIncomingBlock(I);
 // Unreachable predecessor - any value is fine, use poison.
10811 if (!DT.isReachableFromEntry(InBB)) {
10812 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10813 continue;
10814 }
10815 // Prepare the operand vector.
10816 for (auto [Idx, V] : enumerate(Phis)) {
10817 auto *P = dyn_cast<PHINode>(V);
10818 if (!P) {
10820 "Expected isa instruction or poison value.");
10821 Operands[I][Idx] = V;
10822 continue;
10823 }
 // Fast path when this PHI lists its blocks in the same order as
 // Main; fall back to a by-block lookup otherwise.
10824 if (P->getIncomingBlock(I) == InBB)
10825 Operands[I][Idx] = P->getIncomingValue(I);
10826 else
10827 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10828 }
10829 }
10830 return;
10831 }
 // Slow path: group Main's incoming edge indices by their source block.
10832 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10833 Blocks;
10834 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10835 BasicBlock *InBB = Main->getIncomingBlock(I);
10836 if (!DT.isReachableFromEntry(InBB)) {
10837 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10838 continue;
10839 }
10840 Blocks.try_emplace(InBB).first->second.push_back(I);
10841 }
10842 for (auto [Idx, V] : enumerate(Phis)) {
10843 if (isa<PoisonValue>(V)) {
10844 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10845 Operands[I][Idx] = V;
10846 continue;
10847 }
10848 auto *P = cast<PHINode>(V);
10849 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10850 BasicBlock *InBB = P->getIncomingBlock(I);
10851 if (InBB == Main->getIncomingBlock(I)) {
10852 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10853 continue;
10854 Operands[I][Idx] = P->getIncomingValue(I);
10855 continue;
10856 }
10857 auto *It = Blocks.find(InBB);
10858 if (It == Blocks.end())
10859 continue;
 // Record the value on the first edge from this block; duplicated
 // edges are filled in from it below.
10860 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10861 }
10862 }
 // Replicate operands across duplicate edges coming from the same block.
10863 for (const auto &P : Blocks) {
10864 ArrayRef<unsigned> IncomingValues = P.second;
10865 if (IncomingValues.size() <= 1)
10866 continue;
10867 unsigned BasicI = IncomingValues.consume_front();
10868 for (unsigned I : IncomingValues) {
10869 assert(all_of(enumerate(Operands[I]),
10870 [&](const auto &Data) {
10871 return !Data.value() ||
10872 Data.value() == Operands[BasicI][Data.index()];
10873 }) &&
10874 "Expected empty operands list.");
10875 Operands[I] = Operands[BasicI];
10876 }
10877 }
10878 }
 /// Returns the operand bundle for Main's incoming edge number \p I.
10879 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10880};
10881} // namespace
10882
10883/// Returns main/alternate instructions for the given \p VL. Unlike
10884/// getSameOpcode supports non-compatible instructions for better SplitVectorize
10885/// node support.
/// \returns the first main/alt instruction pair, if \p VL contains only
/// poisons and instructions with exactly two distinct opcodes (each confined
/// to a single basic block). Returns a pair of nullptrs otherwise.
10888static std::pair<Instruction *, Instruction *>
10890 Instruction *MainOp = nullptr;
10891 Instruction *AltOp = nullptr;
10892 for (Value *V : VL) {
10893 if (isa<PoisonValue>(V))
10894 continue;
10895 auto *I = dyn_cast<Instruction>(V);
 // Non-instruction, non-poison value - cannot form a split node.
10896 if (!I)
10897 return {};
10898 if (!MainOp) {
10899 MainOp = I;
10900 continue;
10901 }
 // All instructions sharing the main opcode must be in the same block.
10902 if (MainOp->getOpcode() == I->getOpcode()) {
10903 if (I->getParent() != MainOp->getParent())
10904 return {};
10905 continue;
10906 }
10907 if (!AltOp) {
10908 AltOp = I;
10909 continue;
10910 }
 // Same single-block constraint for the alternate opcode group.
10911 if (AltOp->getOpcode() == I->getOpcode()) {
10912 if (I->getParent() != AltOp->getParent())
10913 return {};
10914 continue;
10915 }
 // A third distinct opcode - give up.
10916 return {};
10917 }
 // Require a genuine two-opcode mix; a single opcode is not a split node.
10918 if (!AltOp)
10919 return {};
10920 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10921 "Expected different main and alt instructions.");
10922 return std::make_pair(MainOp, AltOp);
10923}
10924
10925/// Checks that every instruction appears once in the list and if not, packs
10926/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10927/// unique scalars is extended by poison values to the whole register size.
10928///
10929/// \returns false if \p VL could not be uniquified, in which case \p VL is
10930/// unchanged and \p ReuseShuffleIndices is empty.
10932 SmallVectorImpl<int> &ReuseShuffleIndices,
10933 const TargetTransformInfo &TTI,
10934 const TargetLibraryInfo &TLI,
10935 const InstructionsState &S,
10936 const BoUpSLP::EdgeInfo &UserTreeIdx,
10937 bool TryPad = false) {
10938 // Check that every instruction appears once in this bundle.
10939 SmallVector<Value *> UniqueValues;
10940 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10941 for (Value *V : VL) {
10942 if (isConstant(V)) {
10943 // Constants are always considered distinct, even if the same constant
10944 // appears multiple times in VL.
10945 ReuseShuffleIndices.emplace_back(
10946 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10947 UniqueValues.emplace_back(V);
10948 continue;
10949 }
 // First occurrence gets a new slot; duplicates reuse the recorded index.
10950 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10951 ReuseShuffleIndices.emplace_back(Res.first->second);
10952 if (Res.second)
10953 UniqueValues.emplace_back(V);
10954 }
10955
10956 // Easy case: VL has unique values and a "natural" size
10957 size_t NumUniqueScalarValues = UniqueValues.size();
10958 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10959 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10960 if (NumUniqueScalarValues == VL.size() &&
10961 (VectorizeNonPowerOf2 || IsFullVectors)) {
10962 ReuseShuffleIndices.clear();
10963 return true;
10964 }
10965
10966 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
10967 if ((UserTreeIdx.UserTE &&
10968 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10970 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10971 "for nodes with padding.\n");
10972 ReuseShuffleIndices.clear();
10973 return false;
10974 }
10975
10976 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10977 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10978 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10979 return isa<UndefValue>(V) || !isConstant(V);
10980 }))) {
 // Try to salvage the bundle by padding the unique scalars with poison
 // up to a "nice" (full-register/power-of-2) vector size.
10981 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10982 S.getMainOp()->isSafeToRemove() &&
10983 (S.areInstructionsWithCopyableElements() ||
10984 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10985 // Find the number of elements, which forms full vectors.
10986 unsigned PWSz = getFullVectorNumberOfElements(
10987 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10988 PWSz = std::min<unsigned>(PWSz, VL.size());
10989 if (PWSz == VL.size()) {
10990 // We ended up with the same size after removing duplicates and
10991 // upgrading the resulting vector size to a "nice size". Just keep
10992 // the initial VL then.
10993 ReuseShuffleIndices.clear();
10994 } else {
10995 // Pad unique values with poison to grow the vector to a "nice" size
10996 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10997 UniqueValues.end());
10998 PaddedUniqueValues.append(
10999 PWSz - UniqueValues.size(),
11000 PoisonValue::get(UniqueValues.front()->getType()));
11001 // Check that extended with poisons/copyable operations are still valid
11002 // for vectorization (div/rem are not allowed).
11003 if ((!S.areInstructionsWithCopyableElements() &&
11004 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
11005 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
11006 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
11007 isa<CallInst>(S.getMainOp())))) {
11008 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11009 ReuseShuffleIndices.clear();
11010 return false;
11011 }
11012 VL = std::move(PaddedUniqueValues);
11013 }
11014 return true;
11015 }
11016 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11017 ReuseShuffleIndices.clear();
11018 return false;
11019 }
 // Duplicates removed successfully: vectorize the unique scalars and use
 // ReuseShuffleIndices to rebuild the original bundle.
11020 VL = std::move(UniqueValues);
11021 return true;
11022}
11023
11025 const InstructionsState &LocalState,
11028 OrdersType &ReorderIndices) const {
11029 constexpr unsigned SmallNodeSize = 4;
11030 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11032 return false;
11033
11034 // Check if this is a duplicate of another split entry.
11035 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11036 << ".\n");
11037 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11038 if (E->isSame(VL)) {
11039 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11040 << *LocalState.getMainOp() << ".\n");
11041 return false;
11042 }
11043 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11044 if (all_of(VL, [&](Value *V) {
11045 return isa<PoisonValue>(V) || Values.contains(V);
11046 })) {
11047 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11048 return false;
11049 }
11050 }
11051
11052 ReorderIndices.assign(VL.size(), VL.size());
11053 SmallBitVector Op1Indices(VL.size());
11054 for (auto [Idx, V] : enumerate(VL)) {
11055 auto *I = dyn_cast<Instruction>(V);
11056 if (!I) {
11057 Op1.push_back(V);
11058 Op1Indices.set(Idx);
11059 continue;
11060 }
11061 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11062 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11063 *TLI)) ||
11064 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11065 !isAlternateInstruction(I, LocalState.getMainOp(),
11066 LocalState.getAltOp(), *TLI))) {
11067 Op1.push_back(V);
11068 Op1Indices.set(Idx);
11069 continue;
11070 }
11071 Op2.push_back(V);
11072 }
11073 Type *ScalarTy = getValueType(VL.front());
11074 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11075 unsigned Opcode0 = LocalState.getOpcode();
11076 unsigned Opcode1 = LocalState.getAltOpcode();
11077 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11078 // Enable split node, only if all nodes do not form legal alternate
11079 // instruction (like X86 addsub).
11082 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11083 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11084 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
11085 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
11086 return false;
11087 // Enable split node, only if all nodes are power-of-2/full registers.
11088 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11089 for (unsigned Idx : seq<unsigned>(VL.size())) {
11090 if (Op1Indices.test(Idx)) {
11091 ReorderIndices[Op1Cnt] = Idx;
11092 ++Op1Cnt;
11093 } else {
11094 ReorderIndices[Op2Cnt] = Idx;
11095 ++Op2Cnt;
11096 }
11097 }
11098 if (isIdentityOrder(ReorderIndices))
11099 ReorderIndices.clear();
11100 SmallVector<int> Mask;
11101 if (!ReorderIndices.empty())
11102 inversePermutation(ReorderIndices, Mask);
11103 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11104 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
11105 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
11106 // Check non-profitable single register ops, which better to be represented
11107 // as alternate ops.
11108 if (NumParts >= VL.size())
11109 return false;
11111 InstructionCost InsertCost = ::getShuffleCost(
11112 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11113 FixedVectorType *SubVecTy =
11114 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
11115 InstructionCost NewShuffleCost =
11116 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
11117 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11118 (Mask.empty() || InsertCost >= NewShuffleCost))
11119 return false;
11120 if ((LocalState.getMainOp()->isBinaryOp() &&
11121 LocalState.getAltOp()->isBinaryOp() &&
11122 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11123 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11124 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11125 (LocalState.getMainOp()->isUnaryOp() &&
11126 LocalState.getAltOp()->isUnaryOp())) {
11127 InstructionCost OriginalVecOpsCost =
11128 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11129 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11130 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11131 for (unsigned Idx : seq<unsigned>(VL.size())) {
11132 if (isa<PoisonValue>(VL[Idx]))
11133 continue;
11134 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11135 }
11136 InstructionCost OriginalCost =
11137 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11138 VecTy, OriginalMask, Kind);
11139 InstructionCost NewVecOpsCost =
11140 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11141 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11142 InstructionCost NewCost =
11143 NewVecOpsCost + InsertCost +
11144 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11145 VectorizableTree.front()->getOpcode() == Instruction::Store
11146 ? NewShuffleCost
11147 : 0);
11148 // If not profitable to split - exit.
11149 if (NewCost >= OriginalCost)
11150 return false;
11151 }
11152 return true;
11153}
11154
11155namespace {
11156/// Class accepts incoming list of values, checks if it is able to model
11157/// "copyable" values as compatible operations, and generates the list of values
11158/// for scheduling and list of operands for the new nodes.
11159class InstructionsCompatibilityAnalysis {
11160 DominatorTree &DT;
11161 const DataLayout &DL;
11162 const TargetTransformInfo &TTI;
11163 const TargetLibraryInfo &TLI;
11164 unsigned MainOpcode = 0;
11165 Instruction *MainOp = nullptr;
11166
11167 /// Checks if the opcode is supported as the main opcode for copyable
11168 /// elements.
11169 static bool isSupportedOpcode(const unsigned Opcode) {
11170 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11171 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11172 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11173 Opcode == Instruction::And || Opcode == Instruction::Or ||
11174 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11175 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11176 Opcode == Instruction::FDiv;
11177 }
11178
11179 /// Identifies the best candidate value, which represents main opcode
11180 /// operation.
11181 /// Currently the best candidate is the Add instruction with the parent
11182 /// block with the highest DFS incoming number (block, that dominates other).
11183 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11184 BasicBlock *Parent = nullptr;
11185 // Checks if the instruction has supported opcode.
11186 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
// Reject div/rem-like ops and calls once any undef lane was seen in VL,
// since padding such operations with undef operands is not safe.
11187 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11188 return false;
11189 return I && isSupportedOpcode(I->getOpcode()) &&
11190 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11191 };
11192 // Exclude operands instructions immediately to improve compile time, it
11193 // will be unable to schedule anyway.
11194 SmallDenseSet<Value *, 8> Operands;
// Candidate instructions bucketed by opcode; all candidates come from a
// single basic block (the current Parent).
11195 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11196 bool AnyUndef = false;
11197 for (Value *V : VL) {
11198 auto *I = dyn_cast<Instruction>(V);
11199 if (!I) {
11200 AnyUndef |= isa<UndefValue>(V);
11201 continue;
11202 }
11203 if (!DT.isReachableFromEntry(I->getParent()))
11204 continue;
11205 if (Candidates.empty()) {
11206 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11207 Parent = I->getParent();
11208 Operands.insert(I->op_begin(), I->op_end());
11209 continue;
11210 }
11211 if (Parent == I->getParent()) {
11212 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11213 Operands.insert(I->op_begin(), I->op_end());
11214 continue;
11215 }
11216 auto *NodeA = DT.getNode(Parent);
11217 auto *NodeB = DT.getNode(I->getParent());
11218 assert(NodeA && "Should only process reachable instructions");
11219 assert(NodeB && "Should only process reachable instructions");
11220 assert((NodeA == NodeB) ==
11221 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11222 "Different nodes should have different DFS numbers");
// Prefer the block with the higher DFS-in number (per the doc comment, the
// dominated block); restart candidate collection from scratch there.
11223 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11224 Candidates.clear();
11225 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11226 Parent = I->getParent();
11227 Operands.clear();
11228 Operands.insert(I->op_begin(), I->op_end());
11229 }
11230 }
// Pick the opcode bucket with the most supported candidates; buckets whose
// members are all only used outside the block take precedence over others.
11231 unsigned BestOpcodeNum = 0;
11232 MainOp = nullptr;
11233 bool UsedOutside = false;
11234 for (const auto &P : Candidates) {
11235 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
11236 if (UsedOutside && !PUsedOutside)
11237 continue;
// Switching from mixed-use to all-used-outside buckets resets the count.
11238 if (!UsedOutside && PUsedOutside)
11239 BestOpcodeNum = 0;
11240 if (P.second.size() < BestOpcodeNum)
11241 continue;
11242 // If have inner dependencies - skip.
11243 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11244 return Operands.contains(I);
11245 }))
11246 continue;
11247 UsedOutside = PUsedOutside;
// First supported instruction of the bucket becomes the main operation.
11248 for (Instruction *I : P.second) {
11249 if (IsSupportedInstruction(I, AnyUndef)) {
11250 MainOp = I;
11251 BestOpcodeNum = P.second.size();
11252 break;
11253 }
11254 }
11255 }
11256 if (MainOp) {
11257 // Do not match, if any copyable is a terminator from the same block as
11258 // the main operation.
11259 if (any_of(VL, [&](Value *V) {
11260 auto *I = dyn_cast<Instruction>(V);
11261 return I && I->getParent() == MainOp->getParent() &&
11262 I->isTerminator();
11263 })) {
11264 MainOp = nullptr;
11265 return;
11266 }
11267 MainOpcode = MainOp->getOpcode();
11268 }
11269 }
11270
11271 /// Returns the idempotent value for the \p MainOp with the detected \p
11272 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11273 /// the operand itself, since V or V == V.
11274 Value *selectBestIdempotentValue() const {
11275 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11276 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11277 !MainOp->isCommutative());
11278 }
11279
11280 /// Returns the value and operands for the \p V, considering if it is original
11281 /// instruction and its actual operands should be returned, or it is a
11282 /// copyable element and its should be represented as idempotent instruction.
11283 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11284 if (isa<PoisonValue>(V))
11285 return {V, V};
11286 if (!S.isCopyableElement(V))
11287 return convertTo(cast<Instruction>(V), S).second;
11288 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11289 return {V, selectBestIdempotentValue()};
11290 }
11291
11292 /// Builds operands for the original instructions.
11293 void
11294 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11295 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11296
11297 unsigned ShuffleOrOp =
11298 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11299 Instruction *VL0 = S.getMainOp();
11300
11301 switch (ShuffleOrOp) {
11302 case Instruction::PHI: {
11303 auto *PH = cast<PHINode>(VL0);
11304
11305 // Keeps the reordered operands to avoid code duplication.
11306 PHIHandler Handler(DT, PH, VL);
11307 Handler.buildOperands();
11308 Operands.assign(PH->getNumOperands(), {});
11309 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11310 Operands[I].assign(Handler.getOperands(I).begin(),
11311 Handler.getOperands(I).end());
11312 return;
11313 }
11314 case Instruction::ExtractValue:
11315 case Instruction::ExtractElement:
11316 // This is a special case, as it does not gather, but at the same time
11317 // we are not extending buildTree_rec() towards the operands.
11318 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11319 return;
11320 case Instruction::InsertElement:
11321 Operands.assign(2, {VL.size(), nullptr});
11322 for (auto [Idx, V] : enumerate(VL)) {
11323 auto *IE = cast<InsertElementInst>(V);
11324 for (auto [OpIdx, Ops] : enumerate(Operands))
11325 Ops[Idx] = IE->getOperand(OpIdx);
11326 }
11327 return;
11328 case Instruction::Load:
11329 Operands.assign(
11330 1, {VL.size(),
11331 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11332 for (auto [V, Op] : zip(VL, Operands.back())) {
11333 auto *LI = dyn_cast<LoadInst>(V);
11334 if (!LI)
11335 continue;
11336 Op = LI->getPointerOperand();
11337 }
11338 return;
11339 case Instruction::ZExt:
11340 case Instruction::SExt:
11341 case Instruction::FPToUI:
11342 case Instruction::FPToSI:
11343 case Instruction::FPExt:
11344 case Instruction::PtrToInt:
11345 case Instruction::IntToPtr:
11346 case Instruction::SIToFP:
11347 case Instruction::UIToFP:
11348 case Instruction::Trunc:
11349 case Instruction::FPTrunc:
11350 case Instruction::BitCast:
11351 case Instruction::ICmp:
11352 case Instruction::FCmp:
11353 case Instruction::FNeg:
11354 case Instruction::Add:
11355 case Instruction::FAdd:
11356 case Instruction::Sub:
11357 case Instruction::FSub:
11358 case Instruction::Mul:
11359 case Instruction::FMul:
11360 case Instruction::UDiv:
11361 case Instruction::SDiv:
11362 case Instruction::FDiv:
11363 case Instruction::URem:
11364 case Instruction::SRem:
11365 case Instruction::FRem:
11366 case Instruction::Shl:
11367 case Instruction::LShr:
11368 case Instruction::AShr:
11369 case Instruction::And:
11370 case Instruction::Or:
11371 case Instruction::Xor:
11372 case Instruction::Freeze:
11373 case Instruction::Store:
11374 case Instruction::ShuffleVector:
11375 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11376 for (auto [Idx, V] : enumerate(VL)) {
11377 auto *I = dyn_cast<Instruction>(V);
11378 if (!I) {
11379 for (auto [OpIdx, Ops] : enumerate(Operands))
11380 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11381 continue;
11382 }
11383 auto [Op, ConvertedOps] = convertTo(I, S);
11384 for (auto [OpIdx, Ops] : enumerate(Operands))
11385 Ops[Idx] = ConvertedOps[OpIdx];
11386 }
11387 return;
11388 case Instruction::Select:
11389 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11390 for (auto [Idx, V] : enumerate(VL)) {
11391 auto *I = dyn_cast<Instruction>(V);
11392 if (!I) {
11393 for (auto [OpIdx, Ops] : enumerate(Operands))
11394 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11395 continue;
11396 }
11397 if (isa<ZExtInst>(I)) {
11398 // Special case for select + zext i1 to avoid explosion of different
11399 // types. We want to keep the condition as i1 to be able to match
11400 // different selects together and reuse the vectorized condition
11401 // rather than trying to gather it.
11402 Operands[0][Idx] = I->getOperand(0);
11403 Operands[1][Idx] = ConstantInt::get(I->getType(), 1);
11404 Operands[2][Idx] = ConstantInt::getNullValue(I->getType());
11405 continue;
11406 }
11407 auto [Op, ConvertedOps] = convertTo(I, S);
11408 for (auto [OpIdx, Ops] : enumerate(Operands))
11409 Ops[Idx] = ConvertedOps[OpIdx];
11410 }
11411 return;
11412 case Instruction::GetElementPtr: {
11413 Operands.assign(2, {VL.size(), nullptr});
11414 // Need to cast all indices to the same type before vectorization to
11415 // avoid crash.
11416 // Required to be able to find correct matches between different gather
11417 // nodes and reuse the vectorized values rather than trying to gather them
11418 // again.
11419 const unsigned IndexIdx = 1;
11420 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11421 Type *Ty =
11422 all_of(VL,
11423 [&](Value *V) {
11425 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11426 })
11427 ? VL0Ty
11428 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11429 ->getPointerOperandType()
11430 ->getScalarType());
11431 for (auto [Idx, V] : enumerate(VL)) {
11433 if (!GEP) {
11434 Operands[0][Idx] = V;
11435 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11436 continue;
11437 }
11438 Operands[0][Idx] = GEP->getPointerOperand();
11439 auto *Op = GEP->getOperand(IndexIdx);
11440 auto *CI = dyn_cast<ConstantInt>(Op);
11441 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11442 CI, Ty, CI->getValue().isSignBitSet(), DL)
11443 : Op;
11444 }
11445 return;
11446 }
11447 case Instruction::Call: {
11448 auto *CI = cast<CallInst>(VL0);
11450 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11452 continue;
11453 auto &Ops = Operands.emplace_back();
11454 for (Value *V : VL) {
11455 auto *I = dyn_cast<Instruction>(V);
11456 Ops.push_back(I ? I->getOperand(Idx)
11457 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11458 }
11459 }
11460 return;
11461 }
11462 default:
11463 break;
11464 }
11465 llvm_unreachable("Unexpected vectorization of the instructions.");
11466 }
11467
11468 /// Check if the specified \p VL list of values is better to represent as
11469 /// uniform with copyables, as modeled via \p CopyableS, or as alternate (or
11470 /// uniform with compatible ops), modeled via \p S.
11471 /// Performs the analysis of the operands, choosing the preferred main
11472 /// instruction and checking the matching of the operands for the main
11473 /// instruction and copyable elements.
11474 bool isCopyablePreferable(ArrayRef<Value *> VL, const BoUpSLP &R,
11475 const InstructionsState &S,
11476 const InstructionsState &CopyableS) {
11477 // If all elements are vectorized already - keep as is.
11478 if (all_of(VL, [&](Value *V) {
11479 return isa<PoisonValue>(V) || R.isVectorized(V);
11480 }))
11481 return false;
11482 Instruction *SMain = S.getMainOp();
11483 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() : nullptr;
11484 const bool IsCommutative = ::isCommutative(SMain);
11485 const bool IsAltCommutative =
11486 S.isAltShuffle() ? ::isCommutative(SAlt) : false;
11487 const bool IsMainCommutative = ::isCommutative(MainOp);
11489 buildOriginalOperands(S, SMain, Ops);
11490 // Support only binary operations for now.
11491 if (Ops.size() != 2)
11492 return false;
11493 // Try to find better candidate for S main instruction, which operands have
11494 // better matching.
11495 auto CheckOperands = [](Value *Op, Value *SMainOp) {
11496 auto *OpI = dyn_cast<BinaryOperator>(Op);
11497 if (!OpI)
11498 return false;
11499 auto *SMainOpI = dyn_cast<BinaryOperator>(SMainOp);
11500 if (!SMainOpI)
11501 return true;
11502 return any_of(OpI->operands(), [&](Value *V) {
11503 auto *I = dyn_cast<Instruction>(V);
11504 return I && I->getOpcode() == SMainOpI->getOpcode();
11505 });
11506 };
11507 SmallPtrSet<Value *, 8> Operands;
11508 for (Value *V : VL) {
11509 auto *I = dyn_cast<Instruction>(V);
11510 if (!I || I == SMain)
11511 continue;
11512 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(I);
11513 if (MatchingOp != SMain)
11514 continue;
11516 buildOriginalOperands(S, I, VOps);
11517 Operands.insert(I->op_begin(), I->op_end());
11518 assert(VOps.size() == 2 && Ops.size() == 2 &&
11519 "Expected binary operations only.");
11520 if (CheckOperands(VOps[0][0], Ops[0][0]) ||
11521 CheckOperands(VOps[1][0], Ops[1][0]) ||
11522 (IsCommutative && (CheckOperands(VOps[0][0], Ops[1][0]) ||
11523 CheckOperands(VOps[1][0], Ops[0][0])))) {
11524 SMain = I;
11525 Ops.swap(VOps);
11526 break;
11527 }
11528 }
11530 buildOriginalOperands(S, MainOp, MainOps);
11531
11532 auto BuildFirstOperandCandidates =
11533 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11535 bool IsCommutative) {
11536 Candidates.emplace_back(Ops[0][0], Op0);
11537 if (IsCommutative)
11538 Candidates.emplace_back(Ops[0][0], Op1);
11539 };
11540
11541 auto BuildSecondOperandCandidates =
11542 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11543 ArrayRef<BoUpSLP::ValueList> Ops, int PrevBestIdx, Value *Op0,
11544 Value *Op1, bool IsCommutative) {
11545 if (PrevBestIdx != 1)
11546 Candidates.emplace_back(Ops[1][0], Op1);
11547 if (PrevBestIdx != 0 && IsCommutative)
11548 Candidates.emplace_back(Ops[1][0], Op0);
11549 };
11550
11551 auto FindBestCandidate =
11552 [&](ArrayRef<std::pair<Value *, Value *>> Candidates, bool &IsConst,
11553 int &Score) {
11554 auto Res = R.findBestRootPair(Candidates);
11555 Score = Res.second;
11556 IsConst =
11558 isConstant(Candidates[Res.first.value_or(0)].first) &&
11559 isConstant(Candidates[Res.first.value_or(0)].second);
11560 if (IsConst) {
11561 // Check if there are splat candidates and consider them better
11562 // option.
11563 for (const auto [Idx, P] : enumerate(Candidates)) {
11564 if (!isConstant(P.first) && !isConstant(P.second) &&
11565 P.second == P.first) {
11566 Res.first = Idx;
11567 IsConst = false;
11568 Score = isa<LoadInst>(Candidates[Res.first.value_or(0)].first)
11571 break;
11572 }
11573 }
11574 }
11575 return Res.first;
11576 };
11577
11578 for (Value *V : VL) {
11579 auto *I = dyn_cast<Instruction>(V);
11580 if (!I || (I == MainOp && (!S.isAltShuffle() || I == SMain)) ||
11581 (!S.isAltShuffle() && I == SMain))
11582 continue;
11584 buildOriginalOperands(S, I == SMain ? MainOp : I, VOps);
11585 SmallVector<Value *> CopyableOps =
11586 getOperands(CopyableS, I == MainOp ? SMain : I);
11587 if (CopyableOps.size() == VOps.size() &&
11588 all_of(zip(CopyableOps, VOps), [&](const auto &P) {
11589 return std::get<0>(P) == std::get<1>(P)[0];
11590 }))
11591 continue;
11593 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
11594 CopyableOps[1], IsMainCommutative);
11595 const unsigned OpSize = Candidates.size();
11596 Instruction *MatchingOp =
11597 S.getMatchingMainOpOrAltOp(I) == S.getMainOp() ? SMain : SAlt;
11598 const bool IsCommutativeInst =
11599 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
11600 ::isCommutative(I, MatchingOp);
11601 if (S.isAltShuffle() && MatchingOp == SAlt &&
11602 any_of(VOps, [&](const BoUpSLP::ValueList &Ops) {
11603 auto *I = dyn_cast<BinaryOperator>(Ops[0]);
11604 return I && Operands.contains(I);
11605 }))
11606 return false;
11607 if (S.isAltShuffle() && MatchingOp == SMain)
11608 Operands.insert(I->op_begin(), I->op_end());
11609 BuildFirstOperandCandidates(Candidates, Ops, VOps[0][0], VOps[1][0],
11610 IsCommutativeInst);
11611 bool IsBestConst;
11612 int Score;
11613 std::optional<int> BestOp =
11614 FindBestCandidate(Candidates, IsBestConst, Score);
11615 const bool IsOriginalBetter =
11616 static_cast<unsigned>(BestOp.value_or(OpSize)) >= OpSize;
11617 Candidates.clear();
11618 BuildSecondOperandCandidates(
11619 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
11620 CopyableOps[1], IsMainCommutative);
11621 const unsigned SecondOpSize = Candidates.size();
11622 BuildSecondOperandCandidates(
11623 Candidates, Ops,
11624 IsOriginalBetter ? BestOp.value_or(OpSize - 1) - OpSize : -1,
11625 VOps[0][0], VOps[1][0], IsCommutativeInst);
11626 bool IsSecondBestConst;
11627 int SecondScore;
11628 std::optional<int> SecondBestOp =
11629 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
11630 // No best candidates.
11631 if (!BestOp && !SecondBestOp)
11632 return false;
11633 // Original better in both ops combinations.
11634 const bool IsSecondOriginalBetter =
11635 static_cast<unsigned>(SecondBestOp.value_or(SecondOpSize)) >=
11636 SecondOpSize;
11637 if (IsOriginalBetter && IsSecondOriginalBetter)
11638 return false;
11639 // Original is better in second combination, but in the first combination
11640 // no best candidates.
11641 if (!BestOp && IsSecondOriginalBetter)
11642 return false;
11643 // Original is better in first combination, but in the second combination
11644 // no best candidates.
11645 if (!SecondBestOp && IsOriginalBetter)
11646 return false;
11647 // Copyable is best in the first combination, but it is constant, but
11648 // original is better in second non-constant combination.
11649 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
11650 !IsSecondBestConst)
11651 return false;
11652 // Copyable is best in the second combination, but it is constant, but
11653 // original is better in the first non-constant combination.
11654 if (BestOp && IsOriginalBetter && !IsBestConst &&
11655 !IsSecondOriginalBetter && IsSecondBestConst)
11656 return false;
11657 // Original combination score is better.
11658 if (((Score > SecondScore ||
11660 Score == SecondScore)) &&
11661 IsOriginalBetter) ||
11662 (IsSecondOriginalBetter &&
11663 (SecondScore > Score ||
11665 Score == SecondScore))))
11666 return false;
11667 }
11668 return true;
11669 }
11670
11671public:
/// Captures the analyses used to model copyable operations; the referenced
/// objects must outlive this analysis instance.
11672 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11673 const TargetTransformInfo &TTI,
11674 const TargetLibraryInfo &TLI)
11675 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11676
11677 InstructionsState buildInstructionsState(ArrayRef<Value *> VL,
11678 const BoUpSLP &R,
11679 bool WithProfitabilityCheck = false,
11680 bool SkipSameCodeCheck = false) {
11681 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11682 ? InstructionsState::invalid()
11683 : getSameOpcode(VL, TLI);
11684 // Check if series of selects + zext i1 %x to in can be combined into
11685 // selects + select %x, i32 1, i32 0.
11686 Instruction *SelectOp = nullptr;
11687 if (!S && allSameBlock(VL) && all_of(VL, [&](Value *V) {
11688 if (match(V, m_Select(m_Value(), m_Value(), m_Value()))) {
11689 if (!SelectOp)
11690 SelectOp = cast<Instruction>(V);
11691 return true;
11692 }
11693 auto *ZExt = dyn_cast<ZExtInst>(V);
11694 return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
11696 })) {
11697 if (SelectOp)
11698 return InstructionsState(SelectOp, SelectOp);
11699 }
11700 if (S && S.isAltShuffle()) {
11701 Type *ScalarTy = S.getMainOp()->getType();
11702 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11703 unsigned Opcode0 = S.getOpcode();
11704 unsigned Opcode1 = S.getAltOpcode();
11705 SmallBitVector OpcodeMask(
11706 getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11707 // If this pattern is supported by the target then we consider the order.
11708 if (TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
11709 return S;
11710 } else if (S && (!VectorizeCopyableElements ||
11711 !isa<BinaryOperator>(S.getMainOp()) ||
11712 all_of(VL, [&](Value *V) {
11713 auto *I = dyn_cast<Instruction>(V);
11714 return !I || I->getOpcode() == S.getOpcode();
11715 }))) {
11716 return S;
11717 }
11719 return S;
11720 findAndSetMainInstruction(VL, R);
11721 if (!MainOp)
11722 return S;
11723 InstructionsState OrigS = S;
11724 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11725 if (OrigS && !isCopyablePreferable(VL, R, OrigS, S))
11726 return OrigS;
11727 if (!WithProfitabilityCheck)
11728 return S;
11729 // Check if it is profitable to vectorize the instruction.
11730 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11731 auto BuildCandidates =
11732 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11733 Value *V2) {
11734 if (V1 != V2 && isa<PHINode>(V1))
11735 return;
11736 auto *I1 = dyn_cast<Instruction>(V1);
11737 auto *I2 = dyn_cast<Instruction>(V2);
11738 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11739 I1->getParent() != I2->getParent())
11740 return;
11741 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11742 };
11743 if (VL.size() == 2) {
11744 // Check if the operands allow better vectorization.
11745 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11746 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11747 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11748 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11749 R.findBestRootPair(Candidates1).first &&
11750 R.findBestRootPair(Candidates2).first;
11751 if (!Res && isCommutative(MainOp)) {
11752 Candidates1.clear();
11753 Candidates2.clear();
11754 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11755 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11756 Res = !Candidates1.empty() && !Candidates2.empty() &&
11757 R.findBestRootPair(Candidates1).first &&
11758 R.findBestRootPair(Candidates2).first;
11759 }
11760 if (!Res)
11761 return OrigS;
11763 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11764 InstructionCost VectorCost;
11765 FixedVectorType *VecTy =
11766 getWidenedType(S.getMainOp()->getType(), VL.size());
11767 switch (MainOpcode) {
11768 case Instruction::Add:
11769 case Instruction::Sub:
11770 case Instruction::LShr:
11771 case Instruction::Shl:
11772 case Instruction::SDiv:
11773 case Instruction::UDiv:
11774 case Instruction::And:
11775 case Instruction::Or:
11776 case Instruction::Xor:
11777 case Instruction::FAdd:
11778 case Instruction::FMul:
11779 case Instruction::FSub:
11780 case Instruction::FDiv:
11781 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11782 break;
11783 default:
11784 llvm_unreachable("Unexpected instruction.");
11785 }
11786 if (VectorCost > ScalarCost)
11787 return OrigS;
11788 return S;
11789 }
11790 assert(Operands.size() == 2 && "Unexpected number of operands!");
11791 unsigned CopyableNum =
11792 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11793 if (CopyableNum < VL.size() / 2)
11794 return S;
11795 // Too many phi copyables - exit.
11796 const unsigned Limit = VL.size() / 24;
11797 if ((CopyableNum >= VL.size() - Limit ||
11798 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11799 CopyableNum >= MaxPHINumOperands) &&
11800 all_of(VL, [&](Value *V) {
11801 return isa<PHINode>(V) || !S.isCopyableElement(V);
11802 }))
11803 return OrigS;
11804 // Check profitability if number of copyables > VL.size() / 2.
11805 // 1. Reorder operands for better matching.
11806 if (isCommutative(MainOp)) {
11807 for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
11808 // Make instructions the first operands.
11809 if (!isa<Instruction>(OpL) && isa<Instruction>(OpR)) {
11810 std::swap(OpL, OpR);
11811 continue;
11812 }
11813 // Make constants the second operands.
11814 if ((isa<Constant>(OpL) && !match(OpR, m_Zero())) ||
11815 match(OpL, m_Zero())) {
11816 std::swap(OpL, OpR);
11817 continue;
11818 }
11819 }
11820 }
11821 // 2. Check, if operands can be vectorized.
11822 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11823 return OrigS;
11824 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11825 if (allConstant(Ops) || isSplat(Ops))
11826 return true;
11827 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
11828 // one is different.
11829 constexpr unsigned Limit = 4;
11830 if (Operands.front().size() >= Limit) {
11831 SmallDenseMap<const Value *, unsigned> Counters;
11832 for (Value *V : Ops) {
11833 if (isa<UndefValue>(V))
11834 continue;
11835 ++Counters[V];
11836 }
11837 if (Counters.size() == 2 &&
11838 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11839 return C.second == 1;
11840 }))
11841 return true;
11842 }
11843 // First operand not a constant or splat? Last attempt - check for
11844 // potential vectorization.
11845 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11846 InstructionsState OpS = Analysis.buildInstructionsState(Ops, R);
11847 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11848 return false;
11849 unsigned CopyableNum =
11850 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11851 return CopyableNum <= VL.size() / 2;
11852 };
11853 if (!CheckOperand(Operands.front()))
11854 return OrigS;
11855
11856 return S;
11857 }
11858
11859 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11860 ArrayRef<Value *> VL) {
11861 assert(S && "Invalid state!");
11863 if (S.areInstructionsWithCopyableElements()) {
11864 MainOp = S.getMainOp();
11865 MainOpcode = S.getOpcode();
11866 Operands.assign(MainOp->getNumOperands(),
11867 BoUpSLP::ValueList(VL.size(), nullptr));
11868 for (auto [Idx, V] : enumerate(VL)) {
11869 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11870 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11871 Operands[OperandIdx][Idx] = Operand;
11872 }
11873 } else {
11874 buildOriginalOperands(S, VL, Operands);
11875 }
11876 return Operands;
11877 }
11878};
11879} // namespace
11880
/// Decides whether the bundle \p VL can legally become a single vectorizable
/// tree node at recursion depth \p Depth under the user edge \p UserTreeIdx.
/// Returns a ScalarsVectorizationLegality carrying the computed
/// InstructionsState plus flags telling the caller whether to gather instead,
/// whether to retry after de-duplication, and whether a split vectorization
/// attempt is worthwhile.
BoUpSLP::ScalarsVectorizationLegality
BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
                                         const EdgeInfo &UserTreeIdx) const {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, /*WithProfitabilityCheck=*/true);

  bool AreScatterAllGEPSameBlock = false;
  if (!S) {
    // No common state: the only remaining acceptable shape is a bundle of
    // two-operand GEPs from one block feeding a ScatterVectorize user whose
    // pointers can be sorted; detect that here.
    SmallVector<unsigned> SortedIndices;
    BasicBlock *BB = nullptr;
    bool IsScatterVectorizeUserTE =
        UserTreeIdx.UserTE &&
        UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
    AreScatterAllGEPSameBlock =
        (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
         VL.size() > 2 &&
         all_of(VL,
                [&BB](Value *V) {
                  auto *I = dyn_cast<GetElementPtrInst>(V);
                  if (!I)
                    return doesNotNeedToBeScheduled(V);
                  if (!BB)
                    BB = I->getParent();
                  return BB == I->getParent() && I->getNumOperands() == 2;
                }) &&
         BB &&
         sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                         *SE, SortedIndices));
    if (!AreScatterAllGEPSameBlock) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";
                 dbgs() << "[";
                 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
                 dbgs() << "]\n");
      // Not legal as-is, but a split vectorization may still succeed.
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                          /*TryToFindDuplicates=*/true,
                                          /*TrySplitVectorize=*/true);
    }
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }
  assert(S && "Must be valid.");

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    // Do not try to pack to avoid extra instructions here.
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  BasicBlock *BB = S.getMainOp()->getParent();

  // NOTE(review): the leading condition(s) of this 'if' are not visible in
  // this excerpt; only the reachability check shows.
      !DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    // Do not vectorize EH and non-returning blocks, not profitable in most
    // cases.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  // NOTE(review): the 'if' condition line is not visible in this excerpt.
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    // Do not try to pack to avoid extra instructions here.
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // Don't handle scalable vectors
  if (S.getOpcode() == Instruction::ExtractElement &&
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
  // a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      (S.isAltShuffle() || VL.size() < 4 ||
       !(match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Check if this is a duplicate of another entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      // Exactly the same bundle is already a tree entry - reuse it instead
      // of creating a second node for the same scalars.
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                        << ".\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
    SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
    if (all_of(VL, [&](Value *V) {
          return isa<PoisonValue>(V) || Values.contains(V) ||
                 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
                  LI->getLoopFor(S.getMainOp()->getParent()) &&
                  isVectorized(V));
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  // NOTE(review): part of this condition is not visible in this excerpt.
  if (!AreAllSameInsts || isSplat(VL) ||
          S.getMainOp()) &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O conditions. \n";
               dbgs() << "[";
               interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
               dbgs() << "]\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't vectorize ephemeral values.
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        // Do not try to pack to avoid extra instructions here.
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                            /*TryToFindDuplicates=*/false);
      }
    }
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check that none of the instructions in the bundle are already in the tree
  // and the node may be not profitable for the vectorization as the small
  // alternate node.
  if (S.isAltShuffle()) {
    // Returns a pair of bitmasks over VL: lanes still to be vectorized and
    // lanes that would need an extract because of external users.
    auto GetNumVectorizedExtracted = [&]() {
      APInt Extracted = APInt::getZero(VL.size());
      APInt Vectorized = APInt::getAllOnes(VL.size());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || doesNotNeedToBeScheduled(I) ||
            all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
            }))
          continue;
        if (isVectorized(I))
          Vectorized.clearBit(Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
          Extracted.setBit(Idx);
      }
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    // NOTE(review): the declaration of 'Kind' (TTI cost kind) used below is
    // not visible in this excerpt - presumably TCK_RecipThroughput; confirm.
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rough cost estimation, if the vector code (+ potential extracts) is
      // more profitable than the scalar + buildvector.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VL.size());
      InstructionCost VectorizeCostEstimate =
          ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
          ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
                                     /*Insert=*/false, /*Extract=*/true, Kind);
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Vectorized,
          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  // All checks passed - the bundle is legal to vectorize.
  return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
}
12090
/// Recursively builds the vectorization tree for the bundle \p VLRef at
/// recursion depth \p Depth, attached to its user node via \p UserTreeIdx.
/// \p InterleaveFactor is forwarded to vectorized load tree entries.
/// Creates either a vectorizable TreeEntry for the bundle (dispatching on the
/// common opcode) and recurses into its operands, or a gather/split entry
/// when vectorization is not legal or not schedulable.
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  // Local mutable copy: lanes may be reordered/padded below.
  SmallVector<Value *> VL(VLRef);

  // Tries to build split node.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;

    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    // Attach one half of the split bundle as a child node; Idx selects the
    // half, and the offset is 0 for the first half and Op1.size() for the
    // second.
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        buildTreeRec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  // True if VL contains at least one constant and otherwise only
  // PHIs/poison - such bundles are gathered directly.
  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      if (isa<Constant>(V)) {
        AreConsts = true;
        continue;
      }
      if (!isa<PHINode>(V))
        return false;
    }
    return AreConsts;
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  // Legality gate: decides between vectorize / gather / split attempts.
  ScalarsVectorizationLegality Legality =
      getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (Legality.tryToFindDuplicates())
      tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx);

    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;

  // Check that every instruction appears once in this bundle.
  if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
                           /*TryPad=*/true)) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Check the loop nest. We need to be sure we handle a single loop nest at a
  // time to avoid incorrect cost estimation because of the loop aware cost
  // model.
  if (VectorizableTree.empty()) {
    assert(CurrentLoopNest.empty() && "Expected empty loop nest");
    // Process the first node? Initial fill of the loop nest.
    BasicBlock *Parent = S.getMainOp()->getParent();
    if (const Loop *L = LI->getLoopFor(Parent)) {
      // NOTE(review): a line between the lookup and this check is not
      // visible in this excerpt.
      if (L)
        CurrentLoopNest.assign(getLoopNest(L));
    }
  } else if (!UserTreeIdx ||
             UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
             UserTreeIdx.UserTE->isGather() ||
             UserTreeIdx.UserTE->getMainOp()->getParent() !=
                 S.getMainOp()->getParent()) {
    BasicBlock *Parent = S.getMainOp()->getParent();
    if (const Loop *L = LI->getLoopFor(Parent)) {
      // Check that the new loop nest is not involved.
      // Otherwise, mark it as a gather node.
      if (L) {
        SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
        // Any divergence between the recorded and the new nest means this
        // bundle belongs to a different loop nest - gather it.
        for (const auto [L1, L2] : zip(CurrentLoopNest, NewLoopNest)) {
          if (L1 != L2) {
            LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n");
            newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
            return;
          }
        }
        // Extend the recorded nest with the deeper part of the new one.
        if (NewLoopNest.size() > CurrentLoopNest.size())
          CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
                                 NewLoopNest.end());
      }
    }
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  // Lazily create the per-block scheduling state.
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  SetVector<Value *> UniqueValues(llvm::from_range, VL);
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    // NOTE(review): the body of this 'if' is not visible in this excerpt.
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
  // Dispatch on the bundle's common opcode to build the right entry kind.
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
      fixupOrderingIndices(CurrentOrder);
    }
    // Insert new order with initial value 0, if it does not exist,
    // otherwise return the iterator to the existing one.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTreeRec() towards the operands.
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    // Sort lanes by their insert index (min-heap on the index).
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    // Empty order means identity - no reordering needed.
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
            << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
            TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with (masked)load + compress.
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    // Apply the jumbled-load order to the pointer operand lanes.
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      inversePermutation(CurrentOrder, Mask);
      reorderScalars(Operands.front(), Mask);
    }
    TE->setOperands(Operands);
    // Scatter loads recurse into their pointer operands.
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Track the max/min bit widths seen across ext/trunc casts for the
    // min-bitwidth analysis.
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    VLOperands Ops(VL, Operands, S, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      // NOTE(review): the assert condition line is not visible in this
      // excerpt.
             "Commutative Predicate mismatch");
      Ops.reorder();
      Operands.front() = Ops.getVL(0);
      Operands.back() = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (auto [Idx, V] : enumerate(VL)) {
        if (isa<PoisonValue>(V))
          continue;
        auto *Cmp = cast<CmpInst>(V);
        if (Cmp->getPredicate() != P0)
          std::swap(Operands.front()[Idx], Operands.back()[Idx]);
      }
    }
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth, {TE, 0});
    buildTreeRec(Operands.back(), Depth, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      // Record operands whose sign bits allow shrinking to a narrower type.
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    // Commutative ops: reorder operand lanes for better operand matching.
    if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    TE->setOperands(Operands);

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    TE->setOperands(Operands);
    // Only the stored value (operand 0) is vectorized further.
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    if (isCommutative(VL0)) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      // NOTE(review): the scalar-arg condition line is not visible in this
      // excerpt.
        continue;
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
          TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (auto [Idx, V] : enumerate(VL)) {
        if (isa<PoisonValue>(V))
          continue;
        auto *Cmp = cast<CmpInst>(V);

        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(Operands.front()[Idx], Operands.back()[Idx]);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(Operands.front()[Idx], Operands.back()[Idx]);
        }
      }
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
      return;
    }

    if (isa<BinaryOperator>(VL0) || CI) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
12666
/// Computes the number of scalar elements the aggregate type \p T flattens
/// into, recursing through homogeneous structs, arrays, and fixed vectors.
/// Returns 0 when \p T cannot be mapped to a vector: empty or heterogeneous
/// aggregates, invalid element types, or when the widened vector's store size
/// falls outside [MinVecRegSize, MaxVecRegSize] or differs from T's own
/// store size (which would change the memory layout).
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  // NOTE(review): the loop header for this aggregate-unwrapping loop is not
  // visible in this excerpt.
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  // The widened vector must cover exactly the same bits as the original
  // aggregate and fit the target's vector register size bounds.
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
12699
// Checks whether the extractelement/extractvalue instructions in \p VL all
// read from one source vector at compatible offsets, so the source vector can
// be reused directly instead of emitting a gather.
// \param CurrentOrder filled with the extraction order when the extracts are
// a permutation of consecutive lanes; cleared (and true returned) when the
// natural order can be kept, cleared with false on failure.
// \param ResizeAllowed permits the source vector element count to differ from
// VL.size().
12700bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12701 SmallVectorImpl<unsigned> &CurrentOrder,
12702 bool ResizeAllowed) const {
// NOTE(review): original line 12703 (the find_if initializing `It` with the
// first Instruction in VL) is missing from this extraction — confirm
// against the upstream file.
12704 assert(It != VL.end() && "Expected at least one extract instruction.");
12705 auto *E0 = cast<Instruction>(*It);
12706 assert(
// NOTE(review): the opcode condition of this assert (original line 12707) is
// elided here; presumably it checks ExtractElement/ExtractValue opcodes.
12708 "Invalid opcode");
12709 // Check if all of the extracts come from the same vector and from the
12710 // correct offset.
12711 Value *Vec = E0->getOperand(0);
12712
12713 CurrentOrder.clear();
12714
12715 // We have to extract from a vector/aggregate with the same number of elements.
12716 unsigned NElts;
12717 if (E0->getOpcode() == Instruction::ExtractValue) {
12718 NElts = canMapToVector(Vec->getType());
12719 if (!NElts)
12720 return false;
12721 // Check if load can be rewritten as load of vector.
12722 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12723 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12724 return false;
12725 } else {
12726 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12727 }
12728
12729 unsigned E = VL.size();
12730 if (!ResizeAllowed && NElts != E)
12731 return false;
// Collect the constant extract indices; lanes that are non-instructions or
// extract with an undef index stay PoisonMaskElem and are ignored below.
12732 SmallVector<int> Indices(E, PoisonMaskElem);
12733 unsigned MinIdx = NElts, MaxIdx = 0;
12734 for (auto [I, V] : enumerate(VL)) {
12735 auto *Inst = dyn_cast<Instruction>(V);
12736 if (!Inst)
12737 continue;
// All extracts must read from the very same source vector.
12738 if (Inst->getOperand(0) != Vec)
12739 return false;
12740 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12741 if (isa<UndefValue>(EE->getIndexOperand()))
12742 continue;
12743 std::optional<unsigned> Idx = getExtractIndex(Inst);
12744 if (!Idx)
12745 return false;
12746 const unsigned ExtIdx = *Idx;
12747 if (ExtIdx >= NElts)
12748 continue;
12749 Indices[I] = ExtIdx;
12750 if (MinIdx > ExtIdx)
12751 MinIdx = ExtIdx;
12752 if (MaxIdx < ExtIdx)
12753 MaxIdx = ExtIdx;
12754 }
// The used index range must fit into VL.size() lanes.
12755 if (MaxIdx - MinIdx + 1 > E)
12756 return false;
12757 if (MaxIdx + 1 <= E)
12758 MinIdx = 0;
12759
12760 // Check that all of the indices extract from the correct offset.
12761 bool ShouldKeepOrder = true;
12762 // Assign to all items the initial value E + 1 so we can check if the extract
12763 // instruction index was used already.
12764 // Also, later we can check that all the indices are used and we have a
12765 // consecutive access in the extract instructions, by checking that no
12766 // element of CurrentOrder still has value E + 1.
12767 CurrentOrder.assign(E, E);
12768 for (unsigned I = 0; I < E; ++I) {
12769 if (Indices[I] == PoisonMaskElem)
12770 continue;
12771 const unsigned ExtIdx = Indices[I] - MinIdx;
// Duplicate extract index => cannot form a permutation; bail out.
12772 if (CurrentOrder[ExtIdx] != E) {
12773 CurrentOrder.clear();
12774 return false;
12775 }
12776 ShouldKeepOrder &= ExtIdx == I;
12777 CurrentOrder[ExtIdx] = I;
12778 }
// Identity order carries no information; report it via the return value only.
12779 if (ShouldKeepOrder)
12780 CurrentOrder.clear();
12781
12782 return ShouldKeepOrder;
12783}
12784
12785bool BoUpSLP::areAllUsersVectorized(
12786 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12787 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12788 all_of(I->users(), [this](User *U) {
12789 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12790 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12791 });
12792}
12793
12794void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12795 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12796 SmallVectorImpl<Value *> *OpScalars,
12797 SmallVectorImpl<Value *> *AltScalars) const {
12798 unsigned Sz = Scalars.size();
12799 Mask.assign(Sz, PoisonMaskElem);
12800 SmallVector<int> OrderMask;
12801 if (!ReorderIndices.empty())
12802 inversePermutation(ReorderIndices, OrderMask);
12803 for (unsigned I = 0; I < Sz; ++I) {
12804 unsigned Idx = I;
12805 if (!ReorderIndices.empty())
12806 Idx = OrderMask[I];
12807 if (isa<PoisonValue>(Scalars[Idx]))
12808 continue;
12809 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12810 if (IsAltOp(OpInst)) {
12811 Mask[I] = Sz + Idx;
12812 if (AltScalars)
12813 AltScalars->push_back(OpInst);
12814 } else {
12815 Mask[I] = Idx;
12816 if (OpScalars)
12817 OpScalars->push_back(OpInst);
12818 }
12819 }
12820 if (!ReuseShuffleIndices.empty()) {
12821 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12822 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12823 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12824 });
12825 Mask.swap(NewMask);
12826 }
12827}
12828
// NOTE(review): the first line of this definition (original line 12829,
// carrying the function name — presumably isMainInstruction — and its first
// parameters) is missing from this extraction; confirm against upstream.
// Returns true if \p I matches the main (non-alternate) opcode of the
// main/alt pair, per InstructionsState matching.
12830 Instruction *AltOp,
12831 const TargetLibraryInfo &TLI) {
12832 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12833}
12834
// NOTE(review): the first line of this definition (original line 12835,
// carrying the function name — presumably isAlternateInstruction — and its
// first parameters) is missing from this extraction; confirm against
// upstream.
// Decides whether \p I belongs to the alternate operation of a main/alt pair.
12836 Instruction *AltOp,
12837 const TargetLibraryInfo &TLI) {
// Compares are decided by predicate identity or swappability rather than by
// opcode, since cmp main/alt pairs share the same opcode.
12838 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12839 auto *AltCI = cast<CmpInst>(AltOp);
12840 CmpInst::Predicate MainP = MainCI->getPredicate();
12841 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12842 assert(MainP != AltP && "Expected different main/alternate predicates.");
12843 auto *CI = cast<CmpInst>(I);
// Same-or-swapped as main => not alternate; same-or-swapped as alt => is.
12844 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12845 return false;
12846 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12847 return true;
12848 CmpInst::Predicate P = CI->getPredicate();
// NOTE(review): original line 12849 (computing SwappedP, presumably via
// CmpInst::getSwappedPredicate(P)) is missing from this extraction.
12850
12851 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12852 "CmpInst expected to match either main or alternate predicate or "
12853 "their swap.");
// Anything that does not match main (directly or via swap) is alternate.
12854 return MainP != P && MainP != SwappedP;
12855 }
// Non-compare case: defer to generic InstructionsState matching.
12856 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12857}
12858
// Classifies a set of operand values for TTI cost queries: detects
// uniform/non-uniform and constant/non-constant kinds plus power-of-two
// value properties, returning them as a TTI::OperandValueInfo pair.
12859TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
12860 assert(!Ops.empty());
12861 const auto *Op0 = Ops.front();
12862
12863 const bool IsConstant = all_of(Ops, [](Value *V) {
12864 // TODO: We should allow undef elements here
12865 return isConstant(V) && !isa<UndefValue>(V);
12866 });
12867 const bool IsUniform = all_of(Ops, [=](Value *V) {
12868 // TODO: We should allow undef elements here
12869 return V == Op0;
12870 });
12871 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12872 // TODO: We should allow undef elements here
12873 if (auto *CI = dyn_cast<ConstantInt>(V))
12874 return CI->getValue().isPowerOf2();
12875 return false;
12876 });
12877 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12878 // TODO: We should allow undef elements here
12879 if (auto *CI = dyn_cast<ConstantInt>(V))
12880 return CI->getValue().isNegatedPowerOf2();
12881 return false;
12882 });
12883
// NOTE(review): original lines 12884/12886/12888/12890/12892 (declaring VK
// and VP and assigning the OK_UniformConstantValue / OK_NonUniformConstant /
// OK_UniformValue kinds) are missing from this extraction — confirm against
// the upstream file.
12885 if (IsConstant && IsUniform)
12887 else if (IsConstant)
12889 else if (IsUniform)
12891
12893 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12894 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12895
12896 return {VK, VP};
12897}
12898
12899namespace {
12900/// The base class for shuffle instruction emission and shuffle cost estimation.
// NOTE(review): this extraction is missing several original lines inside the
// class (e.g. 12948, 13022, 13077, 13092, 13108, 13140, 13213, 13226); the
// hedged notes below mark the gaps — confirm against the upstream file.
12901class BaseShuffleAnalysis {
12902protected:
// Scalar element type of the tree being shuffled; may itself be a fixed
// vector when REVEC is enabled.
12903 Type *ScalarTy = nullptr;
12904
12905 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12906
12907 /// V is expected to be a vectorized value.
12908 /// When REVEC is disabled, there is no difference between VF and
12909 /// VNumElements.
12910 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12911 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12912 /// of 8.
12913 unsigned getVF(Value *V) const {
12914 assert(V && "V cannot be nullptr");
12915 assert(isa<FixedVectorType>(V->getType()) &&
12916 "V does not have FixedVectorType");
12917 assert(ScalarTy && "ScalarTy cannot be nullptr");
12918 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12919 unsigned VNumElements =
12920 cast<FixedVectorType>(V->getType())->getNumElements();
12921 assert(VNumElements > ScalarTyNumElements &&
12922 "the number of elements of V is not large enough");
12923 assert(VNumElements % ScalarTyNumElements == 0 &&
12924 "the number of elements of V is not a vectorized value");
12925 return VNumElements / ScalarTyNumElements;
12926 }
12927
12928 /// Checks if the mask is an identity mask.
12929 /// \param IsStrict if is true the function returns false if mask size does
12930 /// not match vector size.
12931 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12932 bool IsStrict) {
12933 int Limit = Mask.size();
12934 int VF = VecTy->getNumElements();
12935 int Index = -1;
12936 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12937 return true;
12938 if (!IsStrict) {
12939 // Consider extract subvector starting from index 0.
12940 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12941 Index == 0)
12942 return true;
12943 // All VF-size submasks are identity (e.g.
12944 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12945 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12946 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
// NOTE(review): original line 12948 (the second operand of this ||,
// presumably an identity check on Slice) is missing from this extraction.
12947 return all_of(Slice, equal_to(PoisonMaskElem)) ||
12949 }))
12950 return true;
12951 }
12952 return false;
12953 }
12954
12955 /// Tries to combine 2 different masks into single one.
12956 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12957 /// change the size of the vector, \p LocalVF is the original size of the
12958 /// shuffled vector.
12959 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12960 ArrayRef<int> ExtMask) {
12961 unsigned VF = Mask.size();
12962 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12963 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12964 if (ExtMask[I] == PoisonMaskElem)
12965 continue;
// Compose the permutations: route ExtMask through Mask, folding indices
// back into the original (LocalVF-sized) source vector.
12966 int MaskedIdx = Mask[ExtMask[I] % VF];
12967 NewMask[I] =
12968 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12969 }
12970 Mask.swap(NewMask);
12971 }
12972
12973 /// Looks through shuffles trying to reduce final number of shuffles in the
12974 /// code. The function looks through the previously emitted shuffle
12975 /// instructions and properly mark indices in mask as undef.
12976 /// For example, given the code
12977 /// \code
12978 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12979 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12980 /// \endcode
12981 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12982 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12983 /// <0, 1, 2, 3> for the shuffle.
12984 /// If 2 operands are of different size, the smallest one will be resized and
12985 /// the mask recalculated properly.
12986 /// For example, given the code
12987 /// \code
12988 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12989 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12990 /// \endcode
12991 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12992 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12993 /// <0, 1, 2, 3> for the shuffle.
12994 /// So, it tries to transform permutations to simple vector merge, if
12995 /// possible.
12996 /// \param V The input vector which must be shuffled using the given \p Mask.
12997 /// If the better candidate is found, \p V is set to this best candidate
12998 /// vector.
12999 /// \param Mask The input mask for the shuffle. If the best candidate is found
13000 /// during looking-through-shuffles attempt, it is updated accordingly.
13001 /// \param SinglePermute true if the shuffle operation is originally a
13002 /// single-value-permutation. In this case the look-through-shuffles procedure
13003 /// may look for resizing shuffles as the best candidates.
13004 /// \return true if the shuffle results in the non-resizing identity shuffle
13005 /// (and thus can be ignored), false - otherwise.
13006 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
13007 bool SinglePermute) {
13008 Value *Op = V;
13009 ShuffleVectorInst *IdentityOp = nullptr;
13010 SmallVector<int> IdentityMask;
13011 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
13012 // Exit if not a fixed vector type or changing size shuffle.
13013 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
13014 if (!SVTy)
13015 break;
13016 // Remember the identity or broadcast mask, if it is not a resizing
13017 // shuffle. If no better candidates are found, this Op and Mask will be
13018 // used in the final shuffle.
13019 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
13020 if (!IdentityOp || !SinglePermute ||
// NOTE(review): original line 13022 (part of this condition, presumably
// comparing shuffle-mask sizes) is missing from this extraction.
13021 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
13023 IdentityMask.size()))) {
13024 IdentityOp = SV;
13025 // Store current mask in the IdentityMask so later we did not lost
13026 // this info if IdentityOp is selected as the best candidate for the
13027 // permutation.
13028 IdentityMask.assign(Mask);
13029 }
13030 }
13031 // Remember the broadcast mask. If no better candidates are found, this Op
13032 // and Mask will be used in the final shuffle.
13033 // Zero splat can be used as identity too, since it might be used with
13034 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
13035 // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
13036 // expensive, the analysis founds out, that the source vector is just a
13037 // broadcast, this original mask can be transformed to identity mask <0,
13038 // 1, 2, 3>.
13039 // \code
13040 // %0 = shuffle %v, poison, zeroinitalizer
13041 // %res = shuffle %0, poison, <3, 1, 2, 0>
13042 // \endcode
13043 // may be transformed to
13044 // \code
13045 // %0 = shuffle %v, poison, zeroinitalizer
13046 // %res = shuffle %0, poison, <0, 1, 2, 3>
13047 // \endcode
13048 if (SV->isZeroEltSplat()) {
13049 IdentityOp = SV;
13050 IdentityMask.assign(Mask);
13051 }
13052 int LocalVF = Mask.size();
13053 if (auto *SVOpTy =
13054 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
13055 LocalVF = SVOpTy->getNumElements();
13056 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
13057 for (auto [Idx, I] : enumerate(Mask)) {
13058 if (I == PoisonMaskElem ||
13059 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
13060 continue;
13061 ExtMask[Idx] = SV->getMaskValue(I);
13062 }
13063 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
13064 SV->getOperand(0),
13065 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
13066 .all();
13067 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
13068 SV->getOperand(1),
13069 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
13070 .all();
// Both operands live: cannot look through this shuffle; stop here.
13071 if (!IsOp1Undef && !IsOp2Undef) {
13072 // Update mask and mark undef elems.
13073 for (int &I : Mask) {
13074 if (I == PoisonMaskElem)
13075 continue;
// NOTE(review): original line 13077 (the right-hand side of this
// comparison, presumably PoisonMaskElem) is missing from this extraction.
13076 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
13078 I = PoisonMaskElem;
13079 }
13080 break;
13081 }
13082 SmallVector<int> ShuffleMask(SV->getShuffleMask());
13083 combineMasks(LocalVF, ShuffleMask, Mask);
13084 Mask.swap(ShuffleMask);
13085 if (IsOp2Undef)
13086 Op = SV->getOperand(0);
13087 else
13088 Op = SV->getOperand(1);
13089 }
// NOTE(review): original line 13092 (the trailing condition of this if,
// before the opening brace) is missing from this extraction.
13090 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
13091 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
13093 if (IdentityOp) {
13094 V = IdentityOp;
13095 assert(Mask.size() == IdentityMask.size() &&
13096 "Expected masks of same sizes.");
13097 // Clear known poison elements.
13098 for (auto [I, Idx] : enumerate(Mask))
13099 if (Idx == PoisonMaskElem)
13100 IdentityMask[I] = PoisonMaskElem;
13101 Mask.swap(IdentityMask);
13102 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
// NOTE(review): original line 13108 within this return expression is
// missing from this extraction.
13103 return SinglePermute &&
13104 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
13105 /*IsStrict=*/true) ||
13106 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
13107 Shuffle->isZeroEltSplat() &&
13109 all_of(enumerate(Mask), [&](const auto &P) {
13110 return P.value() == PoisonMaskElem ||
13111 Shuffle->getShuffleMask()[P.index()] == 0;
13112 })));
13113 }
13114 V = Op;
13115 return false;
13116 }
13117 V = Op;
13118 return true;
13119 }
13120
13121 /// Smart shuffle instruction emission, walks through shuffles trees and
13122 /// tries to find the best matching vector for the actual shuffle
13123 /// instruction.
13124 template <typename T, typename ShuffleBuilderTy>
13125 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
13126 ShuffleBuilderTy &Builder, Type *ScalarTy) {
13127 assert(V1 && "Expected at least one vector value.");
13128 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13129 SmallVector<int> NewMask(Mask);
// REVEC: expand each scalar-level mask index into per-element indices.
13130 if (ScalarTyNumElements != 1) {
13131 assert(SLPReVec && "FixedVectorType is not expected.");
13132 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
13133 Mask = NewMask;
13134 }
13135 if (V2)
13136 Builder.resizeToMatch(V1, V2);
13137 int VF = Mask.size();
13138 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
13139 VF = FTy->getNumElements();
// NOTE(review): original line 13140 (the `if (V2 && !isUndefVector...`
// header of this two-operand branch) is missing from this extraction.
13141 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
13142 .all()) {
13143 // Peek through shuffles.
13144 Value *Op1 = V1;
13145 Value *Op2 = V2;
13146 int VF =
13147 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13148 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13149 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
// Split the two-source mask into per-operand single-source masks.
13150 for (int I = 0, E = Mask.size(); I < E; ++I) {
13151 if (Mask[I] < VF)
13152 CombinedMask1[I] = Mask[I];
13153 else
13154 CombinedMask2[I] = Mask[I] - VF;
13155 }
13156 Value *PrevOp1;
13157 Value *PrevOp2;
// Iterate until peeking through shuffles reaches a fixed point.
13158 do {
13159 PrevOp1 = Op1;
13160 PrevOp2 = Op2;
13161 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
13162 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
13163 // Check if we have 2 resizing shuffles - need to peek through operands
13164 // again.
13165 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
13166 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
13167 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
13168 for (auto [Idx, I] : enumerate(CombinedMask1)) {
13169 if (I == PoisonMaskElem)
13170 continue;
13171 ExtMask1[Idx] = SV1->getMaskValue(I);
13172 }
13173 SmallBitVector UseMask1 = buildUseMask(
13174 cast<FixedVectorType>(SV1->getOperand(1)->getType())
13175 ->getNumElements(),
13176 ExtMask1, UseMask::SecondArg);
13177 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
13178 for (auto [Idx, I] : enumerate(CombinedMask2)) {
13179 if (I == PoisonMaskElem)
13180 continue;
13181 ExtMask2[Idx] = SV2->getMaskValue(I);
13182 }
13183 SmallBitVector UseMask2 = buildUseMask(
13184 cast<FixedVectorType>(SV2->getOperand(1)->getType())
13185 ->getNumElements(),
13186 ExtMask2, UseMask::SecondArg);
13187 if (SV1->getOperand(0)->getType() ==
13188 SV2->getOperand(0)->getType() &&
13189 SV1->getOperand(0)->getType() != SV1->getType() &&
13190 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
13191 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
13192 Op1 = SV1->getOperand(0);
13193 Op2 = SV2->getOperand(0);
13194 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
13195 int LocalVF = ShuffleMask1.size();
13196 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
13197 LocalVF = FTy->getNumElements();
13198 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
13199 CombinedMask1.swap(ShuffleMask1);
13200 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
13201 LocalVF = ShuffleMask2.size();
13202 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
13203 LocalVF = FTy->getNumElements();
13204 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
13205 CombinedMask2.swap(ShuffleMask2);
13206 }
13207 }
13208 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
13209 Builder.resizeToMatch(Op1, Op2);
// NOTE(review): original line 13213 (the second std::max argument,
// presumably cast<VectorType>(Op2->getType())) is missing from this
// extraction.
13210 VF = std::max(cast<VectorType>(Op1->getType())
13211 ->getElementCount()
13212 .getKnownMinValue(),
13214 ->getElementCount()
13215 .getKnownMinValue());
13216 for (int I = 0, E = Mask.size(); I < E; ++I) {
13217 if (CombinedMask2[I] != PoisonMaskElem) {
13218 assert(CombinedMask1[I] == PoisonMaskElem &&
13219 "Expected undefined mask element");
13220 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
13221 }
13222 }
// NOTE(review): original line 13226 (part of this condition) is missing
// from this extraction.
13223 if (Op1 == Op2 &&
13224 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
13225 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
13227 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
13228 ArrayRef(CombinedMask1))))
13229 return Builder.createIdentity(Op1);
13230 return Builder.createShuffleVector(
13231 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
13232 CombinedMask1);
13233 }
13234 if (isa<PoisonValue>(V1))
13235 return Builder.createPoison(
13236 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
13237 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
13238 assert(V1 && "Expected non-null value after looking through shuffles.");
13239
13240 if (!IsIdentity)
13241 return Builder.createShuffleVector(V1, NewMask);
13242 return Builder.createIdentity(V1);
13243 }
13244
13245 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
13246 /// shuffle emission.
13247 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
13248 ArrayRef<int> Mask) {
13249 for (unsigned I : seq<unsigned>(CommonMask.size()))
13250 if (Mask[I] != PoisonMaskElem)
13251 CommonMask[I] = I;
13252 }
13253};
13254} // namespace
13255
13256/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
// \returns {scalar pointer-chain cost, vector pointer-chain cost}; a pair of
// TCC_Free values when every pointer stays in the vectorized code anyway.
13257static std::pair<InstructionCost, InstructionCost>
// NOTE(review): original line 13258 (carrying the function name, presumably
// getGEPCosts, plus the TTI and Ptrs parameters) is missing from this
// extraction — confirm against the upstream file.
13259 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
13260 Type *ScalarTy, VectorType *VecTy) {
13261 InstructionCost ScalarCost = 0;
13262 InstructionCost VecCost = 0;
13263 // Here we differentiate two cases: (1) when Ptrs represent a regular
13264 // vectorization tree node (as they are pointer arguments of scattered
13265 // loads) or (2) when Ptrs are the arguments of loads or stores being
13266 // vectorized as plane wide unit-stride load/store since all the
13267 // loads/stores are known to be from/to adjacent locations.
13268 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13269 // Case 2: estimate costs for pointer related costs when vectorizing to
13270 // a wide load/store.
13271 // Scalar cost is estimated as a set of pointers with known relationship
13272 // between them.
13273 // For vector code we will use BasePtr as argument for the wide load/store
13274 // but we also need to account all the instructions which are going to
13275 // stay in vectorized code due to uses outside of these scalar
13276 // loads/stores.
13277 ScalarCost = TTI.getPointersChainCost(
13278 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13279 CostKind);
13280
13281 SmallVector<const Value *> PtrsRetainedInVecCode;
13282 for (Value *V : Ptrs) {
13283 if (V == BasePtr) {
13284 PtrsRetainedInVecCode.push_back(V);
13285 continue;
13286 }
13287 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13288 // For simplicity assume Ptr to stay in vectorized code if it's not a
13289 // GEP instruction. We don't care since it's cost considered free.
13290 // TODO: We should check for any uses outside of vectorizable tree
13291 // rather than just single use.
13292 if (!Ptr || !Ptr->hasOneUse())
13293 PtrsRetainedInVecCode.push_back(V);
13294 }
13295
13296 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
13297 // If all pointers stay in vectorized code then we don't have
13298 // any savings on that.
13299 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
13300 }
13301 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
13302 TTI::PointersChainInfo::getKnownStride(),
13303 VecTy, CostKind);
13304 } else {
13305 // Case 1: Ptrs are the arguments of loads that we are going to transform
13306 // into masked gather load intrinsic.
13307 // All the scalar GEPs will be removed as a result of vectorization.
13308 // For any external uses of some lanes extract element instructions will
13309 // be generated (which cost is estimated separately).
13310 TTI::PointersChainInfo PtrsInfo =
13311 all_of(Ptrs,
13312 [](const Value *V) {
13313 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13314 return Ptr && !Ptr->hasAllConstantIndices();
13315 })
13316 ? TTI::PointersChainInfo::getUnknownStride()
13317 : TTI::PointersChainInfo::getKnownStride();
13318
13319 ScalarCost =
13320 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
// If the base is not itself a GEP, fall back to the first GEP among Ptrs to
// model the vector-side address computation.
13321 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
13322 if (!BaseGEP) {
13323 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
13324 if (It != Ptrs.end())
13325 BaseGEP = cast<GEPOperator>(*It);
13326 }
13327 if (BaseGEP) {
13328 SmallVector<const Value *> Indices(BaseGEP->indices());
13329 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
13330 BaseGEP->getPointerOperand(), Indices, VecTy,
13331 CostKind);
13332 }
13333 }
13334
13335 return std::make_pair(ScalarCost, VecCost);
13336}
13337
// Tries to reorder the scalars of a gather node so that "similar" values
// (same key/subkey, e.g. loads from the same base pointer) become adjacent,
// enabling extra vectorized subtrees; the reorder is kept only if its
// estimated shuffle+insert cost beats plain buildvector cost.
// NOTE(review): this extraction is missing several original lines in this
// function (e.g. 13369, 13394-13395, 13403-13404, 13415, 13449-13450, 13454,
// 13473-13474); the hedged notes below mark the gaps — confirm upstream.
13338void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13339 assert(TE.isGather() && TE.ReorderIndices.empty() &&
13340 "Expected gather node without reordering.");
13341 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
13342 SmallSet<size_t, 2> LoadKeyUsed;
13343
13344 // Do not reorder nodes if it small (just 2 elements), all-constant or all
13345 // instructions have same opcode already.
13346 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
13347 all_of(TE.Scalars, isConstant))
13348 return;
13349
// Skip if an earlier tree entry already contains the very same scalars.
13350 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
13351 return VectorizableTree[Idx]->isSame(TE.Scalars);
13352 }))
13353 return;
13354
// Produces a clustering subkey for loads: loads reachable from the same
// pointer chain share a subkey so they land next to each other.
13355 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13356 Key = hash_combine(hash_value(LI->getParent()->getNumber()), Key);
13357 Value *Ptr =
13358 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
13359 if (LoadKeyUsed.contains(Key)) {
13360 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
13361 if (LIt != LoadsMap.end()) {
13362 for (LoadInst *RLI : LIt->second) {
13363 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
13364 LI->getType(), LI->getPointerOperand(), *DL, *SE,
13365 /*StrictCheck=*/true))
13366 return hash_value(RLI->getPointerOperand());
13367 }
13368 for (LoadInst *RLI : LIt->second) {
// NOTE(review): original line 13369 (the condition header of this if,
// presumably an arePointersCompatible check) is missing from this
// extraction.
13370 LI->getPointerOperand(), *TLI)) {
13371 hash_code SubKey = hash_value(RLI->getPointerOperand());
13372 return SubKey;
13373 }
13374 }
13375 if (LIt->second.size() > 2) {
13376 hash_code SubKey =
13377 hash_value(LIt->second.back()->getPointerOperand());
13378 return SubKey;
13379 }
13380 }
13381 }
13382 LoadKeyUsed.insert(Key);
13383 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
13384 return hash_value(LI->getPointerOperand());
13385 };
13386 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13387 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13388 bool IsOrdered = true;
13389 unsigned NumInstructions = 0;
13390 // Try to "cluster" scalar instructions, to be able to build extra vectorized
13391 // nodes.
13392 for (auto [I, V] : enumerate(TE.Scalars)) {
13393 size_t Key = 1, Idx = 1;
13394 if (auto *Inst = dyn_cast<Instruction>(V);
// NOTE(review): original line 13395 (part of this condition) is missing
// from this extraction.
13396 !isDeleted(Inst) && !isVectorized(V)) {
13397 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
13398 /*AllowAlternate=*/false);
13399 ++NumInstructions;
13400 }
13401 auto &Container = SortedValues[Key];
// NOTE(review): original lines 13403-13404 (part of this condition) are
// missing from this extraction.
13402 if (IsOrdered && !KeyToIndex.contains(V) &&
13405 ((Container.contains(Idx) &&
13406 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
13407 (!Container.empty() && !Container.contains(Idx) &&
13408 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
13409 IsOrdered = false;
13410 auto &KTI = KeyToIndex[V];
13411 if (KTI.empty())
13412 Container[Idx].push_back(V);
13413 KTI.push_back(I);
13414 }
// NOTE(review): original line 13415 (presumably declaring the SubVectors
// list used below) is missing from this extraction.
13416 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13417 if (!IsOrdered && NumInstructions > 1) {
13418 unsigned Cnt = 0;
13419 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
13420 for (const auto &D : SortedValues) {
13421 for (const auto &P : D.second) {
13422 unsigned Sz = 0;
13423 for (Value *V : P.second) {
13424 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
13425 for (auto [K, Idx] : enumerate(Indices)) {
13426 TE.ReorderIndices[Cnt + K] = Idx;
13427 TE.Scalars[Cnt + K] = V;
13428 }
13429 Sz += Indices.size();
13430 Cnt += Indices.size();
13431 }
13432 if (Sz > 1 && isa<Instruction>(P.second.front())) {
13433 const unsigned SubVF = getFloorFullVectorNumberOfElements(
13434 *TTI, TE.Scalars.front()->getType(), Sz);
13435 SubVectors.emplace_back(Cnt - Sz, SubVF);
13436 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
13437 DemandedElts.clearBit(I);
13438 } else if (!P.second.empty() && isConstant(P.second.front())) {
13439 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
13440 DemandedElts.clearBit(I);
13441 }
13442 }
13443 }
13444 }
13445 // Reuses always require shuffles, so consider it as profitable.
13446 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
13447 return;
13448 // Do simple cost estimation.
// NOTE(review): original lines 13449-13450 (presumably initializing Cost
// and CostKind used below) are missing from this extraction.
13451 auto *ScalarTy = TE.Scalars.front()->getType();
13452 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
13453 for (auto [Idx, Sz] : SubVectors) {
// NOTE(review): original line 13454 (the getShuffleCost call accumulating
// subvector-insert cost) is missing from this extraction.
13455 Idx, getWidenedType(ScalarTy, Sz));
13456 }
13457 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13458 /*Insert=*/true,
13459 /*Extract=*/false, CostKind);
13460 int Sz = TE.Scalars.size();
13461 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
13462 TE.ReorderIndices.end());
13463 for (unsigned I : seq<unsigned>(Sz)) {
13464 Value *V = TE.getOrdered(I);
13465 if (isa<PoisonValue>(V)) {
13466 ReorderMask[I] = PoisonMaskElem;
13467 } else if (isConstant(V) || DemandedElts[I]) {
13468 ReorderMask[I] = I + TE.ReorderIndices.size();
13469 }
13470 }
// NOTE(review): original lines 13473-13474 (the shuffle-kind operands of
// this ternary) are missing from this extraction.
13471 Cost += ::getShuffleCost(*TTI,
13472 any_of(ReorderMask, [&](int I) { return I >= Sz; })
13475 VecTy, ReorderMask);
13476 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
13477 ReorderMask.assign(Sz, PoisonMaskElem);
13478 for (unsigned I : seq<unsigned>(Sz)) {
13479 Value *V = TE.getOrdered(I);
13480 if (isConstant(V)) {
13481 DemandedElts.clearBit(I);
13482 if (!isa<PoisonValue>(V))
13483 ReorderMask[I] = I;
13484 } else {
13485 ReorderMask[I] = I + Sz;
13486 }
13487 }
// Compare against the straightforward buildvector alternative; keep the
// reorder only if it is at least as cheap.
13488 InstructionCost BVCost =
13489 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
13490 /*Insert=*/true, /*Extract=*/false, CostKind);
13491 if (!DemandedElts.isAllOnes())
13492 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
13493 if (Cost >= BVCost) {
13494 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
13495 reorderScalars(TE.Scalars, Mask);
13496 TE.ReorderIndices.clear();
13497 }
13498}
13499
13500/// Check if we can convert fadd/fsub sequence to FMAD.
13501/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
// NOTE(review): the doc extractor stripped several hyperlinked lines from this
// function: the `canConvertToFMA(ArrayRef<Value *> VL, ...)` signature line
// (13502), the TargetTransformInfo parameter line (13505), the early
// `return InstructionCost::getInvalid();` lines after the checks below
// (13532/13539/13542/13544), and the CostKind declaration (13548). Verify any
// change against the upstream SLPVectorizer.cpp before editing.
13503 const InstructionsState &S,
13504 DominatorTree &DT, const DataLayout &DL,
13506 const TargetLibraryInfo &TLI) {
13507 assert(all_of(VL,
13508 [](Value *V) {
13509 return V->getType()->getScalarType()->isFloatingPointTy();
13510 }) &&
13511 "Can only convert to FMA for floating point types");
13512 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
13513
// Returns true when every relevant lane permits FP contraction: FMF starts
// all-set and is intersected with each lane's fast-math flags, so a single
// lane without 'contract' blocks the conversion. Copyable lanes and lanes
// not matching the main/alt opcode are skipped.
13514 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
13515 FastMathFlags FMF;
13516 FMF.set();
13517 for (Value *V : VL) {
13518 auto *I = dyn_cast<Instruction>(V);
13519 if (!I)
13520 continue;
13521 if (S.isCopyableElement(I))
13522 continue;
13523 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
13524 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13525 continue;
13526 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13527 FMF &= FPCI->getFastMathFlags();
13528 }
13529 return FMF.allowContract();
13530 };
13531 if (!CheckForContractable(VL))
13533 // fmul also should be contractable
13534 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
13535 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
13536
// The first operand vector must be a uniform (non-alternate) fmul sequence
// that is itself contractable; otherwise there is nothing to fuse.
13537 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
13538 if (!OpS.valid())
13540
13541 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13543 if (!CheckForContractable(Operands.front()))
13545 // Compare the costs.
13546 InstructionCost FMulPlusFAddCost = 0;
13547 InstructionCost FMACost = 0;
13549 FastMathFlags FMF;
13550 FMF.set();
// Scalar baseline: sum the cost of each fadd/fsub lane, intersecting the
// fast-math flags of the non-copyable lanes for the fmuladd query below.
13551 for (Value *V : VL) {
13552 auto *I = dyn_cast<Instruction>(V);
13553 if (!I)
13554 continue;
13555 if (!S.isCopyableElement(I))
13556 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13557 FMF &= FPCI->getFastMathFlags();
13558 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13559 }
// Count the lanes that actually fuse: the fmul operand must be a single-use,
// non-copyable instruction. Lanes that cannot fuse are billed to the FMA
// alternative at their plain scalar cost instead.
13560 unsigned NumOps = 0;
13561 for (auto [V, Op] : zip(VL, Operands.front())) {
13562 if (S.isCopyableElement(V))
13563 continue;
13564 auto *I = dyn_cast<Instruction>(Op);
13565 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
13566 if (auto *OpI = dyn_cast<Instruction>(V))
13567 FMACost += TTI.getInstructionCost(OpI, CostKind);
13568 if (I)
13569 FMACost += TTI.getInstructionCost(I, CostKind);
13570 continue;
13571 }
13572 ++NumOps;
13573 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
13574 FMF &= FPCI->getFastMathFlags();
13575 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
13576 }
13577 Type *Ty = VL.front()->getType();
13578 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13579 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
// Fuse only when strictly cheaper than the unfused fmul+fadd sequence.
13580 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13581}
13582
// Matches a root Shl tree entry of the form (zext i<Stride> %x_i to i<Sz>)
// << (i * Stride), whose lanes — when or-reduced by the user — reassemble a
// wide integer from narrow pieces. When matched, the whole pattern can be
// modeled as a bitcast of the narrow source vector to a wide scalar integer
// (optionally combined with a bswap for reversed byte order, or folded into
// a single wide scalar load), and the function compares that cost against
// the vectorized shl+zext+or-reduction. \p Order receives the lane
// permutation needed on the source vector (empty if already in order);
// \p IsBSwap / \p ForLoads report which lowering variant won.
// NOTE(review): two hyperlinked lines were stripped by the doc extractor:
// line 13663 (presumably the local CostKind declaration) and line 13742
// (presumably the cast-context/CostKind arguments of getCastInstrCost).
// Verify against upstream SLPVectorizer.cpp before editing.
13583bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
13584 bool &IsBSwap, bool &ForLoads) const {
13585 assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
13586 "Expected Shl node.");
13587 IsBSwap = false;
13588 ForLoads = false;
// The root must be a plain vectorized node: no reordering/reuse, no
// minimum-bitwidth demotion, and every scalar single-use (it feeds only
// the reduction).
13589 if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
13590 !TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
13591 any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
13592 return false;
13593 Type *ScalarTy = TE.getMainOp()->getType();
13594 // TODO: Check if same can be done for the vector types.
13595 if (!ScalarTy->isIntegerTy())
13596 return false;
13597 if (ScalarTy->isVectorTy())
13598 return false;
13599 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
13600 const TreeEntry *LhsTE = getOperandEntry(&TE, /*Idx=*/0);
13601 const TreeEntry *RhsTE = getOperandEntry(&TE, /*Idx=*/1);
13602 // Lhs should be zext i<stride> to I<sz>.
13603 if (!(LhsTE->State == TreeEntry::Vectorize &&
13604 LhsTE->getOpcode() == Instruction::ZExt &&
13605 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
13606 !MinBWs.contains(LhsTE) &&
13607 all_of(LhsTE->Scalars, [](Value *V) { return V->hasOneUse(); })))
13608 return false;
13609 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
13610 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
// Each piece must be a power-of-two width that tiles the destination
// integer exactly; the source vector factor must also be a power of two.
13611 if (!isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
13612 !isPowerOf2_64(LhsTE->getVectorFactor()))
13613 return false;
// The shift amounts must come from a gather of constants.
13614 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
13615 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
13616 return false;
13617 Order.clear();
13618 unsigned CurrentValue = 0;
13619 // Rhs should be (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
13620 if (all_of(RhsTE->Scalars,
13621 [&](Value *V) {
13622 CurrentValue += Stride;
13623 if (isa<UndefValue>(V))
13624 return true;
13625 auto *C = dyn_cast<Constant>(V);
13626 if (!C)
13627 return false;
13628 return C->getUniqueInteger() == CurrentValue - Stride;
13629 }) &&
13630 CurrentValue <= Sz) {
13631 Order.clear();
13632 } else {
// Shift amounts are not already sequential: build the permutation that
// sorts each lane to its logical position (shift / Stride).
13633 const unsigned VF = RhsTE->getVectorFactor();
13634 Order.assign(VF, VF);
13635 // Track which logical positions we've seen; reject duplicate shift amounts.
13636 SmallBitVector SeenPositions(VF);
13637 // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
13638 // ..., N-Stride), where N <= Sz.
13639 if (VF * Stride > Sz)
13640 return false;
13641 for (const auto [Idx, V] : enumerate(RhsTE->Scalars)) {
13642 if (isa<UndefValue>(V))
13643 continue;
13644 auto *C = dyn_cast<Constant>(V);
13645 if (!C)
13646 return false;
13647 const APInt &Val = C->getUniqueInteger();
13648 if (Val.isNegative() || Val.uge(Sz) || Val.getZExtValue() % Stride != 0)
13649 return false;
13650 unsigned Pos = Val.getZExtValue() / Stride;
13651 // TODO: Support Pos >= VF, in this case need to shift the final value.
13652 if (Order[Idx] != VF || Pos >= VF)
13653 return false;
13654 if (SeenPositions.test(Pos))
13655 return false;
13656 SeenPositions.set(Pos);
13657 Order[Idx] = Pos;
13658 }
13659 // One of the indices not set - exit.
13660 if (is_contained(Order, VF))
13661 return false;
13662 }
// Cost comparison: vectorized shl+zext+or-reduction vs. bitcast of the
// narrow source vector to one wide integer (plus a shuffle if reordered).
13664 auto *SrcType = IntegerType::getIntNTy(ScalarTy->getContext(),
13665 Stride * LhsTE->getVectorFactor());
13666 FastMathFlags FMF;
13667 SmallPtrSet<Value *, 4> CheckedExtracts;
13668 auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
13669 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
13670 TTI::CastContextHint CastCtx =
13671 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
13672 InstructionCost VecCost =
13673 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind) +
13674 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy, CostKind,
13675 getOperandInfo(LhsTE->Scalars)) +
13676 TTI->getCastInstrCost(
13677 Instruction::ZExt, VecTy,
13678 getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()), CastCtx,
13679 CostKind);
13680 InstructionCost BitcastCost = TTI->getCastInstrCost(
13681 Instruction::BitCast, SrcType, SrcVecTy, CastCtx, CostKind);
13682 if (!Order.empty()) {
13683 fixupOrderingIndices(Order);
13684 SmallVector<int> Mask;
13685 inversePermutation(Order, Mask);
13686 BitcastCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, SrcVecTy,
13687 Mask, CostKind);
13688 }
13689 // Check if the combination can be modeled as a bitcast+byteswap operation.
13690 constexpr unsigned ByteSize = 8;
13691 if (!Order.empty() && isReverseOrder(Order) &&
13692 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
13693 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
13694 InstructionCost BSwapCost =
13695 TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
13696 CostKind) +
13697 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
13698 if (BSwapCost <= BitcastCost) {
13699 BitcastCost = BSwapCost;
13700 IsBSwap = true;
13701 Order.clear();
13702 // Check for loads in the ZExt node.
13703 const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
13704 if (SrcTE->State == TreeEntry::Vectorize &&
13705 SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
13706 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
13707 all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
// The sources are adjacent byte loads: model as one wide scalar
// load + bswap, and bill the vector load to the vectorized side.
13708 auto *LI = cast<LoadInst>(SrcTE->getMainOp());
13709 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
13710 InstructionCost BSwapCost =
13711 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
13712 LI->getPointerAddressSpace(), CostKind) +
13713 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
13714 if (BSwapCost <= BitcastCost) {
13715 VecCost +=
13716 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
13717 LI->getPointerAddressSpace(), CostKind);
13718 BitcastCost = BSwapCost;
13719 ForLoads = true;
13720 }
13721 }
13722 }
13723 } else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
13724 // Check for loads in the ZExt node.
13725 const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
13726 if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
13727 SrcTE->ReuseShuffleIndices.empty() &&
13728 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
13729 all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
// In-order byte loads: the whole pattern is just one wide scalar load.
13730 auto *LI = cast<LoadInst>(SrcTE->getMainOp());
13731 BitcastCost =
13732 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
13733 LI->getPointerAddressSpace(), CostKind);
13734 VecCost +=
13735 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
13736 LI->getPointerAddressSpace(), CostKind);
13737 ForLoads = true;
13738 }
13739 }
// If the reassembled integer is narrower than the destination type, a final
// zext is still needed on the scalar side.
13740 if (SrcType != ScalarTy) {
13741 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
13743 }
13744 return BitcastCost < VecCost;
13745}
13746
// Matches a vectorized select node whose compare operand is alternate-
// vectorized only because some lanes use the inverse predicate of the main
// compare. If swapping the true/false gather operands in those lanes turns
// the compares into a single uniform predicate, and a vector compare is
// cheaper than rebuilding the compares as a buildvector, returns true and
// records the lanes to invert in \p InversedCmpsIndices.
// NOTE(review): the doc extractor stripped two hyperlinked lines: line 13752
// (presumably the declaration of the local `ZExts` container of
// {Instruction*, index} pairs used below) and line 13804 (presumably the
// local CostKind declaration). Verify against upstream SLPVectorizer.cpp.
13747bool BoUpSLP::matchesInversedZExtSelect(
13748 const TreeEntry &SelectTE,
13749 SmallVectorImpl<unsigned> &InversedCmpsIndices) const {
13750 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
13751 "Expected select node.");
// Collect the lanes that are actually zext instructions; the transform only
// pays off when at least one lane is a zext of the compare result.
13753 for (auto [Idx, V] : enumerate(SelectTE.Scalars)) {
13754 auto *Inst = dyn_cast<Instruction>(V);
13755 if (!Inst || Inst->getOpcode() != Instruction::ZExt)
13756 continue;
13757 ZExts.emplace_back(Inst, Idx);
13758 }
13759 if (ZExts.empty())
13760 return false;
13761 const auto *CmpTE = getOperandEntry(&SelectTE, 0);
13762 const auto *Op1TE = getOperandEntry(&SelectTE, 1);
13763 const auto *Op2TE = getOperandEntry(&SelectTE, 2);
13764 // Compares must be alternate vectorized, and other operands must be gathers
13765 // or copyables.
13766 // TODO: investigate opportunity for reordered/reused nodes.
13767 if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
13768 (CmpTE->getOpcode() != Instruction::ICmp &&
13769 CmpTE->getOpcode() != Instruction::FCmp) ||
13770 !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
13771 !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
13772 !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
13773 return false;
13774 // The operands must be buildvectors/copyables.
13775 if (!Op1TE->isGather() || !Op2TE->isGather())
13776 return false;
13777 // TODO: investigate opportunity for the vector nodes with copyables.
13778 auto *Cmp = CmpTE->getMainOp();
13779 CmpPredicate Pred;
13780 auto MatchCmp = m_Cmp(Pred, m_Value(), m_Value());
13781 if (!match(Cmp, MatchCmp))
13782 return false;
13783 CmpPredicate MainPred = Pred;
13784 CmpPredicate InversedPred(CmpInst::getInversePredicate(Pred),
13785 Pred.hasSameSign());
// Every lane must use either the main predicate or exactly its inverse;
// inverse lanes must be single-use (their select arms get swapped).
13786 for (const auto [Idx, V] : enumerate(CmpTE->Scalars)) {
13787 if (!match(V, MatchCmp))
13788 continue;
13789 if (CmpPredicate::getMatching(MainPred, Pred))
13790 continue;
13791 if (!CmpPredicate::getMatching(InversedPred, Pred))
13792 return false;
13793 if (!V->hasOneUse())
13794 return false;
13795 InversedCmpsIndices.push_back(Idx);
13796 }
13797
13798 if (InversedCmpsIndices.empty())
13799 return false;
// Profitability: one uniform vector compare vs. scalar compares plus a
// buildvector of their results.
13800 VectorType *VecTy =
13801 getWidenedType(Cmp->getOperand(0)->getType(), CmpTE->getVectorFactor());
13802 Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
13803
13805 InstructionCost VecCost =
13806 TTI->getCmpSelInstrCost(CmpTE->getOpcode(), VecTy, CmpTy, MainPred,
13807 CostKind, getOperandInfo(CmpTE->getOperand(0)),
13808 getOperandInfo(CmpTE->getOperand(1)));
13809 InstructionCost BVCost =
13810 ::getScalarizationOverhead(*TTI, Cmp->getType(), cast<VectorType>(CmpTy),
13811 APInt::getAllOnes(CmpTE->getVectorFactor()),
13812 /*Insert=*/true, /*Extract=*/false, CostKind);
13813 for (Value *V : CmpTE->Scalars) {
13814 auto *I = dyn_cast<Instruction>(V);
13815 if (!I)
13816 continue;
13817 BVCost += TTI->getInstructionCost(I, CostKind);
13818 }
13819 return VecCost < BVCost;
13820}
13821
// Matches an or-reduced select node of the form
//   reduce.or(select(cond_i, 1 << i, 0))
// on a little-endian target: the reduction packs one condition bit per lane,
// which is exactly a bitcast of the <N x i1> condition mask to an iN integer
// (plus a zext if the result type is wider). Returns true when that bitcast
// lowering is no more expensive than the vector select + or-reduction.
// NOTE(review): the doc extractor stripped three hyperlinked lines: 13870
// (presumably the local CostKind declaration) and 13875/13880 (presumably
// trailing CostKind / predicate arguments of the two TTI calls). Verify
// against upstream SLPVectorizer.cpp before editing.
13822bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
13823 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
13824 "Expected select node.");
// The bit-packing trick assumes lane i maps to bit i — little-endian only.
13825 if (DL->isBigEndian())
13826 return false;
13827 if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
13828 return false;
// Only applies to reduction roots (UserIgnoreList holds the reduction ops).
13829 if (!UserIgnoreList)
13830 return false;
13831 if (any_of(SelectTE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
13832 return false;
13833 // Check that all reduction operands are or instructions.
13834 if (any_of(*UserIgnoreList,
13835 [](Value *V) { return !match(V, m_Or(m_Value(), m_Value())); }))
13836 return false;
13837 const TreeEntry *Op1TE = getOperandEntry(&SelectTE, 1);
13838 const TreeEntry *Op2TE = getOperandEntry(&SelectTE, 2);
13839 if (!Op1TE->isGather() || !Op2TE->isGather())
13840 return false;
13841 // No need to check for zeroes reordering.
13842 if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
13843 !Op2TE->ReuseShuffleIndices.empty())
13844 return false;
13845 Type *ScalarTy = Op1TE->Scalars.front()->getType();
13846 if (!ScalarTy->isIntegerTy())
13847 return false;
13848 // Check that second operand is all zeroes.
13849 if (any_of(Op2TE->Scalars, [](Value *V) { return !match(V, m_ZeroInt()); }))
13850 return false;
13851 // Check that first operand is 1,2,4,...
13852 if (any_of(enumerate(Op1TE->Scalars), [](const auto &P) {
13853 uint64_t V;
13854 return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(V) &&
13855 Log2_64(V) == P.index());
13856 }))
13857 return false;
13858 // Check if bitcast is cheaper than select.
13859 auto *DstTy = IntegerType::getIntNTy(ScalarTy->getContext(),
13860 SelectTE.getVectorFactor());
13861 VectorType *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor());
13862 Type *CmpTy = CmpInst::makeCmpResultType(OpTy);
13863 VectorType *VecTy = getWidenedType(ScalarTy, SelectTE.getVectorFactor());
// Honor any minimum-bitwidth demotion recorded for this node when costing
// the select alternative.
13864 auto It = MinBWs.find(&SelectTE);
13865 if (It != MinBWs.end()) {
13866 auto *EffectiveScalarTy =
13867 IntegerType::get(F->getContext(), It->second.first);
13868 VecTy = getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor());
13869 }
13871 InstructionCost BitcastCost = TTI->getCastInstrCost(
13872 Instruction::BitCast, DstTy, CmpTy, TTI::CastContextHint::None, CostKind);
13873 if (DstTy != ScalarTy) {
13874 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
13876 }
13877 FastMathFlags FMF;
13878 InstructionCost SelectCost =
13879 TTI->getCmpSelInstrCost(Instruction::Select, VecTy, CmpTy,
13881 getOperandInfo(Op1TE->Scalars),
13882 getOperandInfo(Op2TE->Scalars)) +
13883 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind);
13884 return BitcastCost <= SelectCost;
13885}
13886
13889 BaseGraphSize = VectorizableTree.size();
13890 // Turn graph transforming mode on and off, when done.
13891 class GraphTransformModeRAAI {
13892 bool &SavedIsGraphTransformMode;
13893
13894 public:
13895 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13896 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13897 IsGraphTransformMode = true;
13898 }
13899 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13900 } TransformContext(IsGraphTransformMode);
13901 // Operands are profitable if they are:
13902 // 1. At least one constant
13903 // or
13904 // 2. Splats
13905 // or
13906 // 3. Results in good vectorization opportunity, i.e. may generate vector
13907 // nodes and reduce cost of the graph.
13908 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13909 const InstructionsState &S) {
13911 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
13912 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
13913 I2->getOperand(Op));
13914 return all_of(Candidates, [this](
13915 ArrayRef<std::pair<Value *, Value *>> Cand) {
13916 return all_of(Cand,
13917 [](const std::pair<Value *, Value *> &P) {
13918 return isa<Constant>(P.first) ||
13919 isa<Constant>(P.second) || P.first == P.second;
13920 }) ||
13922 });
13923 };
13924
13925 // Try to reorder gather nodes for better vectorization opportunities.
13926 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13927 TreeEntry &E = *VectorizableTree[Idx];
13928 if (E.isGather())
13929 reorderGatherNode(E);
13930 }
13931
13932 // Better to use full gathered loads analysis, if there are only 2 loads
13933 // gathered nodes each having less than 16 elements.
13934 constexpr unsigned VFLimit = 16;
13935 bool ForceLoadGather =
13936 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13937 return TE->isGather() && TE->hasState() &&
13938 TE->getOpcode() == Instruction::Load &&
13939 TE->getVectorFactor() < VFLimit;
13940 }) == 2;
13941
13942 // Checks if the scalars are used in other node.
13943 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13944 function_ref<bool(Value *)> CheckContainer) {
13945 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
13946 if (isa<PoisonValue>(V))
13947 return true;
13948 auto *I = dyn_cast<Instruction>(V);
13949 if (!I)
13950 return false;
13951 return is_contained(TE->Scalars, I) || CheckContainer(I);
13952 });
13953 };
13954 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13955 if (E.hasState()) {
13956 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
13957 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13958 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13959 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13960 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13961 return is_contained(TEs, TE);
13962 });
13963 });
13964 }))
13965 return true;
13966 ;
13967 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
13968 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13969 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13970 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13971 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13972 return is_contained(TEs, TE);
13973 });
13974 });
13975 }))
13976 return true;
13977 } else {
13978 // Check if the gather node full copy of split node.
13979 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13980 if (It != E.Scalars.end()) {
13981 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13982 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13983 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13984 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13985 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13986 return is_contained(TEs, TE);
13987 });
13988 });
13989 }))
13990 return true;
13991 }
13992 }
13993 return false;
13994 };
13995 // The tree may grow here, so iterate over nodes, built before.
13996 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13997 TreeEntry &E = *VectorizableTree[Idx];
13998 if (E.isGather()) {
13999 ArrayRef<Value *> VL = E.Scalars;
14000 const unsigned Sz = getVectorElementSize(VL.front());
14001 unsigned MinVF = getMinVF(2 * Sz);
14002 // Do not try partial vectorization for small nodes (<= 2), nodes with the
14003 // same opcode and same parent block or all constants.
14004 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
14005 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
14006 // We use allSameOpcode instead of isAltShuffle because we don't
14007 // want to use interchangeable instruction here.
14008 !allSameOpcode(VL) || !allSameBlock(VL)) ||
14009 allConstant(VL) || isSplat(VL))
14010 continue;
14011 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
14012 continue;
14013 // Check if the node is a copy of other vector nodes.
14014 if (CheckForSameVectorNodes(E))
14015 continue;
14016 // Try to find vectorizable sequences and transform them into a series of
14017 // insertvector instructions.
14018 unsigned StartIdx = 0;
14019 unsigned End = VL.size();
14020 SmallBitVector Processed(End);
14021 for (unsigned VF = getFloorFullVectorNumberOfElements(
14022 *TTI, VL.front()->getType(), VL.size() - 1);
14023 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
14024 *TTI, VL.front()->getType(), VF - 1)) {
14025 if (StartIdx + VF > End)
14026 continue;
14028 bool AllStrided = true;
14029 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
14030 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
14031 // If any instruction is vectorized already - do not try again.
14032 // Reuse the existing node, if it fully matches the slice.
14033 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
14034 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
14035 continue;
14036 // Constant already handled effectively - skip.
14037 if (allConstant(Slice))
14038 continue;
14039 // Do not try to vectorize small splats (less than vector register and
14040 // only with the single non-undef element).
14041 bool IsSplat = isSplat(Slice);
14042 bool IsTwoRegisterSplat = true;
14043 if (IsSplat && VF == 2) {
14044 unsigned NumRegs2VF = ::getNumberOfParts(
14045 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
14046 IsTwoRegisterSplat = NumRegs2VF == 2;
14047 }
14048 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
14049 count(Slice, Slice.front()) ==
14050 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
14051 : 1)) {
14052 if (IsSplat)
14053 continue;
14054 InstructionsState S = getSameOpcode(Slice, *TLI);
14055 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
14056 (S.getOpcode() == Instruction::Load &&
14058 (S.getOpcode() != Instruction::Load &&
14059 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
14060 continue;
14061 if (VF == 2) {
14062 // Try to vectorize reduced values or if all users are vectorized.
14063 // For expensive instructions extra extracts might be profitable.
14064 if ((!UserIgnoreList || E.Idx != 0) &&
14065 TTI->getInstructionCost(S.getMainOp(), CostKind) <
14067 !all_of(Slice, [&](Value *V) {
14068 if (isa<PoisonValue>(V))
14069 return true;
14070 return areAllUsersVectorized(cast<Instruction>(V),
14071 UserIgnoreList);
14072 }))
14073 continue;
14074 if (S.getOpcode() == Instruction::Load) {
14075 OrdersType Order;
14076 SmallVector<Value *> PointerOps;
14077 StridedPtrInfo SPtrInfo;
14078 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
14079 PointerOps, SPtrInfo);
14080 AllStrided &= Res == LoadsState::StridedVectorize ||
14082 Res == LoadsState::Gather;
14083 // Do not vectorize gathers.
14084 if (Res == LoadsState::ScatterVectorize ||
14085 Res == LoadsState::Gather) {
14086 if (Res == LoadsState::Gather) {
14088 // If reductions and the scalars from the root node are
14089 // analyzed - mark as non-vectorizable reduction.
14090 if (UserIgnoreList && E.Idx == 0)
14091 analyzedReductionVals(Slice);
14092 }
14093 continue;
14094 }
14095 } else if (S.getOpcode() == Instruction::ExtractElement ||
14096 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
14098 !CheckOperandsProfitability(
14099 S.getMainOp(),
14102 S))) {
14103 // Do not vectorize extractelements (handled effectively
14104 // alread). Do not vectorize non-profitable instructions (with
14105 // low cost and non-vectorizable operands.)
14106 continue;
14107 }
14108 }
14109 }
14110 Slices.emplace_back(Cnt, Slice.size());
14111 }
14112 // Do not try to vectorize if all slides are strided or gathered with
14113 // vector factor 2 and there are more than 2 slices. Better to handle
14114 // them in gathered loads analysis, may result in better vectorization.
14115 if (VF == 2 && AllStrided && Slices.size() > 2)
14116 continue;
14117 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
14118 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
14119 Processed.set(Cnt, Cnt + Sz);
14120 if (StartIdx == Cnt)
14121 StartIdx = Cnt + Sz;
14122 if (End == Cnt + Sz)
14123 End = Cnt;
14124 };
14125 for (auto [Cnt, Sz] : Slices) {
14126 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
14127 const TreeEntry *SameTE = nullptr;
14128 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
14129 It != Slice.end()) {
14130 // If any instruction is vectorized already - do not try again.
14131 SameTE = getSameValuesTreeEntry(*It, Slice);
14132 }
14133 unsigned PrevSize = VectorizableTree.size();
14134 [[maybe_unused]] unsigned PrevEntriesSize =
14135 LoadEntriesToVectorize.size();
14136 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
14137 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
14138 VectorizableTree[PrevSize]->isGather() &&
14139 VectorizableTree[PrevSize]->hasState() &&
14140 VectorizableTree[PrevSize]->getOpcode() !=
14141 Instruction::ExtractElement &&
14142 !isSplat(Slice)) {
14143 if (UserIgnoreList && E.Idx == 0 && VF == 2)
14144 analyzedReductionVals(Slice);
14145 VectorizableTree.pop_back();
14146 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
14147 "LoadEntriesToVectorize expected to remain the same");
14148 continue;
14149 }
14150 AddCombinedNode(PrevSize, Cnt, Sz);
14151 }
14152 }
14153 // Restore ordering, if no extra vectorization happened.
14154 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
14155 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
14156 reorderScalars(E.Scalars, Mask);
14157 E.ReorderIndices.clear();
14158 }
14159 }
14160 if (!E.hasState())
14161 continue;
14162 switch (E.getOpcode()) {
14163 case Instruction::Load: {
14164 // No need to reorder masked gather loads, just reorder the scalar
14165 // operands.
14166 if (E.State != TreeEntry::Vectorize)
14167 break;
14168 Type *ScalarTy = E.getMainOp()->getType();
14169 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
14170 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
14171 // Check if profitable to represent consecutive load + reverse as strided
14172 // load with stride -1.
14173 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
14174 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14175 SmallVector<int> Mask;
14176 inversePermutation(E.ReorderIndices, Mask);
14177 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
14178 InstructionCost OriginalVecCost =
14179 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
14180 BaseLI->getPointerAddressSpace(), CostKind,
14182 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
14183 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
14184 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
14185 VecTy, BaseLI->getPointerOperand(),
14186 /*VariableMask=*/false, CommonAlignment,
14187 BaseLI),
14188 CostKind);
14189 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
14190 // Strided load is more profitable than consecutive load + reverse -
14191 // transform the node to strided load.
14192 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
14193 ->getPointerOperand()
14194 ->getType());
14195 StridedPtrInfo SPtrInfo;
14196 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
14197 SPtrInfo.Ty = VecTy;
14198 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
14199 E.State = TreeEntry::StridedVectorize;
14200 }
14201 }
14202 break;
14203 }
14204 case Instruction::Store: {
14205 Type *ScalarTy =
14206 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
14207 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
14208 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
14209 // Check if profitable to represent consecutive load + reverse as strided
14210 // load with stride -1.
14211 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
14212 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14213 SmallVector<int> Mask;
14214 inversePermutation(E.ReorderIndices, Mask);
14215 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
14216 InstructionCost OriginalVecCost =
14217 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
14218 BaseSI->getPointerAddressSpace(), CostKind,
14220 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
14221 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
14222 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
14223 VecTy, BaseSI->getPointerOperand(),
14224 /*VariableMask=*/false, CommonAlignment,
14225 BaseSI),
14226 CostKind);
14227 if (StridedCost < OriginalVecCost)
14228 // Strided store is more profitable than reverse + consecutive store -
14229 // transform the node to strided store.
14230 E.State = TreeEntry::StridedVectorize;
14231 } else if (!E.ReorderIndices.empty()) {
14232 // Check for interleaved stores.
14233 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
14234 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
14235 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
14236 if (Mask.size() < 4)
14237 return 0u;
14238 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
14240 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
14241 TTI.isLegalInterleavedAccessType(
14242 VecTy, Factor, BaseSI->getAlign(),
14243 BaseSI->getPointerAddressSpace()))
14244 return Factor;
14245 }
14246
14247 return 0u;
14248 };
14249 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
14250 unsigned InterleaveFactor = IsInterleaveMask(Mask);
14251 if (InterleaveFactor != 0)
14252 E.setInterleave(InterleaveFactor);
14253 }
14254 break;
14255 }
14256 case Instruction::Select: {
14257 if (E.State != TreeEntry::Vectorize)
14258 break;
14259 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
14260 if (MinMaxID != Intrinsic::not_intrinsic) {
14261 // This node is a minmax node.
14262 E.CombinedOp = TreeEntry::MinMax;
14263 TreeEntry *CondEntry = getOperandEntry(&E, 0);
14264 if (SelectOnly && CondEntry->UserTreeIndex &&
14265 CondEntry->State == TreeEntry::Vectorize) {
14266 // The condition node is part of the combined minmax node.
14267 CondEntry->State = TreeEntry::CombinedVectorize;
14268 }
14269 break;
14270 }
14271 // Check for zext + selects, which can be reordered.
14272 SmallVector<unsigned> InversedCmpsIndices;
14273 if (matchesInversedZExtSelect(E, InversedCmpsIndices)) {
14274 auto *CmpTE = getOperandEntry(&E, 0);
14275 auto *Op1TE = getOperandEntry(&E, 1);
14276 auto *Op2TE = getOperandEntry(&E, 2);
14277 // State now is uniform, not alternate opcode.
14278 CmpTE->setOperations(
14279 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
14280 // Update mapping between the swapped values and their internal matching
14281 // nodes.
14282 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
14283 Value *V) {
14284 if (isConstant(V))
14285 return;
14286 auto It = ValueToGatherNodes.find(V);
14287 assert(It != ValueToGatherNodes.end() &&
14288 "Expected to find the value in the map.");
14289 auto &C = It->getSecond();
14290 if (!is_contained(OldTE->Scalars, V))
14291 C.remove(OldTE);
14292 C.insert(NewTE);
14293 };
14294 ValueList &Op1 = E.getOperand(1);
14295 ValueList &Op2 = E.getOperand(2);
14296 for (const unsigned Idx : InversedCmpsIndices) {
14297 Value *V1 = Op1TE->Scalars[Idx];
14298 Value *V2 = Op2TE->Scalars[Idx];
14299 std::swap(Op1TE->Scalars[Idx], Op2TE->Scalars[Idx]);
14300 std::swap(Op1[Idx], Op2[Idx]);
14301 UpdateGatherEntry(Op1TE, Op2TE, V1);
14302 UpdateGatherEntry(Op2TE, Op1TE, V2);
14303 }
14304 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&E, 1), Op1TE);
14305 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&E, 2), Op2TE);
14306 // NB: Fallback to check if select can be converted to cmp bitcast.
14307 }
14308 if (matchesSelectOfBits(E)) {
14309 // This node is a (reduced or) cmp bitcast node.
14310 const TreeEntry::CombinedOpcode Code = TreeEntry::ReducedCmpBitcast;
14311 E.CombinedOp = Code;
14312 auto *Op1TE = getOperandEntry(&E, 1);
14313 auto *Op2TE = getOperandEntry(&E, 2);
14314 Op1TE->State = TreeEntry::CombinedVectorize;
14315 Op1TE->CombinedOp = Code;
14316 Op2TE->State = TreeEntry::CombinedVectorize;
14317 Op2TE->CombinedOp = Code;
14318 break;
14319 }
14320 break;
14321 }
14322 case Instruction::FSub:
14323 case Instruction::FAdd: {
14324 // Check if possible to convert (a*b)+c to fma.
14325 if (E.State != TreeEntry::Vectorize ||
14326 !E.getOperations().isAddSubLikeOp())
14327 break;
14328 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
14329 .isValid())
14330 break;
14331 // This node is a fmuladd node.
14332 E.CombinedOp = TreeEntry::FMulAdd;
14333 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
14334 if (FMulEntry->UserTreeIndex &&
14335 FMulEntry->State == TreeEntry::Vectorize) {
14336 // The FMul node is part of the combined fmuladd node.
14337 FMulEntry->State = TreeEntry::CombinedVectorize;
14338 }
14339 break;
14340 }
14341 case Instruction::Shl: {
14342 if (E.Idx != 0 || DL->isBigEndian())
14343 break;
14344 if (!UserIgnoreList)
14345 break;
14346 // Check that all reduction operands are disjoint or instructions.
14347 if (any_of(*UserIgnoreList, [](Value *V) {
14348 return !match(V, m_DisjointOr(m_Value(), m_Value()));
14349 }))
14350 break;
14351 OrdersType Order;
14352 bool IsBSwap;
14353 bool ForLoads;
14354 if (!matchesShlZExt(E, Order, IsBSwap, ForLoads))
14355 break;
14356 // This node is a (reduced disjoint or) bitcast node.
14357 TreeEntry::CombinedOpcode Code =
14358 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
14359 : TreeEntry::ReducedBitcastBSwap)
14360 : (ForLoads ? TreeEntry::ReducedBitcastLoads
14361 : TreeEntry::ReducedBitcast);
14362 E.CombinedOp = Code;
14363 E.ReorderIndices = std::move(Order);
14364 TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
14365 assert(ZExtEntry->UserTreeIndex &&
14366 ZExtEntry->State == TreeEntry::Vectorize &&
14367 ZExtEntry->getOpcode() == Instruction::ZExt &&
14368 "Expected ZExt node.");
14369 // The ZExt node is part of the combined node.
14370 ZExtEntry->State = TreeEntry::CombinedVectorize;
14371 ZExtEntry->CombinedOp = Code;
14372 if (ForLoads) {
14373 TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
14374 assert(LoadsEntry->UserTreeIndex &&
14375 LoadsEntry->State == TreeEntry::Vectorize &&
14376 LoadsEntry->getOpcode() == Instruction::Load &&
14377 "Expected Load node.");
14378 // The Load node is part of the combined node.
14379 LoadsEntry->State = TreeEntry::CombinedVectorize;
14380 LoadsEntry->CombinedOp = Code;
14381 }
14382 TreeEntry *ConstEntry = getOperandEntry(&E, 1);
14383 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
14384 "Expected ZExt node.");
14385 // The ConstNode node is part of the combined node.
14386 ConstEntry->State = TreeEntry::CombinedVectorize;
14387 ConstEntry->CombinedOp = Code;
14388 break;
14389 }
14390 default:
14391 break;
14392 }
14393 }
14394
14395 if (LoadEntriesToVectorize.empty()) {
14396 // Single load node - exit.
14397 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
14398 VectorizableTree.front()->getOpcode() == Instruction::Load)
14399 return;
14400 // Small graph with small VF - exit.
14401 constexpr unsigned SmallTree = 3;
14402 constexpr unsigned SmallVF = 2;
14403 if ((VectorizableTree.size() <= SmallTree &&
14404 VectorizableTree.front()->Scalars.size() == SmallVF) ||
14405 (VectorizableTree.size() <= 2 && UserIgnoreList))
14406 return;
14407
14408 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14409 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
14410 getCanonicalGraphSize() <= SmallTree &&
14411 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
14412 [](const std::unique_ptr<TreeEntry> &TE) {
14413 return TE->isGather() && TE->hasState() &&
14414 TE->getOpcode() == Instruction::Load &&
14415 !allSameBlock(TE->Scalars);
14416 }) == 1)
14417 return;
14418 }
14419
14420 // A list of loads to be gathered during the vectorization process. We can
14421 // try to vectorize them at the end, if profitable.
14422 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
14424 GatheredLoads;
14425
14426 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14427 TreeEntry &E = *TE;
14428 if (E.isGather() &&
14429 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
14430 (!E.hasState() && any_of(E.Scalars,
14431 [&](Value *V) {
14432 return isa<LoadInst>(V) &&
14433 !isVectorized(V) &&
14434 !isDeleted(cast<Instruction>(V));
14435 }))) &&
14436 !isSplat(E.Scalars)) {
14437 for (Value *V : E.Scalars) {
14438 auto *LI = dyn_cast<LoadInst>(V);
14439 if (!LI)
14440 continue;
14441 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
14442 continue;
14444 *this, V, *DL, *SE, *TTI,
14445 GatheredLoads[std::make_tuple(
14446 LI->getParent(),
14447 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
14448 LI->getType())]);
14449 }
14450 }
14451 }
14452 // Try to vectorize gathered loads if this is not just a gather of loads.
14453 if (!GatheredLoads.empty())
14454 tryToVectorizeGatheredLoads(GatheredLoads);
14455}
14456
14457/// Merges shuffle masks and emits the final shuffle instruction, if required.
14458/// It supports shuffling of 2 input vectors. It implements lazy shuffle
14459/// emission: the actual shuffle instruction is generated only if it is
14460/// actually required. Otherwise, the shuffle instruction emission is delayed
14461/// till the end of the process, to reduce the number of emitted instructions
14462/// and to simplify further analysis/transformations.
14463class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  // Set once finalize() has been called; guards against double finalization.
14464 bool IsFinalized = false;
  // Accumulated mask of the (possibly still virtual, not yet costed) shuffle
  // over the current input vectors.
14465 SmallVector<int> CommonMask;
  // NOTE(review): this listing omits source line 14466 — presumably the
  // declaration of the InVectors list (two-slot list of Value*/TreeEntry*
  // inputs) used throughout the methods below; verify against upstream.
14467 const TargetTransformInfo &TTI;
  // Running cost of all shuffles/inserts/extracts accounted so far.
14468 InstructionCost Cost = 0;
  // Scalars known to be vectorized externally; used to take credit for
  // instructions that will become dead.
14469 SmallDenseSet<Value *> VectorizedVals;
14470 BoUpSLP &R;
  // Extractelements whose cost adjustment has already been performed.
14471 SmallPtrSetImpl<Value *> &CheckedExtracts;
14472 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14473 /// While set, still trying to estimate the cost for the same nodes and we
14474 /// can delay actual cost estimation (virtual shuffle instruction emission).
14475 /// May help better estimate the cost if same nodes must be permuted + allows
14476 /// to move most of the long shuffles cost estimation to TTI.
14477 bool SameNodesEstimated = true;
14478
 // Returns an all-ones constant of type \p Ty. For pointer (or vector of
 // pointer) element types a plain all-ones constant does not exist, so an
 // all-ones integer of the pointer's store size is built and converted to the
 // pointer type (splatted for vector types).
 // NOTE(review): this listing omits source lines 14481-14482 — the statement
 // that initializes Res (likely an IntToPtr of an all-ones integer constant);
 // verify against upstream before relying on the exact construction.
14479 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
14480 if (Ty->getScalarType()->isPointerTy()) {
14483 IntegerType::get(Ty->getContext(),
14484 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
14485 Ty->getScalarType());
 // Widen the scalar all-ones pointer to a splat when a vector was requested.
14486 if (auto *VTy = dyn_cast<VectorType>(Ty))
14487 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
14488 return Res;
14489 }
 // Non-pointer types: the generic constant folder handles it directly.
14490 return Constant::getAllOnesValue(Ty);
14491 }
14492
 // Estimates the cost of materializing the scalars in \p VL as a vector
 // ("build vector"/gather). \p Root, when non-null, marks a rooted gather for
 // which the constant/identity shortcuts below must not fire.
14493 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
 // All-constant (non-rooted) or all-undef build vectors are free.
14494 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
14495 return TTI::TCC_Free;
14496 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14497 InstructionCost GatherCost = 0;
14498 SmallVector<Value *> Gathers(VL);
14499 if (!Root && isSplat(VL)) {
14500 // Found the broadcasting of the single scalar, calculate the cost as
14501 // the broadcast.
14502 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
14503 assert(It != VL.end() && "Expected at least one non-undef value.");
14504 // Add broadcast for non-identity shuffle only.
14505 bool NeedShuffle =
14506 count(VL, *It) > 1 &&
14507 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
14508 if (!NeedShuffle) {
 // Single live element: cost is just one insert (or a subvector insert
 // when the "scalar" is itself a fixed vector under REVEC).
14509 if (isa<FixedVectorType>(ScalarTy)) {
14510 assert(SLPReVec && "FixedVectorType is not expected.");
14511 return TTI.getShuffleCost(
14512 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
14513 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
14514 cast<FixedVectorType>(ScalarTy));
14515 }
14516 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
14517 CostKind, std::distance(VL.begin(), It),
14518 PoisonValue::get(VecTy), *It);
14519 }
14520
 // Splat with several live lanes: one insert + one broadcast-style shuffle
 // (poison lanes keep PoisonMaskElem, live lanes read lane 0).
14521 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
14522 transform(VL, ShuffleMask.begin(), [](Value *V) {
14523 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
14524 });
14525 InstructionCost InsertCost =
14526 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
14527 PoisonValue::get(VecTy), *It);
 // NOTE(review): listing omits source line 14529 — the shuffle-kind
 // argument of this call (presumably TTI::SK_Broadcast); verify upstream.
14528 return InsertCost + ::getShuffleCost(TTI,
14530 VecTy, ShuffleMask, CostKind,
14531 /*Index=*/0, /*SubTp=*/nullptr,
14532 /*Args=*/*It);
14533 }
 // General case: defer to BoUpSLP::getGatherCost for the remaining scalars.
 // NOTE(review): listing omits source line 14536 — the true branch of this
 // conditional (presumably TTI::TCC_Free for the all-undef case).
14534 return GatherCost +
14535 (all_of(Gathers, IsaPred<UndefValue>)
14537 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
14538 ScalarTy));
14539 };
14540
14541 /// Compute the cost of creating a vector containing the extracted values from
14542 /// \p VL.
 // NOTE(review): listing omits source line 14543 — the return type of this
 // method (presumably InstructionCost); verify against upstream.
14544 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
14545 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14546 unsigned NumParts) {
14547 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
 // NumElts = widest source-vector width among the extractelement operands.
14548 unsigned NumElts =
14549 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
14550 auto *EE = dyn_cast<ExtractElementInst>(V);
14551 if (!EE)
14552 return Sz;
14553 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
14554 if (!VecTy)
14555 return Sz;
14556 return std::max(Sz, VecTy->getNumElements());
14557 });
14558 // FIXME: this must be moved to TTI for better estimation.
14559 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
 // Classifies the per-register sub-mask: nullopt if it touches more than 2
 // source registers, otherwise the shuffle kind; also rewrites the sub-mask
 // into register-local lane numbers and records subvector offsets/sizes.
 // NOTE(review): listing omits source line 14561 — likely the
 // "SmallVectorImpl<unsigned> &Indices," parameter of this lambda.
14560 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
14562 SmallVectorImpl<unsigned> &SubVecSizes)
14563 -> std::optional<TTI::ShuffleKind> {
14564 if (NumElts <= EltsPerVector)
14565 return std::nullopt;
 // Offset of the first source register touched by the mask.
14566 int OffsetReg0 =
14567 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
14568 [](int S, int I) {
14569 if (I == PoisonMaskElem)
14570 return S;
14571 return std::min(S, I);
14572 }),
14573 EltsPerVector);
14574 int OffsetReg1 = OffsetReg0;
14575 DenseSet<int> RegIndices;
14576 // Check that if trying to permute same single/2 input vectors.
 // NOTE(review): listing omits source line 14577 — likely the
 // initialization of ShuffleKind (TTI::SK_PermuteSingleSrc).
14578 int FirstRegId = -1;
14579 Indices.assign(1, OffsetReg0);
14580 for (auto [Pos, I] : enumerate(Mask)) {
14581 if (I == PoisonMaskElem)
14582 continue;
14583 int Idx = I - OffsetReg0;
14584 int RegId =
14585 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
14586 if (FirstRegId < 0)
14587 FirstRegId = RegId;
14588 RegIndices.insert(RegId);
 // More than two distinct source registers — cannot model as one shuffle.
14589 if (RegIndices.size() > 2)
14590 return std::nullopt;
14591 if (RegIndices.size() == 2) {
14592 ShuffleKind = TTI::SK_PermuteTwoSrc;
14593 if (Indices.size() == 1) {
 // Compute the offset of the second source register (first index not
 // belonging to FirstRegId).
14594 OffsetReg1 = alignDown(
14595 std::accumulate(
14596 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
14597 [&](int S, int I) {
14598 if (I == PoisonMaskElem)
14599 return S;
14600 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
14601 ((I - OffsetReg0) % NumElts) / EltsPerVector;
14602 if (RegId == FirstRegId)
14603 return S;
14604 return std::min(S, I);
14605 }),
14606 EltsPerVector);
14607 unsigned Index = OffsetReg1 % NumElts;
14608 Indices.push_back(Index);
14609 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
14610 }
14611 Idx = I - OffsetReg1;
14612 }
 // Rewrite the mask element to a register-local lane (second register's
 // lanes are biased by EltsPerVector).
14613 I = (Idx % NumElts) % EltsPerVector +
14614 (RegId == FirstRegId ? 0 : EltsPerVector);
14615 }
14616 return ShuffleKind;
14617 };
14618 InstructionCost Cost = 0;
14619
14620 // Process extracts in blocks of EltsPerVector to check if the source vector
14621 // operand can be re-used directly. If not, add the cost of creating a
14622 // shuffle to extract the values into a vector register.
14623 for (unsigned Part : seq<unsigned>(NumParts)) {
14624 if (!ShuffleKinds[Part])
14625 continue;
14626 ArrayRef<int> MaskSlice = Mask.slice(
14627 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
14628 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
14629 copy(MaskSlice, SubMask.begin());
 // NOTE(review): listing omits source line 14630 — likely
 // "SmallVector<unsigned, 2> Indices;" used in the call below.
14631 SmallVector<unsigned, 2> SubVecSizes;
14632 std::optional<TTI::ShuffleKind> RegShuffleKind =
14633 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
14634 if (!RegShuffleKind) {
 // Not modelable per-register: charge a full-width shuffle unless the
 // slice is a single-source identity.
 // NOTE(review): listing omits source line 14636 — likely the
 // "!ShuffleVectorInst::isIdentityMask(" half of this condition.
14635 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
14637 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
14638 Cost +=
14639 ::getShuffleCost(TTI, *ShuffleKinds[Part],
14640 getWidenedType(ScalarTy, NumElts), MaskSlice);
14641 continue;
14642 }
 // Per-register shuffle needed unless it is a single-source identity.
14643 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
14644 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
14645 Cost +=
14646 ::getShuffleCost(TTI, *RegShuffleKind,
14647 getWidenedType(ScalarTy, EltsPerVector), SubMask);
14648 }
14649 const unsigned BaseVF = getFullVectorNumberOfElements(
14650 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
 // Charge subvector extraction for each source-register slice used.
14651 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
14652 assert((Idx + SubVecSize) <= BaseVF &&
14653 "SK_ExtractSubvector index out of range");
 // NOTE(review): listing omits source line 14654 — likely
 // "Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector," opening
 // the call completed by the next two lines.
14655 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
14656 Idx, getWidenedType(ScalarTy, SubVecSize));
14657 }
14658 // Second attempt to check, if just a permute is better estimated than
14659 // subvector extract.
14660 SubMask.assign(NumElts, PoisonMaskElem);
14661 copy(MaskSlice, SubMask.begin());
14662 InstructionCost OriginalCost = ::getShuffleCost(
14663 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
14664 if (OriginalCost < Cost)
14665 Cost = OriginalCost;
14666 }
14667 return Cost;
14668 }
14669 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
14670 /// mask \p Mask, register number \p Part, that includes \p SliceSize
14671 /// elements.
14672 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
14673 ArrayRef<int> Mask, unsigned Part,
14674 unsigned SliceSize) {
14675 if (SameNodesEstimated) {
14676 // Delay the cost estimation if the same nodes are reshuffling.
14677 // If we already requested the cost of reshuffling of E1 and E2 before, no
14678 // need to estimate another cost with the sub-Mask, instead include this
14679 // sub-Mask into the CommonMask to estimate it later and avoid double cost
14680 // estimation.
14681 if ((InVectors.size() == 2 &&
14682 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
14683 cast<const TreeEntry *>(InVectors.back()) == E2) ||
14684 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
 // Same nodes as before: just merge this part's sub-mask into
 // CommonMask; the corresponding slice must still be untouched.
14685 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
14686 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
14687 [](int Idx) { return Idx == PoisonMaskElem; }) &&
14688 "Expected all poisoned elements.");
14689 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
14690 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
14691 return;
14692 }
14693 // Found non-matching nodes - need to estimate the cost for the matched
14694 // and transform mask.
14695 Cost += createShuffle(InVectors.front(),
14696 InVectors.size() == 1 ? nullptr : InVectors.back(),
14697 CommonMask);
14698 transformMaskAfterShuffle(CommonMask, CommonMask);
14699 } else if (InVectors.size() == 2) {
 // Two pending inputs: flush their accumulated shuffle cost first.
14700 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14701 transformMaskAfterShuffle(CommonMask, CommonMask);
14702 }
14703 SameNodesEstimated = false;
14704 if (!E2 && InVectors.size() == 1) {
 // Single new node E1 joins the single pending input: bias E1's lanes by
 // the pending input's vector factor and cost one two-source shuffle.
14705 unsigned VF = E1.getVectorFactor();
14706 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
14707 VF = std::max(VF, getVF(V1));
14708 } else {
14709 const auto *E = cast<const TreeEntry *>(InVectors.front());
14710 VF = std::max(VF, E->getVectorFactor());
14711 }
14712 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14713 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14714 CommonMask[Idx] = Mask[Idx] + VF;
14715 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
14716 transformMaskAfterShuffle(CommonMask, CommonMask);
14717 } else {
 // General case: cost the E1/E2 shuffle separately, then combine its
 // (identity-positioned) result with the pending input.
14718 auto P = InVectors.front();
14719 Cost += createShuffle(&E1, E2, Mask);
14720 unsigned VF = Mask.size();
14721 if (Value *V1 = dyn_cast<Value *>(P)) {
14722 VF = std::max(VF,
14723 getNumElements(V1->getType()));
14724 } else {
14725 const auto *E = cast<const TreeEntry *>(P);
14726 VF = std::max(VF, E->getVectorFactor());
14727 }
14728 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14729 if (Mask[Idx] != PoisonMaskElem)
14730 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14731 Cost += createShuffle(P, InVectors.front(), CommonMask);
14732 transformMaskAfterShuffle(CommonMask, CommonMask);
14733 }
14734 }
14735
 // Cost-only "builder" plugged into BaseShuffleAnalysis::createShuffle: each
 // create* callback returns the TTI cost of the operation instead of emitting
 // IR. Identity/empty shuffles, identities and poison are free.
14736 class ShuffleCostBuilder {
14737 const TargetTransformInfo &TTI;
14738
 // True if \p Mask is empty or a no-op for a vector of \p VF elements.
 // NOTE(review): listing omits source lines 14743-14744 — the middle of this
 // condition (likely ShuffleVectorInst::isIdentityMask /
 // isExtractSubvectorMask checks that also set Index); verify upstream.
14739 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14740 int Index = -1;
14741 return Mask.empty() ||
14742 (VF == Mask.size() &&
14745 Index == 0);
14746 }
14747
14748 public:
14749 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14750 ~ShuffleCostBuilder() = default;
 // Two-source shuffle cost (second source value is unused for costing).
14751 InstructionCost createShuffleVector(Value *V1, Value *,
14752 ArrayRef<int> Mask) const {
14753 // Empty mask or identity mask are free.
14754 unsigned VF =
14755 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14756 if (isEmptyOrIdentity(Mask, VF))
14757 return TTI::TCC_Free;
14758 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14759 cast<VectorType>(V1->getType()), Mask);
14760 }
 // Single-source shuffle cost.
14761 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14762 // Empty mask or identity mask are free.
14763 unsigned VF =
14764 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
14765 if (isEmptyOrIdentity(Mask, VF))
14766 return TTI::TCC_Free;
14767 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
14768 cast<VectorType>(V1->getType()), Mask);
14769 }
 // Identity and poison materialization cost nothing in this model.
14770 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14771 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14772 return TTI::TCC_Free;
14773 }
 // No-op: value resizing has no cost-model side effects here.
14774 void resizeToMatch(Value *&, Value *&) const {}
14775 };
14776
14777 /// Smart shuffle instruction emission, walks through shuffles trees and
14778 /// tries to find the best matching vector for the actual shuffle
14779 /// instruction.
 // NOTE(review): listing omits source lines 14780 and 14782 — the return type
 // (likely InstructionCost) and the second parameter (likely
 // "const PointerUnion<Value *, const TreeEntry *> &P2,"); verify upstream.
14781 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14783 ArrayRef<int> Mask) {
14784 ShuffleCostBuilder Builder(TTI);
14785 SmallVector<int> CommonMask(Mask);
 // V1/V2 are non-null only when the corresponding input is a plain Value.
14786 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14787 unsigned CommonVF = Mask.size();
14788 InstructionCost ExtraCost = 0;
 // Cost of the extend/trunc needed when a tree entry's (possibly
 // MinBWs-narrowed) scalar type differs from ScalarTy.
14789 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14790 unsigned VF) -> InstructionCost {
14791 if (E.isGather() && allConstant(E.Scalars))
14792 return TTI::TCC_Free;
14793 Type *EScalarTy = E.Scalars.front()->getType();
14794 bool IsSigned = true;
14795 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
14796 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
14797 IsSigned = It->second.second;
14798 }
14799 if (EScalarTy != ScalarTy) {
14800 unsigned CastOpcode = Instruction::Trunc;
14801 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14802 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14803 if (DstSz > SrcSz)
14804 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14805 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
14806 getWidenedType(EScalarTy, VF),
14807 TTI::CastContextHint::None, CostKind);
14808 }
14809 return TTI::TCC_Free;
14810 };
 // Same idea for plain vector values (signedness inferred via known bits).
14811 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14812 if (isa<Constant>(V))
14813 return TTI::TCC_Free;
14814 auto *VecTy = cast<VectorType>(V->getType());
14815 Type *EScalarTy = VecTy->getElementType();
14816 if (EScalarTy != ScalarTy) {
14817 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
14818 unsigned CastOpcode = Instruction::Trunc;
14819 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14820 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14821 if (DstSz > SrcSz)
14822 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14823 return TTI.getCastInstrCost(
14824 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
14825 VecTy, TTI::CastContextHint::None, CostKind);
14826 }
14827 return TTI::TCC_Free;
14828 };
 // The five cases below normalize both inputs to placeholder constant
 // vectors of CommonVF elements (null for input 1, all-ones for input 2) so
 // BaseShuffleAnalysis::createShuffle can cost the final mask uniformly.
14829 if (!V1 && !V2 && !P2.isNull()) {
14830 // Shuffle 2 entry nodes.
14831 const TreeEntry *E = cast<const TreeEntry *>(P1);
14832 unsigned VF = E->getVectorFactor();
14833 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14834 CommonVF = std::max(VF, E2->getVectorFactor());
14835 assert(all_of(Mask,
14836 [=](int Idx) {
14837 return Idx < 2 * static_cast<int>(CommonVF);
14838 }) &&
14839 "All elements in mask must be less than 2 * CommonVF.");
14840 if (E->Scalars.size() == E2->Scalars.size()) {
 // Fold each node's own common (reuse/reorder) mask into CommonMask.
14841 SmallVector<int> EMask = E->getCommonMask();
14842 SmallVector<int> E2Mask = E2->getCommonMask();
14843 if (!EMask.empty() || !E2Mask.empty()) {
14844 for (int &Idx : CommonMask) {
14845 if (Idx == PoisonMaskElem)
14846 continue;
14847 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14848 Idx = EMask[Idx];
14849 else if (Idx >= static_cast<int>(CommonVF))
14850 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14851 E->Scalars.size();
14852 }
14853 }
14854 CommonVF = E->Scalars.size();
14855 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14856 GetNodeMinBWAffectedCost(*E2, CommonVF);
14857 } else {
14858 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14859 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14860 }
14861 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14862 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14863 } else if (!V1 && P2.isNull()) {
14864 // Shuffle single entry node.
14865 const TreeEntry *E = cast<const TreeEntry *>(P1);
14866 unsigned VF = E->getVectorFactor();
14867 CommonVF = VF;
14868 assert(
14869 all_of(Mask,
14870 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14871 "All elements in mask must be less than CommonVF.");
14872 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14873 SmallVector<int> EMask = E->getCommonMask();
14874 assert(!EMask.empty() && "Expected non-empty common mask.");
14875 for (int &Idx : CommonMask) {
14876 if (Idx != PoisonMaskElem)
14877 Idx = EMask[Idx];
14878 }
14879 CommonVF = E->Scalars.size();
 // NOTE(review): listing omits source line 14882 — the rest of this
 // de-interleave condition (likely a
 // ShuffleVectorInst::isDeInterleaveMaskOfFactor check); verify upstream.
14880 } else if (unsigned Factor = E->getInterleaveFactor();
14881 Factor > 0 && E->Scalars.size() != Mask.size() &&
14883 Factor)) {
14884 // Deinterleaved nodes are free.
14885 std::iota(CommonMask.begin(), CommonMask.end(), 0);
14886 }
14887 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14888 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14889 // Not identity/broadcast? Try to see if the original vector is better.
14890 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14891 CommonVF == CommonMask.size() &&
14892 any_of(enumerate(CommonMask),
14893 [](const auto &&P) {
14894 return P.value() != PoisonMaskElem &&
14895 static_cast<unsigned>(P.value()) != P.index();
14896 }) &&
14897 any_of(CommonMask,
14898 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14899 SmallVector<int> ReorderMask;
14900 inversePermutation(E->ReorderIndices, ReorderMask);
14901 ::addMask(CommonMask, ReorderMask);
14902 }
14903 } else if (V1 && P2.isNull()) {
14904 // Shuffle single vector.
14905 ExtraCost += GetValueMinBWAffectedCost(V1);
14906 CommonVF = getVF(V1);
14907 assert(
14908 all_of(Mask,
14909 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14910 "All elements in mask must be less than CommonVF.");
14911 } else if (V1 && !V2) {
14912 // Shuffle vector and tree node.
14913 unsigned VF = getVF(V1);
14914 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
14915 CommonVF = std::max(VF, E2->getVectorFactor());
14916 assert(all_of(Mask,
14917 [=](int Idx) {
14918 return Idx < 2 * static_cast<int>(CommonVF);
14919 }) &&
14920 "All elements in mask must be less than 2 * CommonVF.");
14921 if (E2->Scalars.size() == VF && VF != CommonVF) {
14922 SmallVector<int> E2Mask = E2->getCommonMask();
14923 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14924 for (int &Idx : CommonMask) {
14925 if (Idx == PoisonMaskElem)
14926 continue;
14927 if (Idx >= static_cast<int>(CommonVF))
14928 Idx = E2Mask[Idx - CommonVF] + VF;
14929 }
14930 CommonVF = VF;
14931 }
14932 ExtraCost += GetValueMinBWAffectedCost(V1);
14933 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14934 ExtraCost += GetNodeMinBWAffectedCost(
14935 *E2, std::min(CommonVF, E2->getVectorFactor()));
14936 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14937 } else if (!V1 && V2) {
14938 // Shuffle vector and tree node.
14939 unsigned VF = getVF(V2);
14940 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
14941 CommonVF = std::max(VF, E1->getVectorFactor());
14942 assert(all_of(Mask,
14943 [=](int Idx) {
14944 return Idx < 2 * static_cast<int>(CommonVF);
14945 }) &&
14946 "All elements in mask must be less than 2 * CommonVF.");
14947 if (E1->Scalars.size() == VF && VF != CommonVF) {
14948 SmallVector<int> E1Mask = E1->getCommonMask();
14949 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14950 for (int &Idx : CommonMask) {
14951 if (Idx == PoisonMaskElem)
14952 continue;
14953 if (Idx >= static_cast<int>(CommonVF))
14954 Idx = E1Mask[Idx - CommonVF] + VF;
14955 else
14956 Idx = E1Mask[Idx];
14957 }
14958 CommonVF = VF;
14959 }
14960 ExtraCost += GetNodeMinBWAffectedCost(
14961 *E1, std::min(CommonVF, E1->getVectorFactor()));
14962 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14963 ExtraCost += GetValueMinBWAffectedCost(V2);
14964 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14965 } else {
14966 assert(V1 && V2 && "Expected both vectors.");
14967 unsigned VF = getVF(V1);
14968 CommonVF = std::max(VF, getVF(V2));
14969 assert(all_of(Mask,
14970 [=](int Idx) {
14971 return Idx < 2 * static_cast<int>(CommonVF);
14972 }) &&
14973 "All elements in mask must be less than 2 * CommonVF.");
14974 ExtraCost +=
14975 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14976 if (V1->getType() != V2->getType()) {
14977 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14978 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14979 } else {
14980 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
14981 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
14982 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
14983 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
14984 }
14985 }
 // Collapse pending inputs to a single placeholder of the result width.
14986 InVectors.front() =
14987 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14988 if (InVectors.size() == 2)
14989 InVectors.pop_back();
14990 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14991 V1, V2, CommonMask, Builder, ScalarTy);
14992 }
14993
14994public:
 // Constructor: captures the cost context and seeds VectorizedVals.
 // NOTE(review): listing omits source line 14995 — the constructor's opening
 // line (likely "ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo
 // &TTI," given the initializer list below); verify against upstream.
14996 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14997 SmallPtrSetImpl<Value *> &CheckedExtracts)
14998 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14999 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
15000 CheckedExtracts(CheckedExtracts) {}
 // Adjusts the running Cost for a gather node built from extractelements:
 // takes credit for extracts (and extract+ext pairs) that become dead after
 // vectorization, charges the remaining scalarization overhead, and seeds
 // InVectors/CommonMask. Returns the (single) source vector, a placeholder
 // when multiple bases force UseVecBaseAsInput, or nullptr for an empty mask.
15001 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
15002 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15003 unsigned NumParts, bool &UseVecBaseAsInput) {
15004 UseVecBaseAsInput = false;
15005 if (Mask.empty())
15006 return nullptr;
15007 Value *VecBase = nullptr;
15008 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
 // Work on the scalars in their reordered positions so indices line up
 // with Mask.
15009 if (!E->ReorderIndices.empty()) {
15010 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
15011 E->ReorderIndices.end());
15012 reorderScalars(VL, ReorderMask);
15013 }
15014 // Check if it can be considered reused if same extractelements were
15015 // vectorized already.
15016 bool PrevNodeFound = any_of(
15017 ArrayRef(R.VectorizableTree).take_front(E->Idx),
15018 [&](const std::unique_ptr<TreeEntry> &TE) {
15019 return ((TE->hasState() && !TE->isAltShuffle() &&
15020 TE->getOpcode() == Instruction::ExtractElement) ||
15021 TE->isGather()) &&
15022 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
15023 return VL.size() > Data.index() &&
15024 (Mask[Data.index()] == PoisonMaskElem ||
15025 isa<UndefValue>(VL[Data.index()]) ||
15026 Data.value() == VL[Data.index()]);
15027 });
15028 });
15029 SmallPtrSet<Value *, 4> UniqueBases;
15030 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
 // Per source vector: the set of lanes demanded by dead extracts.
15031 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
15032 for (unsigned Part : seq<unsigned>(NumParts)) {
15033 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
15034 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
15035 for (auto [I, V] :
15036 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
15037 // Ignore non-extractelement scalars.
15038 if (isa<UndefValue>(V) ||
15039 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
15040 continue;
15041 // If all users of instruction are going to be vectorized and this
15042 // instruction itself is not going to be vectorized, consider this
15043 // instruction as dead and remove its cost from the final cost of the
15044 // vectorized tree.
15045 // Also, avoid adjusting the cost for extractelements with multiple uses
15046 // in different graph entries.
15047 auto *EE = cast<ExtractElementInst>(V);
15048 VecBase = EE->getVectorOperand();
15049 UniqueBases.insert(VecBase);
15050 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
 // Skip extracts we may not take credit for: already checked, live
 // scalar users, deleted/regathered entries, copyable-user mismatch,
 // GEP users that stay scalar, or belonging to a different entry.
15051 if (!CheckedExtracts.insert(V).second ||
15052 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
15053 any_of(VEs,
15054 [&](const TreeEntry *TE) {
15055 return R.DeletedNodes.contains(TE) ||
15056 R.TransformedToGatherNodes.contains(TE);
15057 }) ||
15058 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
15059 !R.isVectorized(EE) &&
15060 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
15061 count_if(E->UserTreeIndex.UserTE->Scalars,
15062 [&](Value *V) { return V == EE; })) ||
15063 any_of(EE->users(),
15064 [&](User *U) {
15065 return isa<GetElementPtrInst>(U) &&
15066 !R.areAllUsersVectorized(cast<Instruction>(U),
15067 &VectorizedVals);
15068 }) ||
15069 (!VEs.empty() && !is_contained(VEs, E)))
15070 continue;
15071 std::optional<unsigned> EEIdx = getExtractIndex(EE);
15072 if (!EEIdx)
15073 continue;
15074 unsigned Idx = *EEIdx;
15075 // Take credit for instruction that will become dead.
15076 if (EE->hasOneUse() || !PrevNodeFound) {
15077 Instruction *Ext = EE->user_back();
 // NOTE(review): listing omits source line 15079 — the second half of
 // this condition (likely a check that all of Ext's users keep it
 // foldable, e.g. all_of(Ext->users(), ...)); verify upstream.
15078 if (isa<SExtInst, ZExtInst>(Ext) &&
15080 // Use getExtractWithExtendCost() to calculate the cost of
15081 // extractelement/ext pair.
15082 Cost -= TTI.getExtractWithExtendCost(
15083 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
15084 Idx, CostKind);
15085 // Add back the cost of s|zext which is subtracted separately.
 // NOTE(review): listing omits source line 15088 — the trailing
 // arguments of this call (likely TTI::CastContextHint::None,
 // CostKind); verify upstream.
15086 Cost += TTI.getCastInstrCost(
15087 Ext->getOpcode(), Ext->getType(), EE->getType(),
15089 continue;
15090 }
15091 }
15092 APInt &DemandedElts =
15093 VectorOpsToExtracts
15094 .try_emplace(VecBase,
15095 APInt::getZero(getNumElements(VecBase->getType())))
15096 .first->getSecond();
15097 DemandedElts.setBit(Idx);
15098 }
15099 }
 // NOTE(review): listing omits source line 15101 — the statement head of
 // this loop body (likely "Cost -= TTI.getScalarizationOverhead(..." taking
 // credit for the demanded extracts); verify upstream.
15100 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
15102 DemandedElts, /*Insert=*/false,
15103 /*Extract=*/true, CostKind);
15104 // Check that gather of extractelements can be represented as just a
15105 // shuffle of a single/two vectors the scalars are extracted from.
15106 // Found the bunch of extractelement instructions that must be gathered
15107 // into a vector and can be represented as a permutation elements in a
15108 // single input vector or of 2 input vectors.
15109 // Done for reused if same extractelements were vectorized already.
15110 if (!PrevNodeFound)
15111 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
 // Seed the estimator state with this entry and its mask.
15112 InVectors.assign(1, E);
15113 CommonMask.assign(Mask.begin(), Mask.end());
15114 transformMaskAfterShuffle(CommonMask, CommonMask);
15115 SameNodesEstimated = false;
15116 if (NumParts != 1 && UniqueBases.size() != 1) {
15117 UseVecBaseAsInput = true;
15118 VecBase =
15119 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
15120 }
15121 return VecBase;
15122 }
15123 /// Checks if the specified entry \p E needs to be delayed because of its
15124 /// dependency nodes.
/// \returns std::nullopt unconditionally: during cost analysis no node is
/// ever delayed (contrast with the codegen-time counterpart).
/// NOTE(review): the second parameter line (original line 15127) is missing
/// from this extraction - verify the full signature against upstream
/// SLPVectorizer.cpp.
15125 std::optional<InstructionCost>
15126 needToDelay(const TreeEntry *,
15128 // No need to delay the cost estimation during analysis.
15129 return std::nullopt;
15130 }
15131 /// Reset the builder to handle perfect diamond match.
/// NOTE(review): the function signature line (original line 15132,
/// presumably `void resetForSameNode() {`) is missing from this
/// extraction - confirm against upstream SLPVectorizer.cpp.
/// Drops all accumulated shuffle-builder state (masks, input vectors,
/// running cost) so estimation can restart for the matched node.
15133 IsFinalized = false;
15134 CommonMask.clear();
15135 InVectors.clear();
15136 Cost = 0;
15137 VectorizedVals.clear();
// Start from the optimistic assumption used by estimateNodesPermuteCost.
15138 SameNodesEstimated = true;
15139 }
15140 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
15141 if (&E1 == &E2) {
15142 assert(all_of(Mask,
15143 [&](int Idx) {
15144 return Idx < static_cast<int>(E1.getVectorFactor());
15145 }) &&
15146 "Expected single vector shuffle mask.");
15147 add(E1, Mask);
15148 return;
15149 }
15150 if (InVectors.empty()) {
15151 CommonMask.assign(Mask.begin(), Mask.end());
15152 InVectors.assign({&E1, &E2});
15153 return;
15154 }
15155 assert(!CommonMask.empty() && "Expected non-empty common mask.");
15156 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15157 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
15158 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
15159 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
15160 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15161 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
15162 }
15163 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
15164 if (InVectors.empty()) {
15165 CommonMask.assign(Mask.begin(), Mask.end());
15166 InVectors.assign(1, &E1);
15167 return;
15168 }
15169 assert(!CommonMask.empty() && "Expected non-empty common mask.");
15170 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15171 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
15172 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
15173 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
15174 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15175 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
15176 if (!SameNodesEstimated && InVectors.size() == 1)
15177 InVectors.emplace_back(&E1);
15178 }
15179 /// Adds 2 input vectors and the mask for their shuffling.
15180 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
15181 // May come only for shuffling of 2 vectors with extractelements, already
15182 // handled in adjustExtracts.
15183 assert(InVectors.size() == 1 &&
15184 all_of(enumerate(CommonMask),
15185 [&](auto P) {
15186 if (P.value() == PoisonMaskElem)
15187 return Mask[P.index()] == PoisonMaskElem;
15188 auto *EI = cast<ExtractElementInst>(
15189 cast<const TreeEntry *>(InVectors.front())
15190 ->getOrdered(P.index()));
15191 return EI->getVectorOperand() == V1 ||
15192 EI->getVectorOperand() == V2;
15193 }) &&
15194 "Expected extractelement vectors.");
15195 }
15196 /// Adds another one input vector and the mask for the shuffling.
15197 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
15198 if (InVectors.empty()) {
15199 assert(CommonMask.empty() && !ForExtracts &&
15200 "Expected empty input mask/vectors.");
15201 CommonMask.assign(Mask.begin(), Mask.end());
15202 InVectors.assign(1, V1);
15203 return;
15204 }
15205 if (ForExtracts) {
15206 // No need to add vectors here, already handled them in adjustExtracts.
15207 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
15208 !CommonMask.empty() &&
15209 all_of(enumerate(CommonMask),
15210 [&](auto P) {
15211 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
15212 ->getOrdered(P.index());
15213 if (P.value() == PoisonMaskElem)
15214 return P.value() == Mask[P.index()] ||
15215 isa<UndefValue>(Scalar);
15216 if (isa<Constant>(V1))
15217 return true;
15218 auto *EI = cast<ExtractElementInst>(Scalar);
15219 return EI->getVectorOperand() == V1;
15220 }) &&
15221 "Expected only tree entry for extractelement vectors.");
15222 return;
15223 }
15224 assert(!InVectors.empty() && !CommonMask.empty() &&
15225 "Expected only tree entries from extracts/reused buildvectors.");
15226 unsigned VF = getVF(V1);
15227 if (InVectors.size() == 2) {
15228 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15229 transformMaskAfterShuffle(CommonMask, CommonMask);
15230 VF = std::max<unsigned>(VF, CommonMask.size());
15231 } else if (const auto *InTE =
15232 InVectors.front().dyn_cast<const TreeEntry *>()) {
15233 VF = std::max(VF, InTE->getVectorFactor());
15234 } else {
15235 VF = std::max(
15236 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
15237 ->getNumElements());
15238 }
15239 InVectors.push_back(V1);
15240 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15241 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
15242 CommonMask[Idx] = Mask[Idx] + VF;
15243 }
/// Accounts the build-vector cost of \p VL and returns a placeholder
/// constant of the gathered shape (costs only - no real IR is emitted).
15244 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
15245 Value *Root = nullptr) {
15246 Cost += getBuildVectorCost(VL, Root);
15247 if (!Root) {
15248 // FIXME: Need to find a way to avoid use of getNullValue here.
// NOTE(review): the declaration of `Vals` (original line 15249,
// presumably `SmallVector<Constant *> Vals;`) is missing from this
// extraction - confirm against upstream.
15250 unsigned VF = VL.size();
15251 if (MaskVF != 0)
15252 VF = std::min(VF, MaskVF);
15253 Type *VLScalarTy = VL.front()->getType();
// Build a constant placeholder: poison/undef are preserved, everything
// else becomes a null value of the scalar type.
15254 for (Value *V : VL.take_front(VF)) {
15255 Type *ScalarTy = VLScalarTy->getScalarType();
15256 if (isa<PoisonValue>(V)) {
15257 Vals.push_back(PoisonValue::get(ScalarTy));
15258 continue;
15259 }
15260 if (isa<UndefValue>(V)) {
15261 Vals.push_back(UndefValue::get(ScalarTy));
15262 continue;
15263 }
15264 Vals.push_back(Constant::getNullValue(ScalarTy));
15265 }
15266 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
15267 assert(SLPReVec && "FixedVectorType is not expected.");
15268 // When REVEC is enabled, we need to expand vector types into scalar
15269 // types.
15270 Vals = replicateMask(Vals, VecTy->getNumElements());
15271 }
15272 return ConstantVector::get(Vals);
15273 }
// NOTE(review): the start of the Root path's return statement (original
// lines 15274-15275, presumably a `ConstantVector::getSplat(
// ElementCount::getFixed(` expression) is missing from this extraction -
// the surviving lines below are its trailing arguments.
15276 cast<FixedVectorType>(Root->getType())->getNumElements()),
15277 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
15278 }
15280 /// Finalize emission of the shuffles.
/// Combines the pending input vectors with CommonMask, applies the
/// optional \p Action callback, inserts \p SubVectors, applies \p ExtMask,
/// and returns the total accumulated cost.
/// NOTE(review): the declaration lines (original 15281 - the return type
/// and function name, and 15285-15286 - the `Action` callback's
/// function_ref parameter type) are missing from this extraction; confirm
/// the full signature against upstream SLPVectorizer.cpp.
15282 ArrayRef<int> ExtMask,
15283 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
15284 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
15287 Action = {}) {
15288 IsFinalized = true;
15289 if (Action) {
// Flatten the pending inputs into a single value before running the
// user-supplied action on it.
15290 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
15291 if (InVectors.size() == 2)
15292 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
15293 else
15294 Cost += createShuffle(Vec, nullptr, CommonMask);
15295 transformMaskAfterShuffle(CommonMask, CommonMask);
15296 assert(VF > 0 &&
15297 "Expected vector length for the final value before action.");
15298 Value *V = cast<Value *>(Vec);
// The action may request extra shuffles; account their cost too.
15299 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
15300 Cost += createShuffle(V1, V2, Mask);
15301 return V1;
15302 });
15303 InVectors.front() = V;
15304 }
15305 if (!SubVectors.empty()) {
15306 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
15307 if (InVectors.size() == 2)
15308 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
15309 else
15310 Cost += createShuffle(Vec, nullptr, CommonMask);
15311 transformMaskAfterShuffle(CommonMask, CommonMask);
15312 // Add subvectors permutation cost.
15313 if (!SubVectorsMask.empty()) {
15314 assert(SubVectorsMask.size() <= CommonMask.size() &&
15315 "Expected same size of masks for subvectors and common mask.");
15316 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
15317 copy(SubVectorsMask, SVMask.begin());
// Merge: lanes already defined by CommonMask are taken from the second
// source (offset by the mask width).
15318 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
15319 if (I2 != PoisonMaskElem) {
15320 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
15321 I1 = I2 + CommonMask.size();
15322 }
15323 }
// NOTE(review): the head of this cost accumulation (original line 15324,
// presumably `Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,`) is
// missing from this extraction - the lines below are its arguments.
15325 getWidenedType(ScalarTy, CommonMask.size()),
15326 SVMask, CostKind);
15327 }
15328 for (auto [E, Idx] : SubVectors) {
15329 Type *EScalarTy = E->Scalars.front()->getType();
15330 bool IsSigned = true;
// Honor the minimal bitwidth computed for the subvector entry, if any.
15331 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
15332 EScalarTy =
15333 IntegerType::get(EScalarTy->getContext(), It->second.first);
15334 IsSigned = It->second.second;
15335 }
15336 if (ScalarTy != EScalarTy) {
// Widen or truncate the subvector to the common scalar type.
15337 unsigned CastOpcode = Instruction::Trunc;
15338 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15339 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15340 if (DstSz > SrcSz)
15341 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15342 Cost += TTI.getCastInstrCost(
15343 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
15344 getWidenedType(EScalarTy, E->getVectorFactor()),
// NOTE(review): the trailing arguments of getCastInstrCost (original
// line 15345, presumably `TTI::CastContextHint::None, CostKind);`) are
// missing from this extraction.
15346 }
// NOTE(review): the head of this accumulation (original lines 15347-15348,
// presumably `Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector,`) is
// missing from this extraction - the lines below are its arguments.
15349 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
15350 getWidenedType(ScalarTy, E->getVectorFactor()));
// Mark the inserted lanes as an identity range in the common mask.
15351 if (!CommonMask.empty()) {
15352 std::iota(std::next(CommonMask.begin(), Idx),
15353 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
15354 Idx);
15355 }
15356 }
15357 }
15358
// Remap the common mask through the external extract mask, if provided.
15359 if (!ExtMask.empty()) {
15360 if (CommonMask.empty()) {
15361 CommonMask.assign(ExtMask.begin(), ExtMask.end());
15362 } else {
15363 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
15364 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
15365 if (ExtMask[I] == PoisonMaskElem)
15366 continue;
15367 NewMask[I] = CommonMask[ExtMask[I]];
15368 }
15369 CommonMask.swap(NewMask);
15370 }
15371 }
15372 if (CommonMask.empty()) {
15373 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
15374 return Cost;
15375 }
// Final shuffle of whatever inputs remain pending.
15376 return Cost +
15377 createShuffle(InVectors.front(),
15378 InVectors.size() == 2 ? InVectors.back() : nullptr,
15379 CommonMask);
15380 }
15381
// Destructor body. NOTE(review): the destructor's signature line (original
// line 15382) is missing from this extraction - confirm against upstream.
// Verifies that finalize() was called (or nothing was accumulated) before
// the cost builder is destroyed.
15383 assert((IsFinalized || CommonMask.empty()) &&
15384 "Shuffle construction must be finalized.");
15385 }
15386};
15387
15388const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
15389 unsigned Idx) const {
15390 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
15391 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
15392 return Op;
15393}
15394
/// Maps the vectorization state of \p TE to a cast-context hint for TTI
/// cost queries.
/// NOTE(review): every `return TTI::CastContextHint::...;` line (original
/// lines 15398, 15400, 15404, 15408, 15410) is missing from this
/// extraction (they were hyperlinks on the doc page) - confirm the exact
/// enumerators against upstream SLPVectorizer.cpp.
15395 TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
15396 if (TE.State == TreeEntry::ScatterVectorize ||
15397 TE.State == TreeEntry::StridedVectorize)
// (missing return - presumably the gather/scatter hint)
15399 if (TE.State == TreeEntry::CompressVectorize)
// (missing return - presumably the masked-load hint)
15401 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
15402 !TE.isAltShuffle()) {
15403 if (TE.ReorderIndices.empty())
// (missing return - presumably the normal-load hint)
15405 SmallVector<int> Mask;
15406 inversePermutation(TE.ReorderIndices, Mask);
15407 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
// (missing return - presumably the reversed-load hint)
15409 }
// (missing default return - presumably CastContextHint::None)
15411 }
15412
15413/// Get the assumed loop trip count for the loop \p L.
15414static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE) {
15415 if (LoopAwareTripCount == 0)
15416 return 1;
15417 unsigned Scale = SE.getSmallConstantTripCount(L);
15418 if (Scale == 0)
15419 Scale = getLoopEstimatedTripCount(const_cast<Loop *>(L)).value_or(0);
15420 if (Scale != 0) {
15421 // Multiple exiting blocks - choose the minimum between trip count (scale)
15422 // and LoopAwareTripCount, since the multiple exit loops can be terminated
15423 // early.
15424 if (!L->getExitingBlock())
15425 return std::min<unsigned>(LoopAwareTripCount, Scale);
15426 return Scale;
15427 }
15428 return LoopAwareTripCount;
15429}
15430
/// Computes the loop-trip-count scale factor to apply to the cost of
/// \p TE (or of \p Scalar used by \p U): the product of trip counts of
/// the loop nest between the innermost non-invariant loop and the loop
/// containing the node, cached in LoopToScaleFactor.
15431 unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
15432 Instruction *U) {
// First locate the basic block whose enclosing loop determines the scale.
15433 BasicBlock *Parent = nullptr;
15434 if (U) {
15435 Parent = U->getParent();
15436 } else if (TE.isGather() || TE.State == TreeEntry::SplitVectorize) {
// Gather/split nodes have no main instruction - walk up the user chain
// until a vectorized user with a real position is found.
15437 EdgeInfo EI = TE.UserTreeIndex;
15438 while (EI.UserTE) {
15439 if (EI.UserTE->isGather() ||
15440 EI.UserTE->State == TreeEntry::SplitVectorize) {
15441 EI = EI.UserTE->UserTreeIndex;
15442 continue;
15443 }
// For a PHI user the node materializes in the incoming block of the
// corresponding edge, not in the PHI's own block.
15444 if (EI.UserTE->State == TreeEntry::Vectorize &&
15445 EI.UserTE->getOpcode() == Instruction::PHI) {
15446 auto *PH = cast<PHINode>(EI.UserTE->getMainOp());
15447 Parent = PH->getIncomingBlock(EI.EdgeIdx);
15448 } else {
15449 Parent = EI.UserTE->getMainOp()->getParent();
15450 }
15451 break;
15452 }
15453 if (!Parent)
15454 return 1;
15455 } else {
15456 Parent = TE.getMainOp()->getParent();
15457 }
15458 if (const Loop *L = LI->getLoopFor(Parent)) {
// Reuse a previously computed scale for this loop, if any.
15459 const auto It = LoopToScaleFactor.find(L);
15460 if (It != LoopToScaleFactor.end())
15461 return It->second;
15462 unsigned Scale = 1;
15463 if (const Loop *NonInvL = findInnermostNonInvariantLoop(
15464 L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars))) {
// Accumulate trip counts from the innermost non-invariant loop outward,
// caching the running product for each intermediate loop. If a cached
// value is hit, fold it in and stop early.
15465 Scale = getLoopTripCount(NonInvL, *SE);
15466 for (const Loop *LN : getLoopNest(NonInvL)) {
15467 if (LN == L)
15468 break;
15469 auto LNRes = LoopToScaleFactor.try_emplace(LN, 0);
15470 auto &LoopScale = LNRes.first->getSecond();
15471 if (!LNRes.second) {
15472 Scale *= LoopScale;
15473 break;
15474 }
15475 Scale *= getLoopTripCount(LN, *SE);
15476 LoopScale = Scale;
15477 }
15478 }
15479 LoopToScaleFactor.try_emplace(L, Scale);
15480 return Scale;
15481 }
// Not inside any loop - no scaling.
15482 return 1;
15483 }
15484
15486BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
15487 SmallPtrSetImpl<Value *> &CheckedExtracts) {
15488 ArrayRef<Value *> VL = E->Scalars;
15489
15490 Type *ScalarTy = getValueType(VL[0]);
15491 if (!isValidElementType(ScalarTy))
15492 return InstructionCost::getInvalid();
15494
15495 // If we have computed a smaller type for the expression, update VecTy so
15496 // that the costs will be accurate.
15497 auto It = MinBWs.find(E);
15498 Type *OrigScalarTy = ScalarTy;
15499 if (It != MinBWs.end()) {
15500 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15501 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15502 if (VecTy)
15503 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15504 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
15505 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
15506 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
15507 }
15508 auto *VecTy = getWidenedType(ScalarTy, VL.size());
15509 unsigned EntryVF = E->getVectorFactor();
15510 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
15511
15512 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
15513 if (allConstant(VL))
15514 return 0;
15515 if (isa<InsertElementInst>(VL[0]))
15516 return InstructionCost::getInvalid();
15517 if (isa<CmpInst>(VL.front()))
15518 ScalarTy = VL.front()->getType();
15519 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
15520 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
15521 }
15522 if (E->State == TreeEntry::SplitVectorize) {
15523 assert(E->CombinedEntriesWithIndices.size() == 2 &&
15524 "Expected exactly 2 combined entries.");
15525 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
15526 InstructionCost VectorCost = 0;
15527 if (E->ReorderIndices.empty()) {
15528 VectorCost = ::getShuffleCost(
15529 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
15530 E->CombinedEntriesWithIndices.back().second,
15532 ScalarTy,
15533 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15534 ->getVectorFactor()));
15535 } else {
15536 unsigned CommonVF =
15537 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
15538 ->getVectorFactor(),
15539 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15540 ->getVectorFactor());
15541 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
15542 getWidenedType(ScalarTy, CommonVF),
15543 E->getSplitMask(), CostKind);
15544 }
15545 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
15546 return VectorCost;
15547 }
15548 InstructionCost CommonCost = 0;
15549 SmallVector<int> Mask;
15550 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
15551 (E->State != TreeEntry::StridedVectorize ||
15552 !isReverseOrder(E->ReorderIndices))) {
15553 SmallVector<int> NewMask;
15554 if (E->getOpcode() == Instruction::Store) {
15555 // For stores the order is actually a mask.
15556 NewMask.resize(E->ReorderIndices.size());
15557 copy(E->ReorderIndices, NewMask.begin());
15558 } else {
15559 inversePermutation(E->ReorderIndices, NewMask);
15560 }
15561 ::addMask(Mask, NewMask);
15562 }
15563 if (!E->ReuseShuffleIndices.empty())
15564 ::addMask(Mask, E->ReuseShuffleIndices);
15565 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
15566 CommonCost =
15567 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
15568 assert((E->State == TreeEntry::Vectorize ||
15569 E->State == TreeEntry::ScatterVectorize ||
15570 E->State == TreeEntry::StridedVectorize ||
15571 E->State == TreeEntry::CompressVectorize) &&
15572 "Unhandled state");
15573 assert(E->getOpcode() &&
15574 ((allSameType(VL) && allSameBlock(VL)) ||
15575 (E->getOpcode() == Instruction::GetElementPtr &&
15576 E->getMainOp()->getType()->isPointerTy()) ||
15577 E->hasCopyableElements()) &&
15578 "Invalid VL");
15579 Instruction *VL0 = E->getMainOp();
15580 unsigned ShuffleOrOp =
15581 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15582 if (E->CombinedOp != TreeEntry::NotCombinedOp)
15583 ShuffleOrOp = E->CombinedOp;
15584 SmallSetVector<Value *, 16> UniqueValues;
15585 SmallVector<unsigned, 16> UniqueIndexes;
15586 for (auto [Idx, V] : enumerate(VL))
15587 if (UniqueValues.insert(V))
15588 UniqueIndexes.push_back(Idx);
15589 const unsigned Sz = UniqueValues.size();
15590 SmallBitVector UsedScalars(Sz, false);
15591 for (unsigned I = 0; I < Sz; ++I) {
15592 if (isa<Instruction>(UniqueValues[I]) &&
15593 !E->isCopyableElement(UniqueValues[I]) &&
15594 getTreeEntries(UniqueValues[I]).front() == E)
15595 continue;
15596 UsedScalars.set(I);
15597 }
15598 auto GetCastContextHint = [&](Value *V) {
15599 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
15600 return getCastContextHint(*OpTEs.front());
15601 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
15602 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15603 !SrcState.isAltShuffle())
15606 };
15607 auto GetCostDiff =
15608 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
15609 function_ref<InstructionCost(InstructionCost)> VectorCost) {
15610 // Calculate the cost of this instruction.
15611 InstructionCost ScalarCost = 0;
15612 if (isa<CastInst, CallInst>(VL0)) {
15613 // For some of the instructions no need to calculate cost for each
15614 // particular instruction, we can use the cost of the single
15615 // instruction x total number of scalar instructions.
15616 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15617 } else {
15618 for (unsigned I = 0; I < Sz; ++I) {
15619 if (UsedScalars.test(I))
15620 continue;
15621 ScalarCost += ScalarEltCost(I);
15622 }
15623 }
15624
15625 InstructionCost VecCost = VectorCost(CommonCost);
15626 // Check if the current node must be resized, if the parent node is not
15627 // resized.
15628 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
15629 E->Idx != 0 &&
15630 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
15631 const EdgeInfo &EI = E->UserTreeIndex;
15632 if (!EI.UserTE->hasState() ||
15633 EI.UserTE->getOpcode() != Instruction::Select ||
15634 EI.EdgeIdx != 0) {
15635 auto UserBWIt = MinBWs.find(EI.UserTE);
15636 Type *UserScalarTy =
15637 (EI.UserTE->isGather() ||
15638 EI.UserTE->State == TreeEntry::SplitVectorize)
15639 ? EI.UserTE->Scalars.front()->getType()
15640 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
15641 if (UserBWIt != MinBWs.end())
15642 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
15643 UserBWIt->second.first);
15644 if (ScalarTy != UserScalarTy) {
15645 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15646 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
15647 unsigned VecOpcode;
15648 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
15649 if (BWSz > SrcBWSz)
15650 VecOpcode = Instruction::Trunc;
15651 else
15652 VecOpcode =
15653 It->second.second ? Instruction::SExt : Instruction::ZExt;
15654 TTI::CastContextHint CCH = GetCastContextHint(VL0);
15655 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
15656 CostKind);
15657 }
15658 }
15659 }
15660 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
15661 ScalarCost, "Calculated costs for Tree"));
15662 return VecCost - ScalarCost;
15663 };
15664 // Calculate cost difference from vectorizing set of GEPs.
15665 // Negative value means vectorizing is profitable.
15666 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
15667 assert((E->State == TreeEntry::Vectorize ||
15668 E->State == TreeEntry::StridedVectorize ||
15669 E->State == TreeEntry::CompressVectorize) &&
15670 "Entry state expected to be Vectorize, StridedVectorize or "
15671 "MaskedLoadCompressVectorize here.");
15672 InstructionCost ScalarCost = 0;
15673 InstructionCost VecCost = 0;
15674 std::tie(ScalarCost, VecCost) = getGEPCosts(
15675 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
15676 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
15677 "Calculated GEPs cost for Tree"));
15678
15679 return VecCost - ScalarCost;
15680 };
15681
15682 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
15683 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
15684 if (MinMaxID == Intrinsic::not_intrinsic)
15685 return InstructionCost::getInvalid();
15686 Type *CanonicalType = Ty;
15687 if (CanonicalType->isPtrOrPtrVectorTy())
15688 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
15689 CanonicalType->getContext(),
15690 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
15691
15692 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15693 {CanonicalType, CanonicalType});
15695 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15696 // If the selects are the only uses of the compares, they will be
15697 // dead and we can adjust the cost by removing their cost.
15698 if (VI && SelectOnly) {
15699 assert((!Ty->isVectorTy() || SLPReVec) &&
15700 "Expected only for scalar type.");
15701 auto *CI = cast<CmpInst>(VI->getOperand(0));
15702 IntrinsicCost -= TTI->getCmpSelInstrCost(
15703 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
15704 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
15705 {TTI::OK_AnyValue, TTI::OP_None}, CI);
15706 }
15707 return IntrinsicCost;
15708 };
15709 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
15710 Instruction *VI) {
15711 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
15712 return Cost;
15713 };
15714 switch (ShuffleOrOp) {
15715 case Instruction::PHI: {
15716 // Count reused scalars.
15717 InstructionCost ScalarCost = 0;
15718 SmallPtrSet<const TreeEntry *, 4> CountedOps;
15719 for (Value *V : UniqueValues) {
15720 auto *PHI = dyn_cast<PHINode>(V);
15721 if (!PHI)
15722 continue;
15723
15724 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
15725 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
15726 Value *Op = PHI->getIncomingValue(I);
15727 Operands[I] = Op;
15728 }
15729 if (const TreeEntry *OpTE =
15730 getSameValuesTreeEntry(Operands.front(), Operands))
15731 if (CountedOps.insert(OpTE).second &&
15732 !OpTE->ReuseShuffleIndices.empty())
15733 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
15734 OpTE->Scalars.size());
15735 }
15736
15737 return CommonCost - ScalarCost;
15738 }
15739 case Instruction::ExtractValue:
15740 case Instruction::ExtractElement: {
15741 APInt DemandedElts;
15742 VectorType *SrcVecTy = nullptr;
15743 auto GetScalarCost = [&](unsigned Idx) {
15744 if (isa<PoisonValue>(UniqueValues[Idx]))
15746
15747 auto *I = cast<Instruction>(UniqueValues[Idx]);
15748 if (!SrcVecTy) {
15749 if (ShuffleOrOp == Instruction::ExtractElement) {
15750 auto *EE = cast<ExtractElementInst>(I);
15751 SrcVecTy = EE->getVectorOperandType();
15752 } else {
15753 auto *EV = cast<ExtractValueInst>(I);
15754 Type *AggregateTy = EV->getAggregateOperand()->getType();
15755 unsigned NumElts;
15756 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
15757 NumElts = ATy->getNumElements();
15758 else
15759 NumElts = AggregateTy->getStructNumElements();
15760 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
15761 }
15762 }
15763 if (I->hasOneUse()) {
15764 Instruction *Ext = I->user_back();
15765 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
15767 // Use getExtractWithExtendCost() to calculate the cost of
15768 // extractelement/ext pair.
15769 InstructionCost Cost = TTI->getExtractWithExtendCost(
15770 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
15771 CostKind);
15772 // Subtract the cost of s|zext which is subtracted separately.
15773 Cost -= TTI->getCastInstrCost(
15774 Ext->getOpcode(), Ext->getType(), I->getType(),
15776 return Cost;
15777 }
15778 }
15779 if (DemandedElts.isZero())
15780 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
15781 DemandedElts.setBit(*getExtractIndex(I));
15783 };
15784 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15785 return CommonCost - (DemandedElts.isZero()
15787 : TTI.getScalarizationOverhead(
15788 SrcVecTy, DemandedElts, /*Insert=*/false,
15789 /*Extract=*/true, CostKind));
15790 };
15791 return GetCostDiff(GetScalarCost, GetVectorCost);
15792 }
15793 case Instruction::InsertElement: {
15794 assert(E->ReuseShuffleIndices.empty() &&
15795 "Unique insertelements only are expected.");
15796 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
15797 unsigned const NumElts = SrcVecTy->getNumElements();
15798 unsigned const NumScalars = VL.size();
15799
15800 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
15801
15802 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15803 unsigned OffsetBeg = *getElementIndex(VL.front());
15804 unsigned OffsetEnd = OffsetBeg;
15805 InsertMask[OffsetBeg] = 0;
15806 for (auto [I, V] : enumerate(VL.drop_front())) {
15807 unsigned Idx = *getElementIndex(V);
15808 if (OffsetBeg > Idx)
15809 OffsetBeg = Idx;
15810 else if (OffsetEnd < Idx)
15811 OffsetEnd = Idx;
15812 InsertMask[Idx] = I + 1;
15813 }
15814 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
15815 if (NumOfParts > 0 && NumOfParts < NumElts)
15816 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15817 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15818 VecScalarsSz;
15819 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15820 unsigned InsertVecSz = std::min<unsigned>(
15821 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
15822 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15823 bool IsWholeSubvector =
15824 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15825 // Check if we can safely insert a subvector. If it is not possible, just
15826 // generate a whole-sized vector and shuffle the source vector and the new
15827 // subvector.
15828 if (OffsetBeg + InsertVecSz > VecSz) {
15829 // Align OffsetBeg to generate correct mask.
15830 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
15831 InsertVecSz = VecSz;
15832 }
15833
15834 APInt DemandedElts = APInt::getZero(NumElts);
15835 // TODO: Add support for Instruction::InsertValue.
15836 SmallVector<int> Mask;
15837 if (!E->ReorderIndices.empty()) {
15838 inversePermutation(E->ReorderIndices, Mask);
15839 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
15840 } else {
15841 Mask.assign(VecSz, PoisonMaskElem);
15842 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
15843 }
15844 bool IsIdentity = true;
15845 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
15846 Mask.swap(PrevMask);
15847 for (unsigned I = 0; I < NumScalars; ++I) {
15848 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
15849 DemandedElts.setBit(InsertIdx);
15850 IsIdentity &= InsertIdx - OffsetBeg == I;
15851 Mask[InsertIdx - OffsetBeg] = I;
15852 }
15853 assert(Offset < NumElts && "Failed to find vector index offset");
15854
15856 Cost -=
15857 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
15858 /*Insert*/ true, /*Extract*/ false, CostKind);
15859
15860 // First cost - resize to actual vector size if not identity shuffle or
15861 // need to shift the vector.
15862 // Do not calculate the cost if the actual size is the register size and
15863 // we can merge this shuffle with the following SK_Select.
15864 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
15865 if (!IsIdentity)
15867 InsertVecTy, Mask);
15868 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15869 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15870 }));
15871 // Second cost - permutation with subvector, if some elements are from the
15872 // initial vector or inserting a subvector.
15873 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
15874 // subvector of ActualVecTy.
15875 SmallBitVector InMask =
15876 isUndefVector(FirstInsert->getOperand(0),
15877 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15878 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
15879 if (InsertVecSz != VecSz) {
15880 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
15881 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
15882 CostKind, OffsetBeg - Offset, InsertVecTy);
15883 } else {
15884 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
15885 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
15886 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
15887 I <= End; ++I)
15888 if (Mask[I] != PoisonMaskElem)
15889 Mask[I] = I + VecSz;
15890 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
15891 Mask[I] =
15892 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
15893 Cost +=
15894 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
15895 }
15896 }
15897 return Cost;
15898 }
15899 case Instruction::ZExt:
15900 case Instruction::SExt:
15901 case Instruction::FPToUI:
15902 case Instruction::FPToSI:
15903 case Instruction::FPExt:
15904 case Instruction::PtrToInt:
15905 case Instruction::IntToPtr:
15906 case Instruction::SIToFP:
15907 case Instruction::UIToFP:
15908 case Instruction::Trunc:
15909 case Instruction::FPTrunc:
15910 case Instruction::BitCast: {
15911 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15912 Type *SrcScalarTy = VL0->getOperand(0)->getType();
15913 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
15914 unsigned Opcode = ShuffleOrOp;
15915 unsigned VecOpcode = Opcode;
15916 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15917 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15918 // Check if the values are candidates to demote.
15919 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
15920 if (SrcIt != MinBWs.end()) {
15921 SrcBWSz = SrcIt->second.first;
15922 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
15923 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
15924 SrcVecTy =
15925 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
15926 }
15927 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15928 if (BWSz == SrcBWSz) {
15929 VecOpcode = Instruction::BitCast;
15930 } else if (BWSz < SrcBWSz) {
15931 VecOpcode = Instruction::Trunc;
15932 } else if (It != MinBWs.end()) {
15933 assert(BWSz > SrcBWSz && "Invalid cast!");
15934 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15935 } else if (SrcIt != MinBWs.end()) {
15936 assert(BWSz > SrcBWSz && "Invalid cast!");
15937 VecOpcode =
15938 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15939 }
15940 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15941 !SrcIt->second.second) {
15942 VecOpcode = Instruction::UIToFP;
15943 }
15944 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
15945 assert(Idx == 0 && "Expected 0 index only");
15946 return TTI->getCastInstrCost(Opcode, VL0->getType(),
15947 VL0->getOperand(0)->getType(),
15949 };
15950 auto GetVectorCost = [=](InstructionCost CommonCost) {
15951 // Do not count cost here if minimum bitwidth is in effect and it is just
15952 // a bitcast (here it is just a noop).
15953 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15954 return CommonCost;
15955 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
15956 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
15957
15958 bool IsArithmeticExtendedReduction =
15959 E->Idx == 0 && UserIgnoreList &&
15960 all_of(*UserIgnoreList, [](Value *V) {
15961 auto *I = cast<Instruction>(V);
15962 return is_contained({Instruction::Add, Instruction::FAdd,
15963 Instruction::Mul, Instruction::FMul,
15964 Instruction::And, Instruction::Or,
15965 Instruction::Xor},
15966 I->getOpcode());
15967 });
15968 if (IsArithmeticExtendedReduction &&
15969 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15970 return CommonCost;
15971 return CommonCost +
15972 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
15973 VecOpcode == Opcode ? VI : nullptr);
15974 };
15975 return GetCostDiff(GetScalarCost, GetVectorCost);
15976 }
15977 case Instruction::FCmp:
15978 case Instruction::ICmp:
15979 case Instruction::Select: {
15980 CmpPredicate VecPred, SwappedVecPred;
15981 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
15982 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
15983 match(VL0, MatchCmp))
15984 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
15985 else
15986 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
15989 auto GetScalarCost = [&](unsigned Idx) {
15990 if (isa<PoisonValue>(UniqueValues[Idx]))
15992
15993 if (!isa<SelectInst>(UniqueValues[Idx]))
15994 return TTI->getInstructionCost(cast<Instruction>(UniqueValues[Idx]),
15995 CostKind);
15996
15997 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15998 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
16001 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
16002 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
16003 !match(VI, MatchCmp)) ||
16004 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
16005 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
16006 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
16009
16010 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
16011 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
16012 CostKind, getOperandInfo(VI->getOperand(0)),
16013 getOperandInfo(VI->getOperand(1)), VI);
16014 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
16015 if (IntrinsicCost.isValid())
16016 ScalarCost = IntrinsicCost;
16017
16018 return ScalarCost;
16019 };
16020 auto GetVectorCost = [&](InstructionCost CommonCost) {
16021 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
16022
16023 InstructionCost VecCost =
16024 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
16025 CostKind, getOperandInfo(E->getOperand(0)),
16026 getOperandInfo(E->getOperand(1)), VL0);
16027 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
16028 auto *CondType =
16029 getWidenedType(SI->getCondition()->getType(), VL.size());
16030 unsigned CondNumElements = CondType->getNumElements();
16031 unsigned VecTyNumElements = getNumElements(VecTy);
16032 assert(VecTyNumElements >= CondNumElements &&
16033 VecTyNumElements % CondNumElements == 0 &&
16034 "Cannot vectorize Instruction::Select");
16035 if (CondNumElements != VecTyNumElements) {
16036 // When the return type is i1 but the source is fixed vector type, we
16037 // need to duplicate the condition value.
16038 VecCost += ::getShuffleCost(
16039 *TTI, TTI::SK_PermuteSingleSrc, CondType,
16040 createReplicatedMask(VecTyNumElements / CondNumElements,
16041 CondNumElements));
16042 }
16043 }
16044 return VecCost + CommonCost;
16045 };
16046 return GetCostDiff(GetScalarCost, GetVectorCost);
16047 }
16048 case TreeEntry::MinMax: {
16049 auto GetScalarCost = [&](unsigned Idx) {
16050 return GetMinMaxCost(OrigScalarTy);
16051 };
16052 auto GetVectorCost = [&](InstructionCost CommonCost) {
16053 InstructionCost VecCost = GetMinMaxCost(VecTy);
16054 return VecCost + CommonCost;
16055 };
16056 return GetCostDiff(GetScalarCost, GetVectorCost);
16057 }
16058 case TreeEntry::FMulAdd: {
16059 auto GetScalarCost = [&](unsigned Idx) {
16060 if (isa<PoisonValue>(UniqueValues[Idx]))
16062 return GetFMulAddCost(E->getOperations(),
16063 cast<Instruction>(UniqueValues[Idx]));
16064 };
16065 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16066 FastMathFlags FMF;
16067 FMF.set();
16068 for (Value *V : E->Scalars) {
16069 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
16070 FMF &= FPCI->getFastMathFlags();
16071 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
16072 FMF &= FPCIOp->getFastMathFlags();
16073 }
16074 }
16075 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
16076 {VecTy, VecTy, VecTy}, FMF);
16077 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
16078 return VecCost + CommonCost;
16079 };
16080 return GetCostDiff(GetScalarCost, GetVectorCost);
16081 }
16082 case TreeEntry::ReducedBitcast:
16083 case TreeEntry::ReducedBitcastBSwap: {
16084 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
16085 if (isa<PoisonValue>(UniqueValues[Idx]))
16087 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
16088 if (!Shl)
16090 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
16091 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
16092 if (!ZExt)
16093 return ScalarCost;
16094 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
16095 return ScalarCost;
16096 };
16097 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16098 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
16099 TTI::CastContextHint CastCtx =
16100 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
16101 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
16102 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
16103 InstructionCost BitcastCost = TTI.getCastInstrCost(
16104 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
16105 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
16106 auto *SrcType = IntegerType::getIntNTy(
16107 ScalarTy->getContext(),
16108 DL->getTypeSizeInBits(SrcScalarTy) * EntryVF);
16109 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
16111 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
16112 BitcastCost += IntrinsicCost;
16113 if (SrcType != ScalarTy) {
16114 BitcastCost +=
16115 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
16117 }
16118 }
16119 return BitcastCost + CommonCost;
16120 };
16121 return GetCostDiff(GetScalarCost, GetVectorCost);
16122 }
16123 case TreeEntry::ReducedBitcastLoads:
16124 case TreeEntry::ReducedBitcastBSwapLoads: {
16125 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
16126 if (isa<PoisonValue>(UniqueValues[Idx]))
16128 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
16129 if (!Shl)
16131 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
16132 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
16133 if (!ZExt)
16134 return ScalarCost;
16135 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
16136 auto *Load = dyn_cast<Instruction>(ZExt->getOperand(0));
16137 if (!Load)
16138 return ScalarCost;
16139 ScalarCost += TTI.getInstructionCost(Load, CostKind);
16140 return ScalarCost;
16141 };
16142 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16143 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
16144 const TreeEntry *LoadTE = getOperandEntry(LhsTE, /*Idx=*/0);
16145 auto *LI0 = cast<LoadInst>(LoadTE->getMainOp());
16146 auto *SrcType = IntegerType::getIntNTy(
16147 ScalarTy->getContext(),
16148 DL->getTypeSizeInBits(LI0->getType()) * EntryVF);
16149 InstructionCost LoadCost =
16150 TTI.getMemoryOpCost(Instruction::Load, SrcType, LI0->getAlign(),
16151 LI0->getPointerAddressSpace(), CostKind);
16152 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
16153 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
16155 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
16156 LoadCost += IntrinsicCost;
16157 if (SrcType != ScalarTy) {
16158 LoadCost +=
16159 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
16161 }
16162 }
16163 return LoadCost + CommonCost;
16164 };
16165 return GetCostDiff(GetScalarCost, GetVectorCost);
16166 }
16167 case TreeEntry::ReducedCmpBitcast: {
16168 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
16169 if (isa<PoisonValue>(UniqueValues[Idx]))
16171 auto *Sel = dyn_cast<Instruction>(UniqueValues[Idx]);
16172 if (!Sel)
16174 InstructionCost ScalarCost = TTI.getInstructionCost(Sel, CostKind);
16175 return ScalarCost;
16176 };
16177 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16178 Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
16179 auto *DstTy =
16180 IntegerType::getIntNTy(ScalarTy->getContext(), E->getVectorFactor());
16181 InstructionCost BitcastCost =
16182 TTI.getCastInstrCost(Instruction::BitCast, DstTy, CmpTy,
16184 if (DstTy != ScalarTy) {
16185 BitcastCost +=
16186 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
16188 }
16189 return BitcastCost + CommonCost;
16190 };
16191 return GetCostDiff(GetScalarCost, GetVectorCost);
16192 }
16193 case Instruction::FNeg:
16194 case Instruction::Add:
16195 case Instruction::FAdd:
16196 case Instruction::Sub:
16197 case Instruction::FSub:
16198 case Instruction::Mul:
16199 case Instruction::FMul:
16200 case Instruction::UDiv:
16201 case Instruction::SDiv:
16202 case Instruction::FDiv:
16203 case Instruction::URem:
16204 case Instruction::SRem:
16205 case Instruction::FRem:
16206 case Instruction::Shl:
16207 case Instruction::LShr:
16208 case Instruction::AShr:
16209 case Instruction::And:
16210 case Instruction::Or:
16211 case Instruction::Xor: {
16212 auto GetScalarCost = [&](unsigned Idx) {
16213 if (isa<PoisonValue>(UniqueValues[Idx]))
16215
16216 // We cannot retrieve the operand from UniqueValues[Idx] because an
16217 // interchangeable instruction may be used. The order and the actual
16218 // operand might differ from what is retrieved from UniqueValues[Idx].
16219 unsigned Lane = UniqueIndexes[Idx];
16220 Value *Op1 = E->getOperand(0)[Lane];
16221 Value *Op2;
16222 SmallVector<const Value *, 2> Operands(1, Op1);
16223 if (isa<UnaryOperator>(UniqueValues[Idx])) {
16224 Op2 = Op1;
16225 } else {
16226 Op2 = E->getOperand(1)[Lane];
16227 Operands.push_back(Op2);
16228 }
16231 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
16232 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
16233 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
16234 I && (ShuffleOrOp == Instruction::FAdd ||
16235 ShuffleOrOp == Instruction::FSub)) {
16236 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
16237 if (IntrinsicCost.isValid())
16238 ScalarCost = IntrinsicCost;
16239 }
16240 return ScalarCost;
16241 };
16242 auto GetVectorCost = [=](InstructionCost CommonCost) {
16243 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
16244 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
16245 ArrayRef<Value *> Ops = E->getOperand(I);
16246 if (all_of(Ops, [&](Value *Op) {
16247 auto *CI = dyn_cast<ConstantInt>(Op);
16248 return CI && CI->getValue().countr_one() >= It->second.first;
16249 }))
16250 return CommonCost;
16251 }
16252 }
16253 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
16254 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
16255 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
16256 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
16257 Op2Info, {}, nullptr, TLI) +
16258 CommonCost;
16259 };
16260 return GetCostDiff(GetScalarCost, GetVectorCost);
16261 }
16262 case Instruction::GetElementPtr: {
16263 return CommonCost + GetGEPCostDiff(VL, VL0);
16264 }
16265 case Instruction::Load: {
16266 auto GetScalarCost = [&](unsigned Idx) {
16267 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
16268 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
16269 VI->getAlign(), VI->getPointerAddressSpace(),
16271 };
16272 auto *LI0 = cast<LoadInst>(VL0);
16273 auto GetVectorCost = [&](InstructionCost CommonCost) {
16274 InstructionCost VecLdCost;
16275 switch (E->State) {
16276 case TreeEntry::Vectorize:
16277 if (unsigned Factor = E->getInterleaveFactor()) {
16278 VecLdCost = TTI->getInterleavedMemoryOpCost(
16279 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
16280 LI0->getPointerAddressSpace(), CostKind);
16281
16282 } else {
16283 VecLdCost = TTI->getMemoryOpCost(
16284 Instruction::Load, VecTy, LI0->getAlign(),
16285 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
16286 }
16287 break;
16288 case TreeEntry::StridedVectorize: {
16289 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
16290 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
16291 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
16292 Align CommonAlignment =
16293 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
16294 VecLdCost = TTI->getMemIntrinsicInstrCost(
16295 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
16296 StridedLoadTy, LI0->getPointerOperand(),
16297 /*VariableMask=*/false, CommonAlignment),
16298 CostKind);
16299 if (StridedLoadTy != VecTy)
16300 VecLdCost +=
16301 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
16302 getCastContextHint(*E), CostKind);
16303
16304 break;
16305 }
16306 case TreeEntry::CompressVectorize: {
16307 bool IsMasked;
16308 unsigned InterleaveFactor;
16309 SmallVector<int> CompressMask;
16310 VectorType *LoadVecTy;
16311 SmallVector<Value *> Scalars(VL);
16312 if (!E->ReorderIndices.empty()) {
16313 SmallVector<int> Mask(E->ReorderIndices.begin(),
16314 E->ReorderIndices.end());
16315 reorderScalars(Scalars, Mask);
16316 }
16317 SmallVector<Value *> PointerOps(Scalars.size());
16318 for (auto [I, V] : enumerate(Scalars))
16319 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
16320 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
16321 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
16322 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
16323 CompressMask, LoadVecTy);
16324 assert(IsVectorized && "Failed to vectorize load");
16325 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
16326 InterleaveFactor, IsMasked);
16327 Align CommonAlignment = LI0->getAlign();
16328 if (InterleaveFactor) {
16329 VecLdCost = TTI->getInterleavedMemoryOpCost(
16330 Instruction::Load, LoadVecTy, InterleaveFactor, {},
16331 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
16332 } else if (IsMasked) {
16333 VecLdCost = TTI->getMemIntrinsicInstrCost(
16334 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
16335 CommonAlignment,
16336 LI0->getPointerAddressSpace()),
16337 CostKind);
16338 // TODO: include this cost into CommonCost.
16339 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16340 LoadVecTy, CompressMask, CostKind);
16341 } else {
16342 VecLdCost = TTI->getMemoryOpCost(
16343 Instruction::Load, LoadVecTy, CommonAlignment,
16344 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
16345 // TODO: include this cost into CommonCost.
16346 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16347 LoadVecTy, CompressMask, CostKind);
16348 }
16349 break;
16350 }
16351 case TreeEntry::ScatterVectorize: {
16352 Align CommonAlignment =
16353 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
16354 VecLdCost = TTI->getMemIntrinsicInstrCost(
16355 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
16356 LI0->getPointerOperand(),
16357 /*VariableMask=*/false, CommonAlignment),
16358 CostKind);
16359 break;
16360 }
16361 case TreeEntry::CombinedVectorize:
16362 case TreeEntry::SplitVectorize:
16363 case TreeEntry::NeedToGather:
16364 llvm_unreachable("Unexpected vectorization state.");
16365 }
16366 return VecLdCost + CommonCost;
16367 };
16368
16369 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
16370 // If this node generates masked gather load then it is not a terminal node.
16371 // Hence address operand cost is estimated separately.
16372 if (E->State == TreeEntry::ScatterVectorize)
16373 return Cost;
16374
16375 // Estimate cost of GEPs since this tree node is a terminator.
16376 SmallVector<Value *> PointerOps(VL.size());
16377 for (auto [I, V] : enumerate(VL))
16378 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
16379 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
16380 }
16381 case Instruction::Store: {
16382 bool IsReorder = !E->ReorderIndices.empty();
16383 auto GetScalarCost = [=](unsigned Idx) {
16384 auto *VI = cast<StoreInst>(VL[Idx]);
16385 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
16386 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
16387 VI->getAlign(), VI->getPointerAddressSpace(),
16388 CostKind, OpInfo, VI);
16389 };
16390 auto *BaseSI =
16391 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
16392 auto GetVectorCost = [=](InstructionCost CommonCost) {
16393 // We know that we can merge the stores. Calculate the cost.
16394 InstructionCost VecStCost;
16395 if (E->State == TreeEntry::StridedVectorize) {
16396 Align CommonAlignment =
16397 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
16398 VecStCost = TTI->getMemIntrinsicInstrCost(
16399 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
16400 VecTy, BaseSI->getPointerOperand(),
16401 /*VariableMask=*/false, CommonAlignment),
16402 CostKind);
16403 } else {
16404 assert(E->State == TreeEntry::Vectorize &&
16405 "Expected either strided or consecutive stores.");
16406 if (unsigned Factor = E->getInterleaveFactor()) {
16407 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
16408 "No reused shuffles expected");
16409 CommonCost = 0;
16410 VecStCost = TTI->getInterleavedMemoryOpCost(
16411 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
16412 BaseSI->getPointerAddressSpace(), CostKind);
16413 } else {
16414 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
16415 VecStCost = TTI->getMemoryOpCost(
16416 Instruction::Store, VecTy, BaseSI->getAlign(),
16417 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
16418 }
16419 }
16420 return VecStCost + CommonCost;
16421 };
16422 SmallVector<Value *> PointerOps(VL.size());
16423 for (auto [I, V] : enumerate(VL)) {
16424 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
16425 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
16426 }
16427
16428 return GetCostDiff(GetScalarCost, GetVectorCost) +
16429 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
16430 }
16431 case Instruction::Call: {
16432 auto GetScalarCost = [&](unsigned Idx) {
16433 auto *CI = cast<CallInst>(UniqueValues[Idx]);
16436 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
16437 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
16438 }
16439 return TTI->getCallInstrCost(CI->getCalledFunction(),
16441 CI->getFunctionType()->params(), CostKind);
16442 };
16443 auto GetVectorCost = [=](InstructionCost CommonCost) {
16444 auto *CI = cast<CallInst>(VL0);
16447 CI, ID, VecTy->getNumElements(),
16448 It != MinBWs.end() ? It->second.first : 0, TTI);
16449 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16450 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
16451 };
16452 return GetCostDiff(GetScalarCost, GetVectorCost);
16453 }
16454 case Instruction::ShuffleVector: {
16455 if (!SLPReVec || E->isAltShuffle())
16456 assert(E->isAltShuffle() &&
16457 ((Instruction::isBinaryOp(E->getOpcode()) &&
16458 Instruction::isBinaryOp(E->getAltOpcode())) ||
16459 (Instruction::isCast(E->getOpcode()) &&
16460 Instruction::isCast(E->getAltOpcode())) ||
16461 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16462 "Invalid Shuffle Vector Operand");
16463 // Try to find the previous shuffle node with the same operands and same
16464 // main/alternate ops.
16465 auto TryFindNodeWithEqualOperands = [=]() {
16466 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16467 if (TE.get() == E)
16468 break;
16469 if (TE->hasState() && TE->isAltShuffle() &&
16470 ((TE->getOpcode() == E->getOpcode() &&
16471 TE->getAltOpcode() == E->getAltOpcode()) ||
16472 (TE->getOpcode() == E->getAltOpcode() &&
16473 TE->getAltOpcode() == E->getOpcode())) &&
16474 TE->hasEqualOperands(*E))
16475 return true;
16476 }
16477 return false;
16478 };
16479 auto GetScalarCost = [&](unsigned Idx) {
16480 if (isa<PoisonValue>(UniqueValues[Idx]))
16482
16483 auto *VI = cast<Instruction>(UniqueValues[Idx]);
16484 assert(E->getMatchingMainOpOrAltOp(VI) &&
16485 "Unexpected main/alternate opcode");
16486 (void)E;
16487 return TTI->getInstructionCost(VI, CostKind);
16488 };
16489 // Need to clear CommonCost since the final shuffle cost is included into
16490 // vector cost.
16491 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
16492 // VecCost is equal to sum of the cost of creating 2 vectors
16493 // and the cost of creating shuffle.
16494 InstructionCost VecCost = 0;
16495 if (TryFindNodeWithEqualOperands()) {
16496 LLVM_DEBUG({
16497 dbgs() << "SLP: diamond match for alternate node found.\n";
16498 E->dump();
16499 });
16500 // No need to add new vector costs here since we're going to reuse
16501 // same main/alternate vector ops, just do different shuffling.
16502 } else if (Instruction::isBinaryOp(E->getOpcode())) {
16503 VecCost =
16504 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
16505 VecCost +=
16506 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
16507 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16508 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
16509 VecCost = TTIRef.getCmpSelInstrCost(
16510 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
16511 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
16512 VL0);
16513 VecCost += TTIRef.getCmpSelInstrCost(
16514 E->getOpcode(), VecTy, MaskTy,
16515 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
16516 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
16517 E->getAltOp());
16518 } else {
16519 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
16520 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
16521 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
16522 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
16523 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16524 unsigned SrcBWSz =
16525 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
16526 if (SrcIt != MinBWs.end()) {
16527 SrcBWSz = SrcIt->second.first;
16528 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
16529 SrcTy = getWidenedType(SrcSclTy, VL.size());
16530 }
16531 if (BWSz <= SrcBWSz) {
16532 if (BWSz < SrcBWSz)
16533 VecCost =
16534 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
16536 LLVM_DEBUG({
16537 dbgs()
16538 << "SLP: alternate extension, which should be truncated.\n";
16539 E->dump();
16540 });
16541 return VecCost;
16542 }
16543 }
16544 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
16546 VecCost +=
16547 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
16549 }
16550 SmallVector<int> Mask;
16551 E->buildAltOpShuffleMask(
16552 [&](Instruction *I) {
16553 assert(E->getMatchingMainOpOrAltOp(I) &&
16554 "Unexpected main/alternate opcode");
16555 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16556 *TLI);
16557 },
16558 Mask);
16560 FinalVecTy, Mask, CostKind);
16561 // Patterns like [fadd,fsub] can be combined into a single instruction
16562 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
16563 // need to take into account their order when looking for the most used
16564 // order.
16565 unsigned Opcode0 = E->getOpcode();
16566 unsigned Opcode1 = E->getAltOpcode();
16567 SmallBitVector OpcodeMask(
16568 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
16569 // If this pattern is supported by the target then we consider the
16570 // order.
16571 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
16572 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
16573 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
16574 return AltVecCost < VecCost ? AltVecCost : VecCost;
16575 }
16576 // TODO: Check the reverse order too.
16577 return VecCost;
16578 };
16579 if (SLPReVec && !E->isAltShuffle())
16580 return GetCostDiff(
16581 GetScalarCost, [&](InstructionCost) -> InstructionCost {
16582 // If a group uses mask in order, the shufflevector can be
16583 // eliminated by instcombine. Then the cost is 0.
16585 "Not supported shufflevector usage.");
16586 auto *SV = cast<ShuffleVectorInst>(VL.front());
16587 unsigned SVNumElements =
16588 cast<FixedVectorType>(SV->getOperand(0)->getType())
16589 ->getNumElements();
16590 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
16591 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
16592 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
16593 int NextIndex = 0;
16594 if (!all_of(Group, [&](Value *V) {
16596 "Not supported shufflevector usage.");
16597 auto *SV = cast<ShuffleVectorInst>(V);
16598 int Index;
16599 [[maybe_unused]] bool IsExtractSubvectorMask =
16600 SV->isExtractSubvectorMask(Index);
16601 assert(IsExtractSubvectorMask &&
16602 "Not supported shufflevector usage.");
16603 if (NextIndex != Index)
16604 return false;
16605 NextIndex += SV->getShuffleMask().size();
16606 return true;
16607 }))
16608 return ::getShuffleCost(
16610 calculateShufflevectorMask(E->Scalars));
16611 }
16612 return TTI::TCC_Free;
16613 });
16614 return GetCostDiff(GetScalarCost, GetVectorCost);
16615 }
16616 case Instruction::Freeze:
16617 return CommonCost;
16618 default:
16619 llvm_unreachable("Unknown instruction");
16620 }
16621}
16622
16623bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
16624 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
16625 << VectorizableTree.size() << " is fully vectorizable .\n");
16626
16627 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
16628 SmallVector<int> Mask;
16629 return TE->isGather() &&
16630 !any_of(TE->Scalars,
16631 [this](Value *V) { return EphValues.contains(V); }) &&
16632 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
16633 TE->Scalars.size() < Limit ||
16634 (((TE->hasState() &&
16635 TE->getOpcode() == Instruction::ExtractElement) ||
16637 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
16638 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
16639 !TE->isAltShuffle()) ||
16640 any_of(TE->Scalars, IsaPred<LoadInst>));
16641 };
16642
16643 // We only handle trees of heights 1 and 2.
16644 if (VectorizableTree.size() == 1 &&
16645 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
16646 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
16647 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
16648 (ForReduction &&
16649 AreVectorizableGathers(VectorizableTree[0].get(),
16650 VectorizableTree[0]->Scalars.size()) &&
16651 VectorizableTree[0]->getVectorFactor() > 2)))
16652 return true;
16653
16654 if (VectorizableTree.size() != 2)
16655 return false;
16656
16657 // Handle splat and all-constants stores. Also try to vectorize tiny trees
16658 // with the second gather nodes if they have less scalar operands rather than
16659 // the initial tree element (may be profitable to shuffle the second gather)
16660 // or they are extractelements, which form shuffle.
16661 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
16662 AreVectorizableGathers(VectorizableTree[1].get(),
16663 VectorizableTree[0]->Scalars.size()))
16664 return true;
16665
16666 // Gathering cost would be too much for tiny trees.
16667 if (VectorizableTree[0]->isGather() ||
16668 (VectorizableTree[1]->isGather() &&
16669 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
16670 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
16671 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
16672 return false;
16673
16674 return true;
16675}
16676
16677bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
16678 if (!DebugCounter::shouldExecute(VectorizedGraphs))
16679 return true;
16680
16681 // Graph is empty - do nothing.
16682 if (VectorizableTree.empty()) {
16683 assert(ExternalUses.empty() && "We shouldn't have any external users");
16684
16685 return true;
16686 }
16687
16688 if (VectorizableTree.size() == 1 && !ForReduction &&
16689 VectorizableTree.front()->isGather() &&
16690 VectorizableTree.front()->hasState() &&
16691 VectorizableTree.front()->getOpcode() == Instruction::ExtractElement)
16692 return true;
16693 // No need to vectorize inserts of gathered values.
16694 if (VectorizableTree.size() == 2 &&
16695 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
16696 VectorizableTree[1]->isGather() &&
16697 (VectorizableTree[1]->getVectorFactor() <= 2 ||
16698 !(isSplat(VectorizableTree[1]->Scalars) ||
16699 allConstant(VectorizableTree[1]->Scalars))))
16700 return true;
16701
16702 // The tree with only 3 nodes, where 2 last are gathers/buildvectors, not
16703 // profitable for vectorization.
16704 constexpr int Limit = 4;
16705 if (VectorizableTree.size() == 3 && SLPCostThreshold == 0 &&
16706 (!ForReduction || VectorizableTree.front()->getVectorFactor() <= 2) &&
16707 all_of(ArrayRef(VectorizableTree).drop_front(),
16708 [&](const std::unique_ptr<TreeEntry> &TE) {
16709 return TE->isGather() && TE->getVectorFactor() <= Limit &&
16710 !all_of(
16711 TE->Scalars,
16713 }))
16714 return true;
16715
16716 // If the graph includes only PHI nodes and gathers, it is defnitely not
16717 // profitable for the vectorization, we can skip it, if the cost threshold is
16718 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
16719 // gathers/buildvectors.
16720 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
16721 !VectorizableTree.empty() &&
16722 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16723 return (TE->isGather() &&
16724 (!TE->hasState() ||
16725 TE->getOpcode() != Instruction::ExtractElement) &&
16726 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
16727 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
16728 }))
16729 return true;
16730
16731 // Do not vectorize small tree of phis only, if all vector phis are also
16732 // gathered.
16733 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16734 VectorizableTree.size() <= Limit &&
16735 all_of(VectorizableTree,
16736 [&](const std::unique_ptr<TreeEntry> &TE) {
16737 return (TE->isGather() &&
16738 (!TE->hasState() ||
16739 TE->getOpcode() != Instruction::ExtractElement) &&
16740 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
16741 Limit) ||
16742 (TE->hasState() &&
16743 (TE->getOpcode() == Instruction::InsertElement ||
16744 (TE->getOpcode() == Instruction::PHI &&
16745 all_of(TE->Scalars, [&](Value *V) {
16746 return isa<PoisonValue>(V) || MustGather.contains(V);
16747 }))));
16748 }) &&
16749 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16750 return TE->State == TreeEntry::Vectorize &&
16751 TE->getOpcode() == Instruction::PHI;
16752 }))
16753 return true;
16754
16755 // PHI nodes only and gathers cannot be vectorized, skip.
16756 constexpr unsigned LargeTree = 20;
16757 bool HasSingleLoad = false;
16758 if (!ForReduction && SLPCostThreshold >= 0 &&
16759 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16760 bool PrevLoad = HasSingleLoad;
16761 HasSingleLoad |=
16762 TE->hasState() && !TE->isGather() &&
16763 (TE->getOpcode() == Instruction::Load ||
16764 TE->hasCopyableElements()) &&
16765 (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
16766 return (TE->hasState() &&
16767 (TE->getOpcode() == Instruction::PHI ||
16768 (VectorizableTree.size() >= LargeTree &&
16769 (TE->getOpcode() == Instruction::Store ||
16770 (TE->getOpcode() == Instruction::Load && !PrevLoad)) &&
16771 TE->getVectorFactor() <= Limit))) ||
16772 (TE->isGather() &&
16773 (!TE->hasState() ||
16774 TE->getOpcode() != Instruction::ExtractElement));
16775 }))
16776 return true;
16777
16778 // Single non-phi vector node - skip the tree.
16779 bool VectorNodeFound = false;
16780 bool AnyNonConst = false;
16781 if (!ForReduction && SLPCostThreshold >= 0 && VectorizableTree.size() >= 5 &&
16782 VectorizableTree.front()->getVectorFactor() <= 2 &&
16783 VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() &&
16784 all_of(VectorizableTree,
16785 [&](const std::unique_ptr<TreeEntry> &TE) {
16786 if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
16787 if (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
16788 !TE->ReorderIndices.empty()))
16789 return true;
16790 bool PrevVectorNodeFound = VectorNodeFound;
16791 VectorNodeFound = true;
16792 return !PrevVectorNodeFound;
16793 }
16794 AnyNonConst |= !allConstant(TE->Scalars);
16795 return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
16796 }) &&
16797 AnyNonConst)
16798 return true;
16799
16800 // If the tree contains only phis, buildvectors, split nodes and
16801 // small nodes with reuses, we can skip it.
16802 SmallVector<const TreeEntry *> StoreLoadNodes;
16803 unsigned NumGathers = 0;
16804 constexpr int LimitTreeSize = 36;
16805 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
16806 all_of(VectorizableTree,
16807 [&](const std::unique_ptr<TreeEntry> &TE) {
16808 if (!TE->isGather() && TE->hasState() &&
16809 (TE->getOpcode() == Instruction::Load ||
16810 TE->getOpcode() == Instruction::Store)) {
16811 StoreLoadNodes.push_back(TE.get());
16812 return true;
16813 }
16814 if (TE->isGather())
16815 ++NumGathers;
16816 return TE->State == TreeEntry::SplitVectorize ||
16817 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
16818 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
16819 VectorizableTree.size() > LimitTreeSize) ||
16820 (TE->isGather() &&
16821 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
16822 (TE->hasState() &&
16823 (TE->getOpcode() == Instruction::PHI ||
16824 (TE->hasCopyableElements() &&
16825 static_cast<unsigned>(count_if(
16826 TE->Scalars, IsaPred<PHINode, Constant>)) >=
16827 TE->Scalars.size() / 2) ||
16828 ((!TE->ReuseShuffleIndices.empty() ||
16829 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
16830 TE->Scalars.size() == 2)));
16831 }) &&
16832 (StoreLoadNodes.empty() ||
16833 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
16834 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
16835 return TE->getOpcode() == Instruction::Store ||
16836 all_of(TE->Scalars, [&](Value *V) {
16837 return !isa<LoadInst>(V) ||
16838 areAllUsersVectorized(cast<Instruction>(V));
16839 });
16840 })))))
16841 return true;
16842
16843 // If the tree contains only buildvector, 2 non-buildvectors (with root user
16844 // tree node) and other buildvectors, we can skip it.
16845 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16846 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
16847 VectorizableTree.size() >= Limit &&
16848 count_if(ArrayRef(VectorizableTree).drop_front(),
16849 [&](const std::unique_ptr<TreeEntry> &TE) {
16850 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16851 TE->UserTreeIndex.UserTE->Idx == 0;
16852 }) == 2)
16853 return true;
16854
16855 // If the tree contains only vectorization of the phi node from the
16856 // buildvector - skip it.
16857 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16858 VectorizableTree.size() > 2 &&
16859 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16860 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16861 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16862 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16863 all_of(
16864 ArrayRef(VectorizableTree).drop_front(2),
16865 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
16866 return true;
16867
16868 // We can vectorize the tree if its size is greater than or equal to the
16869 // minimum size specified by the MinTreeSize command line option.
16870 if (VectorizableTree.size() >= MinTreeSize)
16871 return false;
16872
16873 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
16874 // can vectorize it if we can prove it fully vectorizable.
16875 if (isFullyVectorizableTinyTree(ForReduction))
16876 return false;
16877
16878 // Check if any of the gather node forms an insertelement buildvector
16879 // somewhere.
16880 bool IsAllowedSingleBVNode =
16881 VectorizableTree.size() > 1 ||
16882 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16883 !VectorizableTree.front()->isAltShuffle() &&
16884 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16885 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16886 allSameBlock(VectorizableTree.front()->Scalars));
16887 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
16888 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
16889 return isa<ExtractElementInst, Constant>(V) ||
16890 (IsAllowedSingleBVNode &&
16891 !V->hasNUsesOrMore(UsesLimit) &&
16892 any_of(V->users(), IsaPred<InsertElementInst>));
16893 });
16894 }))
16895 return false;
16896
16897 if (VectorizableTree.back()->isGather() &&
16898 VectorizableTree.back()->hasState() &&
16899 VectorizableTree.back()->isAltShuffle() &&
16900 VectorizableTree.back()->getVectorFactor() > 2 &&
16901 allSameBlock(VectorizableTree.back()->Scalars) &&
16902 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16903 TTI->getScalarizationOverhead(
16904 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16905 VectorizableTree.back()->getVectorFactor()),
16906 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
16907 /*Insert=*/true, /*Extract=*/false,
16909 return false;
16910
16911 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
16912 // vectorizable.
16913 return true;
16914}
16915
16918 constexpr unsigned SmallTree = 3;
16919 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16920 getCanonicalGraphSize() <= SmallTree &&
16921 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
16922 [](const std::unique_ptr<TreeEntry> &TE) {
16923 return TE->isGather() && TE->hasState() &&
16924 TE->getOpcode() == Instruction::Load &&
16925 !allSameBlock(TE->Scalars);
16926 }) == 1)
16927 return true;
16928 return false;
16929 }
16930 bool Res = false;
16931 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
16932 TreeEntry &E = *VectorizableTree[Idx];
16933 if (E.State == TreeEntry::SplitVectorize)
16934 return false;
16935 if (!E.isGather())
16936 continue;
16937 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16938 (!E.hasState() &&
16940 (isa<ExtractElementInst>(E.Scalars.front()) &&
16941 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
16942 return false;
16943 if (isSplat(E.Scalars) || allConstant(E.Scalars))
16944 continue;
16945 Res = true;
16946 }
16947 return Res;
16948}
16949
16951 // Walk from the bottom of the tree to the top, tracking which values are
16952 // live. When we see a call instruction that is not part of our tree,
16953 // query TTI to see if there is a cost to keeping values live over it
16954 // (for example, if spills and fills are required).
16955
16956 const TreeEntry *Root = VectorizableTree.front().get();
16957 if (Root->isGather())
16958 return 0;
16959
16960 InstructionCost Cost = 0;
16962 EntriesToOperands;
16963 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
16964 SmallPtrSet<const Instruction *, 8> LastInstructions;
16965 SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
16966 for (const auto &TEPtr : VectorizableTree) {
16967 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
16968 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
16969 TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
16970 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
16971 TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
16972 ScalarOrPseudoEntries.insert(TEPtr.get());
16973 continue;
16974 }
16975 if (!TEPtr->isGather()) {
16976 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16977 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
16978 LastInstructions.insert(LastInst);
16979 }
16980 if (TEPtr->UserTreeIndex)
16981 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16982 }
16983
16984 auto NoCallIntrinsic = [this](const Instruction *I) {
16985 const auto *II = dyn_cast<IntrinsicInst>(I);
16986 if (!II)
16987 return false;
16988 if (II->isAssumeLikeIntrinsic())
16989 return true;
16990 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
16991 InstructionCost IntrCost =
16992 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
16993 InstructionCost CallCost = TTI->getCallInstrCost(
16994 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
16995 return IntrCost < CallCost;
16996 };
16997
16998 // Maps last instruction in the entry to the last instruction for the one of
16999 // operand entries and the flag. If the flag is true, there are no calls in
17000 // between these instructions.
17002 CheckedInstructions;
17003 unsigned Budget = 0;
17004 const unsigned BudgetLimit =
17005 ScheduleRegionSizeBudget / VectorizableTree.size();
17006 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
17007 const Instruction *Last) {
17008 assert(First->getParent() == Last->getParent() &&
17009 "Expected instructions in same block.");
17010 if (auto It = CheckedInstructions.find(Last);
17011 It != CheckedInstructions.end()) {
17012 const Instruction *Checked = It->second.getPointer();
17013 if (Checked == First || Checked->comesBefore(First))
17014 return It->second.getInt() != 0;
17015 Last = Checked;
17016 } else if (Last == First || Last->comesBefore(First)) {
17017 return true;
17018 }
17020 ++First->getIterator().getReverse(),
17021 PrevInstIt =
17022 Last->getIterator().getReverse();
17023 SmallVector<const Instruction *> LastInstsInRange;
17024 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
17025 // Debug information does not impact spill cost.
17026 // Vectorized calls, represented as vector intrinsics, do not impact spill
17027 // cost.
17028 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
17029 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
17030 for (const Instruction *LastInst : LastInstsInRange)
17031 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
17032 return false;
17033 }
17034 if (LastInstructions.contains(&*PrevInstIt))
17035 LastInstsInRange.push_back(&*PrevInstIt);
17036
17037 ++PrevInstIt;
17038 ++Budget;
17039 }
17040 for (const Instruction *LastInst : LastInstsInRange)
17041 CheckedInstructions.try_emplace(
17042 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
17043 Budget <= BudgetLimit ? 1 : 0);
17044 return Budget <= BudgetLimit;
17045 };
17046 auto AddCosts = [&](const TreeEntry *Op) {
17047 if (ScalarOrPseudoEntries.contains(Op))
17048 return;
17049 Type *ScalarTy = Op->Scalars.front()->getType();
17050 auto It = MinBWs.find(Op);
17051 if (It != MinBWs.end())
17052 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
17053 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
17054 unsigned Scale = getScaleToLoopIterations(*Op);
17055 InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
17056 KeepLiveCost *= Scale;
17057 Cost += KeepLiveCost;
17058 if (ScalarTy->isVectorTy()) {
17059 // Handle revec dead vector instructions.
17060 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
17061 Scale;
17062 }
17063 };
17064 // Memoize the relationship between blocks, i.e. if there is (at least one)
17065 // non-vectorized call between the blocks. This allows to skip the analysis of
17066 // the same block paths multiple times.
17068 ParentOpParentToPreds;
17069 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
17070 BasicBlock *OpParent) {
17071 auto Key = std::make_pair(Root, OpParent);
17072 if (auto It = ParentOpParentToPreds.find(Key);
17073 It != ParentOpParentToPreds.end())
17074 return It->second;
17076 if (Pred)
17077 Worklist.push_back(Pred);
17078 else
17079 Worklist.append(pred_begin(Root), pred_end(Root));
17082 ParentsPairsToAdd;
17083 bool Res = false;
17085 for (const auto &KeyPair : ParentsPairsToAdd) {
17086 assert(!ParentOpParentToPreds.contains(KeyPair) &&
17087 "Should not have been added before.");
17088 ParentOpParentToPreds.try_emplace(KeyPair, Res);
17089 }
17090 });
17091 while (!Worklist.empty()) {
17092 BasicBlock *BB = Worklist.pop_back_val();
17093 if (BB == OpParent || !Visited.insert(BB).second)
17094 continue;
17095 auto Pair = std::make_pair(BB, OpParent);
17096 if (auto It = ParentOpParentToPreds.find(Pair);
17097 It != ParentOpParentToPreds.end()) {
17098 Res = It->second;
17099 return Res;
17100 }
17101 ParentsPairsToAdd.insert(Pair);
17102 unsigned BlockSize = BB->size();
17103 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
17104 return Res;
17105 Budget += BlockSize;
17106 if (Budget > BudgetLimit)
17107 return Res;
17108 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
17109 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
17110 BB->getTerminator()))
17111 return Res;
17112 Worklist.append(pred_begin(BB), pred_end(BB));
17113 }
17114 Res = true;
17115 return Res;
17116 };
17117 SmallVector<const TreeEntry *> LiveEntries(1, Root);
17118 auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
17119 assert(ScalarOrPseudoEntries.contains(E) &&
17120 "Expected scalar or pseudo entry.");
17121 const TreeEntry *Entry = E;
17122 while (Entry->UserTreeIndex) {
17123 Entry = Entry->UserTreeIndex.UserTE;
17124 if (!ScalarOrPseudoEntries.contains(Entry))
17125 return Entry;
17126 }
17127 return nullptr;
17128 };
17129 while (!LiveEntries.empty()) {
17130 const TreeEntry *Entry = LiveEntries.pop_back_val();
17131 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
17132 if (Operands.empty())
17133 continue;
17134 if (ScalarOrPseudoEntries.contains(Entry)) {
17135 Entry = FindNonScalarParentEntry(Entry);
17136 if (!Entry) {
17137 for (const TreeEntry *Op : Operands) {
17138 if (!Op->isGather())
17139 LiveEntries.push_back(Op);
17140 }
17141 continue;
17142 }
17143 }
17144 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
17145 BasicBlock *Parent = LastInst->getParent();
17146 for (const TreeEntry *Op : Operands) {
17147 if (!Op->isGather())
17148 LiveEntries.push_back(Op);
17149 if (ScalarOrPseudoEntries.contains(Op))
17150 continue;
17151 if (Entry->State == TreeEntry::SplitVectorize ||
17152 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
17153 (Op->isGather() && allConstant(Op->Scalars)))
17154 continue;
17155 Budget = 0;
17156 BasicBlock *Pred = nullptr;
17157 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
17158 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
17159 BasicBlock *OpParent;
17160 Instruction *OpLastInst;
17161 if (Op->isGather()) {
17162 assert(Entry->getOpcode() == Instruction::PHI &&
17163 "Expected phi node only.");
17164 OpParent = cast<PHINode>(Entry->getMainOp())
17165 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
17166 OpLastInst = OpParent->getTerminator();
17167 for (Value *V : Op->Scalars) {
17168 auto *Inst = dyn_cast<Instruction>(V);
17169 if (!Inst)
17170 continue;
17171 if (isVectorized(V)) {
17172 OpParent = Inst->getParent();
17173 OpLastInst = Inst;
17174 break;
17175 }
17176 }
17177 } else {
17178 OpLastInst = EntriesToLastInstruction.at(Op);
17179 OpParent = OpLastInst->getParent();
17180 }
17181 // Check the call instructions within the same basic blocks.
17182 if (OpParent == Parent) {
17183 if (Entry->getOpcode() == Instruction::PHI) {
17184 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
17185 AddCosts(Op);
17186 continue;
17187 }
17188 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
17189 AddCosts(Op);
17190 continue;
17191 }
17192 // Check for call instruction in between blocks.
17193 // 1. Check entry's block to the head.
17194 if (Entry->getOpcode() != Instruction::PHI &&
17195 !CheckForNonVecCallsInSameBlock(
17196 &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
17197 AddCosts(Op);
17198 continue;
17199 }
17200 // 2. Check op's block from the end.
17201 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
17202 OpParent->getTerminator())) {
17203 AddCosts(Op);
17204 continue;
17205 }
17206 // 3. Check the predecessors of entry's block till op's block.
17207 if (!CheckPredecessors(Parent, Pred, OpParent)) {
17208 AddCosts(Op);
17209 continue;
17210 }
17211 }
17212 }
17213
17214 return Cost;
17215}
17216
17217/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
17218/// buildvector sequence.
17220 const InsertElementInst *IE2) {
17221 if (IE1 == IE2)
17222 return false;
17223 const auto *I1 = IE1;
17224 const auto *I2 = IE2;
17225 const InsertElementInst *PrevI1;
17226 const InsertElementInst *PrevI2;
17227 unsigned Idx1 = *getElementIndex(IE1);
17228 unsigned Idx2 = *getElementIndex(IE2);
17229 do {
17230 if (I2 == IE1)
17231 return true;
17232 if (I1 == IE2)
17233 return false;
17234 PrevI1 = I1;
17235 PrevI2 = I2;
17236 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
17237 getElementIndex(I1).value_or(Idx2) != Idx2)
17238 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
17239 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
17240 getElementIndex(I2).value_or(Idx1) != Idx1)
17241 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
17242 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
17243 llvm_unreachable("Two different buildvectors not expected.");
17244}
17245
17246namespace {
17247/// Returns incoming Value *, if the requested type is Value * too, or a default
17248/// value, otherwise.
17249struct ValueSelect {
17250 template <typename U>
17251 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
17252 return V;
17253 }
17254 template <typename U>
17255 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
17256 return U();
17257 }
17258};
17259} // namespace
17260
17261/// Does the analysis of the provided shuffle masks and performs the requested
17262/// actions on the vectors with the given shuffle masks. It tries to do it in
17263/// several steps.
17264/// 1. If the Base vector is not undef vector, resizing the very first mask to
17265/// have common VF and perform action for 2 input vectors (including non-undef
17266/// Base). Other shuffle masks are combined with the result of the first stage
17267/// and processed as a shuffle of 2 elements.
17268/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
17269/// action only for 1 vector with the given mask, if it is not the identity
17270/// mask.
17271/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
17272/// vectors, combining the masks properly between the steps.
17273template <typename T>
17275 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
17276 function_ref<unsigned(T *)> GetVF,
17277 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
17279 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
17280 SmallVector<int> Mask(ShuffleMask.begin()->second);
17281 auto VMIt = std::next(ShuffleMask.begin());
17282 T *Prev = nullptr;
17283 SmallBitVector UseMask =
17284 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
17285 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
17286 if (!IsBaseUndef.all()) {
17287 // Base is not undef, need to combine it with the next subvectors.
17288 std::pair<T *, bool> Res =
17289 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
17290 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
17291 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
17292 if (Mask[Idx] == PoisonMaskElem)
17293 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
17294 else
17295 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
17296 }
17297 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
17298 assert((!V || GetVF(V) == Mask.size()) &&
17299 "Expected base vector of VF number of elements.");
17300 Prev = Action(Mask, {nullptr, Res.first});
17301 } else if (ShuffleMask.size() == 1) {
17302 // Base is undef and only 1 vector is shuffled - perform the action only for
17303 // single vector, if the mask is not the identity mask.
17304 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
17305 /*ForSingleMask=*/true);
17306 if (Res.second)
17307 // Identity mask is found.
17308 Prev = Res.first;
17309 else
17310 Prev = Action(Mask, {ShuffleMask.begin()->first});
17311 } else {
17312 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
17313 // shuffles step by step, combining shuffle between the steps.
17314 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
17315 unsigned Vec2VF = GetVF(VMIt->first);
17316 if (Vec1VF == Vec2VF) {
17317 // No need to resize the input vectors since they are of the same size, we
17318 // can shuffle them directly.
17319 ArrayRef<int> SecMask = VMIt->second;
17320 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
17321 if (SecMask[I] != PoisonMaskElem) {
17322 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
17323 Mask[I] = SecMask[I] + Vec1VF;
17324 }
17325 }
17326 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
17327 } else {
17328 // Vectors of different sizes - resize and reshuffle.
17329 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
17330 /*ForSingleMask=*/false);
17331 std::pair<T *, bool> Res2 =
17332 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
17333 ArrayRef<int> SecMask = VMIt->second;
17334 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
17335 if (Mask[I] != PoisonMaskElem) {
17336 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
17337 if (Res1.second)
17338 Mask[I] = I;
17339 } else if (SecMask[I] != PoisonMaskElem) {
17340 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
17341 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
17342 }
17343 }
17344 Prev = Action(Mask, {Res1.first, Res2.first});
17345 }
17346 VMIt = std::next(VMIt);
17347 }
17348 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
17349 // Perform requested actions for the remaining masks/vectors.
17350 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
17351 // Shuffle other input vectors, if any.
17352 std::pair<T *, bool> Res =
17353 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
17354 ArrayRef<int> SecMask = VMIt->second;
17355 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
17356 if (SecMask[I] != PoisonMaskElem) {
17357 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
17358 "Multiple uses of scalars.");
17359 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
17360 } else if (Mask[I] != PoisonMaskElem) {
17361 Mask[I] = I;
17362 }
17363 }
17364 Prev = Action(Mask, {Prev, Res.first});
17365 }
17366 return Prev;
17367}
17368
17370 ArrayRef<Value *> VectorizedVals) {
17372 SmallPtrSet<Value *, 4> CheckedExtracts;
17373 SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
17375 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
17376 << VectorizableTree.size() << ".\n");
17377 auto IsExternallyUsed = [&](const TreeEntry &TE, Value *V) {
17378 assert(TE.hasState() && !TE.isGather() &&
17379 TE.State != TreeEntry::SplitVectorize && "Expected vector node.");
17380 if (V->hasOneUse() || V->getType()->isVoidTy())
17381 return false;
17382 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
17383 return false;
17384 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
17385 if (V->hasNUsesOrMore(NumVectScalars))
17386 return true;
17387 auto *I = dyn_cast<Instruction>(V);
17388 // Check if any user is used outside of the tree.
17389 return I && any_of(I->users(), [&](const User *U) {
17390 // store/insertelt v, [cast]U will likely be vectorized.
17391 if (match(U, m_InsertElt(m_Value(),
17392 m_OneUse(m_CastOrSelf(m_Specific(I))),
17393 m_ConstantInt())))
17394 return false;
17395 if (match(U,
17396 m_InsertElt(m_Value(), m_Specific(I), m_ConstantInt())))
17397 return false;
17398 if (match(U, m_Store(m_OneUse(m_CastOrSelf(m_Specific(I))),
17399 m_Value())))
17400 return false;
17401 if (match(U, m_Store(m_Specific(I), m_Value())))
17402 return false;
17403 ArrayRef<TreeEntry *> Entries = getTreeEntries(U);
17404 if (Entries.empty() && !MustGather.contains(U))
17405 return true;
17406 if (any_of(Entries, [&](TreeEntry *TE) {
17407 return DeletedNodes.contains(TE);
17408 }))
17409 return true;
17410 return any_of(ValueToGatherNodes.lookup(U),
17411 [&](const TreeEntry *TE) {
17412 return DeletedNodes.contains(TE);
17413 });
17414 });
17415 };
17417 InstructionCost Cost = 0;
17419 unsigned PrevScale = 0;
17420 BasicBlock *PrevVecParent = nullptr;
17421 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
17422 TreeEntry &TE = *Ptr;
17423 // No need to count the cost for combined entries, they are combined and
17424 // just skip their cost.
17425 if (TE.State == TreeEntry::CombinedVectorize) {
17426 LLVM_DEBUG(
17427 dbgs() << "SLP: Skipping cost for combined node that starts with "
17428 << *TE.Scalars[0] << ".\n";
17429 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17430 NodesCosts.try_emplace(&TE);
17431 continue;
17432 }
17433 if (TE.hasState() &&
17434 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
17435 if (const TreeEntry *E =
17436 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
17437 E && E->getVectorFactor() == TE.getVectorFactor()) {
17438 // Some gather nodes might be absolutely the same as some vectorizable
17439 // nodes after reordering, need to handle it.
17440 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
17441 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
17442 << "SLP: Current total cost = " << Cost << "\n");
17443 NodesCosts.try_emplace(&TE);
17444 continue;
17445 }
17446 }
17447
17448 // Exclude cost of gather loads nodes which are not used. These nodes were
17449 // built as part of the final attempt to vectorize gathered loads.
17450 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
17451 "Expected gather nodes with users only.");
17452
17453 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
17454 unsigned Scale = 0;
17455 bool CostIsFree = C == 0;
17456 if (!CostIsFree && !TE.isGather() && TE.hasState()) {
17457 if (PrevVecParent == TE.getMainOp()->getParent()) {
17458 Scale = PrevScale;
17459 C *= Scale;
17460 EntryToScale.try_emplace(&TE, Scale);
17461 }
17462 }
17463 if (!CostIsFree && !Scale) {
17464 Scale = getScaleToLoopIterations(TE);
17465 C *= Scale;
17466 EntryToScale.try_emplace(&TE, Scale);
17467 if (!TE.isGather() && TE.hasState()) {
17468 PrevVecParent = TE.getMainOp()->getParent();
17469 PrevScale = Scale;
17470 }
17471 }
17472 Cost += C;
17473 NodesCosts.try_emplace(&TE, C);
17474 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
17475 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
17476 << "SLP: Current total cost = " << Cost << "\n");
17477 // Add gathered loads nodes to the set for later processing.
17478 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
17479 TE.getOpcode() == Instruction::Load)
17480 GatheredLoadsNodes.insert(&TE);
17481 if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
17482 !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
17483 TE.getOpcode() == Instruction::Store))) {
17484 // Calculate costs of external uses.
17485 APInt DemandedElts = APInt::getZero(TE.getVectorFactor());
17486 for (Value *V : TE.Scalars) {
17487 if (IsExternallyUsed(TE, V))
17488 DemandedElts.setBit(TE.findLaneForValue(V));
17489 }
17490 if (!DemandedElts.isZero()) {
17491 Type *ScalarTy = TE.Scalars.front()->getType();
17492 auto It = MinBWs.find(&TE);
17493 if (It != MinBWs.end())
17494 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
17495 auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
17497 *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/false,
17498 /*Extract=*/true, CostKind);
17499 if (ExtCost.isValid() && ExtCost != 0) {
17500 if (!Scale)
17501 Scale = getScaleToLoopIterations(TE);
17502 ExtCost *= Scale;
17503 EntryToScale.try_emplace(&TE, Scale);
17504 }
17505 ExtractCosts.try_emplace(&TE, ExtCost);
17506 }
17507 }
17508 }
17509 // Bail out if the cost threshold is negative and cost already below it.
17510 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
17512 return Cost;
17513 // The narrow non-profitable tree in loop? Skip, may cause regressions.
17514 constexpr unsigned PartLimit = 2;
17515 const unsigned Sz =
17516 getVectorElementSize(VectorizableTree.front()->Scalars.front());
17517 const unsigned MinVF = getMinVF(Sz);
17518 if (Cost >= -SLPCostThreshold &&
17519 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
17520 (!VectorizableTree.front()->hasState() ||
17521 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
17522 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
17523 return Cost;
17524 // Store the cost + external uses estimation as the first element of the
17525 // tuple, just the cost as the second element of the tuple. Required to return
17526 // correct cost estimation for the tree, extracts are calculated separately.
17527 // Extracts, calculated here, are just quick estimations.
17529 std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
17530 SubtreeCosts(VectorizableTree.size());
17531 auto UpdateParentNodes =
17532 [&](const TreeEntry *UserTE, const TreeEntry *TE,
17534 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
17535 &VisitedUser,
17536 bool AddToList = true) {
17537 while (UserTE &&
17538 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
17539 std::get<0>(SubtreeCosts[UserTE->Idx]) += TotalCost;
17540 std::get<1>(SubtreeCosts[UserTE->Idx]) += Cost;
17541 if (AddToList)
17542 std::get<2>(SubtreeCosts[UserTE->Idx]).push_back(TE->Idx);
17543 UserTE = UserTE->UserTreeIndex.UserTE;
17544 }
17545 };
17546 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
17547 TreeEntry &TE = *Ptr;
17548 InstructionCost C = NodesCosts.at(&TE);
17549 InstructionCost ExtractCost = ExtractCosts.lookup(&TE);
17550 std::get<0>(SubtreeCosts[TE.Idx]) += C + ExtractCost;
17551 std::get<1>(SubtreeCosts[TE.Idx]) += C;
17552 if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
17553 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
17554 VisitedUser;
17555 UpdateParentNodes(UserTE, &TE, C + ExtractCost, C, VisitedUser);
17556 }
17557 }
17558 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
17559 for (TreeEntry *TE : GatheredLoadsNodes) {
17560 InstructionCost TotalCost = std::get<0>(SubtreeCosts[TE->Idx]);
17561 InstructionCost Cost = std::get<1>(SubtreeCosts[TE->Idx]);
17562 for (Value *V : TE->Scalars) {
17563 for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
17564 UpdateParentNodes(BVTE, TE, TotalCost, Cost, Visited,
17565 /*AddToList=*/false);
17566 }
17567 }
17568 Visited.clear();
17569 using CostIndicesTy =
17570 std::pair<TreeEntry *, std::tuple<InstructionCost, InstructionCost,
17571 SmallVector<unsigned>>>;
17572 struct FirstGreater {
17573 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
17574 return std::get<0>(LHS.second) < std::get<0>(RHS.second) ||
17575 (std::get<0>(LHS.second) == std::get<0>(RHS.second) &&
17576 LHS.first->Idx < RHS.first->Idx);
17577 }
17578 };
17579 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
17580 Worklist;
17581 for (const auto [Idx, P] : enumerate(SubtreeCosts))
17582 Worklist.emplace(VectorizableTree[Idx].get(), P);
17583
17584 // Narrow store trees with non-profitable immediate values - exit.
17585 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
17586 VectorizableTree.front()->hasState() &&
17587 VectorizableTree.front()->getOpcode() == Instruction::Store &&
17588 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
17589 return Cost;
17590
17591 bool Changed = false;
17592 while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
17593 TreeEntry *TE = Worklist.top().first;
17594 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
17595 // Exit early if the parent node is split node and any of scalars is
17596 // used in other split nodes.
17597 (TE->UserTreeIndex &&
17598 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
17599 any_of(TE->Scalars, [&](Value *V) {
17600 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
17601 return Entries.size() > 1;
17602 }))) {
17603 Worklist.pop();
17604 continue;
17605 }
17606 // Skip inversed compare nodes, they cannot be transformed to buildvectors.
17607 if (TE->State == TreeEntry::Vectorize && !TE->isAltShuffle() &&
17608 (TE->getOpcode() == Instruction::ICmp ||
17609 TE->getOpcode() == Instruction::FCmp) &&
17610 any_of(TE->Scalars, [&](Value *V) {
17611 auto *I = dyn_cast<CmpInst>(V);
17612 if (!I)
17613 return false;
17614 return I->getPredicate() !=
17615 cast<CmpInst>(TE->getMainOp())->getPredicate();
17616 })) {
17617 Worklist.pop();
17618 continue;
17619 }
17620
17621 // Calculate the gather cost of the root node.
17622 InstructionCost TotalSubtreeCost = std::get<0>(Worklist.top().second);
17623 InstructionCost SubtreeCost = std::get<1>(Worklist.top().second);
17624 if (TotalSubtreeCost < TE->Scalars.size()) {
17625 Worklist.pop();
17626 continue;
17627 }
17628 if (!TransformedToGatherNodes.empty()) {
17629 for (unsigned Idx : std::get<2>(Worklist.top().second)) {
17630 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
17631 if (It != TransformedToGatherNodes.end()) {
17632 TotalSubtreeCost -= std::get<0>(SubtreeCosts[Idx]);
17633 SubtreeCost -= std::get<1>(SubtreeCosts[Idx]);
17634 TotalSubtreeCost += It->second;
17635 SubtreeCost += It->second;
17636 }
17637 }
17638 }
17639 if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
17640 Worklist.pop();
17641 continue;
17642 }
17643 const unsigned Sz = TE->Scalars.size();
17644 APInt DemandedElts = APInt::getAllOnes(Sz);
17645 for (auto [Idx, V] : enumerate(TE->Scalars)) {
17646 if (isConstant(V))
17647 DemandedElts.clearBit(Idx);
17648 }
17649
17650 Type *ScalarTy = getValueType(TE->Scalars.front());
17651 auto It = MinBWs.find(TE);
17652 if (It != MinBWs.end())
17653 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
17654 if (isa<CmpInst>(TE->Scalars.front()))
17655 ScalarTy = TE->Scalars.front()->getType();
17656 auto *VecTy = getWidenedType(ScalarTy, Sz);
17657 const unsigned EntryVF = TE->getVectorFactor();
17658 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
17660 *TTI, ScalarTy, VecTy, DemandedElts,
17661 /*Insert=*/true, /*Extract=*/false, CostKind);
17662 SmallVector<int> Mask;
17663 if (!TE->ReorderIndices.empty() &&
17664 TE->State != TreeEntry::CompressVectorize &&
17665 (TE->State != TreeEntry::StridedVectorize ||
17666 !isReverseOrder(TE->ReorderIndices))) {
17667 SmallVector<int> NewMask;
17668 if (TE->getOpcode() == Instruction::Store) {
17669 // For stores the order is actually a mask.
17670 NewMask.resize(TE->ReorderIndices.size());
17671 copy(TE->ReorderIndices, NewMask.begin());
17672 } else {
17673 inversePermutation(TE->ReorderIndices, NewMask);
17674 }
17675 ::addMask(Mask, NewMask);
17676 }
17677 if (!TE->ReuseShuffleIndices.empty())
17678 ::addMask(Mask, TE->ReuseShuffleIndices);
17679 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
17680 GatherCost +=
17681 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
17682 // If all scalars are reused in gather node(s) or other vector nodes, there
17683 // might be extra cost for inserting them.
17684 if ((!TE->hasState() || !TE->isAltShuffle()) &&
17685 all_of(TE->Scalars, [&](Value *V) {
17686 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
17687 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
17688 }))
17689 GatherCost *= 2;
17690 // Erase subtree if it is non-profitable.
17691 if (TotalSubtreeCost > GatherCost) {
17692 // If the remaining tree is just a buildvector - exit, it will cause
17693 // endless attempts to vectorize.
17694 if (VectorizableTree.front()->hasState() &&
17695 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
17696 TE->Idx == 1)
17697 return InstructionCost::getInvalid();
17698
17699 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
17700 << TE->Idx << " with cost "
17701 << std::get<0>(Worklist.top().second)
17702 << " and gather cost " << GatherCost << ".\n");
17703 if (TE->UserTreeIndex) {
17704 TransformedToGatherNodes.try_emplace(TE, GatherCost);
17705 NodesCosts.erase(TE);
17706 } else {
17707 DeletedNodes.insert(TE);
17708 TransformedToGatherNodes.erase(TE);
17709 NodesCosts.erase(TE);
17710 }
17711 for (unsigned Idx : std::get<2>(Worklist.top().second)) {
17712 TreeEntry &ChildTE = *VectorizableTree[Idx];
17713 DeletedNodes.insert(&ChildTE);
17714 TransformedToGatherNodes.erase(&ChildTE);
17715 NodesCosts.erase(&ChildTE);
17716 }
17717 Changed = true;
17718 }
17719 Worklist.pop();
17720 }
17721 if (!Changed)
17722 return std::get<1>(SubtreeCosts.front());
17723
17724 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
17725 InstructionCost LoadsExtractsCost = 0;
17726 // Check if all loads of gathered loads nodes are marked for deletion. In this
17727 // case the whole gathered loads subtree must be deleted.
17728 // Also, try to account for extracts, which might be required, if only part of
17729 // gathered load must be vectorized. Keep partially vectorized nodes, if
17730 // extracts are cheaper than gathers.
17731 for (TreeEntry *TE : GatheredLoadsNodes) {
17732 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
17733 continue;
17734 GatheredLoadsToDelete.insert(TE);
17735 APInt DemandedElts = APInt::getZero(TE->getVectorFactor());
17736 // All loads are removed from gathered? Need to delete the subtree.
17737 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
17738 for (Value *V : TE->Scalars) {
17739 unsigned Pos = TE->findLaneForValue(V);
17740 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
17741 if (DeletedNodes.contains(BVE))
17742 continue;
17743 DemandedElts.setBit(Pos);
17744 ValuesToInsert.try_emplace(BVE).first->second.push_back(V);
17745 }
17746 }
17747 if (!DemandedElts.isZero()) {
17748 Type *ScalarTy = TE->Scalars.front()->getType();
17749 auto It = MinBWs.find(TE);
17750 if (It != MinBWs.end())
17751 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
17752 auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor());
17754 *TTI, ScalarTy, VecTy, DemandedElts,
17755 /*Insert=*/false, /*Extract=*/true, CostKind);
17756 InstructionCost BVCost = 0;
17757 for (const auto &[BVE, Values] : ValuesToInsert) {
17758 APInt BVDemandedElts = APInt::getZero(BVE->getVectorFactor());
17759 SmallVector<Value *> BVValues(BVE->getVectorFactor(),
17760 PoisonValue::get(ScalarTy));
17761 for (Value *V : Values) {
17762 unsigned Pos = BVE->findLaneForValue(V);
17763 BVValues[Pos] = V;
17764 BVDemandedElts.setBit(Pos);
17765 }
17766 auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor());
17768 *TTI, ScalarTy, BVVecTy, BVDemandedElts,
17769 /*Insert=*/true, /*Extract=*/false, CostKind,
17770 BVDemandedElts.isAllOnes(), BVValues);
17771 }
17772 if (ExtractsCost < BVCost) {
17773 LoadsExtractsCost += ExtractsCost;
17774 GatheredLoadsToDelete.erase(TE);
17775 continue;
17776 }
17777 LoadsExtractsCost += BVCost;
17778 }
17779 NodesCosts.erase(TE);
17780 }
17781
17782 // Deleted all subtrees rooted at gathered loads nodes.
17783 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17784 if (TE->UserTreeIndex &&
17785 GatheredLoadsToDelete.contains(TE->UserTreeIndex.UserTE)) {
17786 DeletedNodes.insert(TE.get());
17787 NodesCosts.erase(TE.get());
17788 GatheredLoadsToDelete.insert(TE.get());
17789 }
17790 if (GatheredLoadsToDelete.contains(TE.get()))
17791 DeletedNodes.insert(TE.get());
17792 }
17793
17794 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17795 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
17796 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
17797 continue;
17798 }
17799 if (DeletedNodes.contains(TE.get()))
17800 continue;
17801 if (!NodesCosts.contains(TE.get())) {
17803 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
17804 if (!C.isValid() || C == 0) {
17805 NodesCosts.try_emplace(TE.get(), C);
17806 continue;
17807 }
17808 unsigned Scale = EntryToScale.lookup(TE.get());
17809 if (!Scale)
17810 Scale = getScaleToLoopIterations(*TE.get());
17811 C *= Scale;
17812 NodesCosts.try_emplace(TE.get(), C);
17813 }
17814 }
17815
17816 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
17817 InstructionCost NewCost = 0;
17818 for (const auto &P : NodesCosts) {
17819 NewCost += P.second;
17820 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
17821 << shortBundleName(P.first->Scalars, P.first->Idx)
17822 << ".\n"
17823 << "SLP: Current total cost = " << Cost << "\n");
17824 }
17825 if (NewCost + LoadsExtractsCost >= Cost) {
17826 DeletedNodes.clear();
17827 TransformedToGatherNodes.clear();
17828 NewCost = Cost;
17829 } else {
17830 // If the remaining tree is just a buildvector - exit, it will cause
17831 // endless attempts to vectorize.
17832 if (VectorizableTree.size()>= 2 && VectorizableTree.front()->hasState() &&
17833 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
17834 TransformedToGatherNodes.contains(VectorizableTree[1].get()))
17835 return InstructionCost::getInvalid();
17836 if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
17837 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
17838 VectorizableTree[1]->hasState() &&
17839 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17840 (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
17841 VectorizableTree[1]->getOpcode() == Instruction::SExt ||
17842 VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
17843 TransformedToGatherNodes.contains(VectorizableTree[2].get()))
17844 return InstructionCost::getInvalid();
17845 }
17846 return NewCost;
17847}
17848
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
///
/// Groups a chain of related insertelement instructions (those building the
/// same destination vector) together with, per source vector, the shuffle
/// mask that extracts the reused scalars from it. \p T is the key type
/// identifying a source vector (e.g. `const TreeEntry *` when matching
/// inserts against vectorized tree entries, or `Value *` for emitted
/// vectors).
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles. The front element is
  /// kept as the representative (first) insert of the buildvector chain.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  /// For each source \p T, the mask is sized to the destination vector;
  /// Mask[InsertIdx] holds the lane in the source vector providing that
  /// element (PoisonMaskElem for positions not fed by this source).
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
17859
17861 ArrayRef<Value *> VectorizedVals,
17862 InstructionCost ReductionCost) {
17863 InstructionCost Cost = TreeCost;
17864
17866 auto ScaleCost = [&](InstructionCost C, const TreeEntry &TE,
17867 Value *Scalar = nullptr, Instruction *U = nullptr) {
17868 if (!C.isValid() || C == 0)
17869 return C;
17870 unsigned &Scale = EntryToScale.try_emplace(&TE, 0).first->getSecond();
17871 if (!Scale)
17872 Scale = getScaleToLoopIterations(TE, Scalar, U);
17873 return C * Scale;
17874 };
17875 Instruction *ReductionRoot = nullptr;
17876 if (UserIgnoreList) {
17877 const auto It = find_if(*UserIgnoreList, IsaPred<Instruction>);
17878 assert(It != UserIgnoreList->end() && "Expected reduction instruction.");
17879 ReductionRoot = cast<Instruction>(*It);
17880 // Scale reduction cost to the factor of the loop nest trip count.
17881 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
17882 /*Scalar=*/nullptr, ReductionRoot);
17883 }
17884
17885 // Add the cost for reduction.
17886 Cost += ReductionCost;
17887
17888 // Skip trees, which are non-profitable even if there are insertelements with
17889 // external uses.
17890 constexpr unsigned CostLimit = 100;
17891 if (Cost >= -SLPCostThreshold + CostLimit &&
17892 (VectorizableTree.size() - DeletedNodes.size()) *
17893 VectorizableTree.front()->getVectorFactor() <
17894 CostLimit)
17895 return Cost;
17896
17897 if (Cost >= -SLPCostThreshold &&
17898 none_of(ExternalUses, [](const ExternalUser &EU) {
17899 return isa_and_nonnull<InsertElementInst>(EU.User);
17900 }))
17901 return Cost;
17902
17903 SmallPtrSet<Value *, 16> ExtractCostCalculated;
17904 InstructionCost ExtractCost = 0;
17906 SmallVector<APInt> DemandedElts;
17907 SmallDenseSet<Value *, 4> UsedInserts;
17909 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17911 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
17912 // Keep track {Scalar, Index, User} tuple.
17913 // On AArch64, this helps in fusing a mov instruction, associated with
17914 // extractelement, with fmul in the backend so that extractelement is free.
17916 for (ExternalUser &EU : ExternalUses) {
17917 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
17918 }
17919 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
17920 for (ExternalUser &EU : ExternalUses) {
17921 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
17922 << EU.E.Idx << " in lane " << EU.Lane << "\n");
17923 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
17924 else dbgs() << " User: nullptr\n");
17925 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
17926
17927 // Uses by ephemeral values are free (because the ephemeral value will be
17928 // removed prior to code generation, and so the extraction will be
17929 // removed as well).
17930 if (EphValues.count(EU.User))
17931 continue;
17932
17933 // Check if the scalar for the given user or all users is accounted already.
17934 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
17935 (EU.User &&
17936 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
17937 continue;
17938
17939 // Used in unreachable blocks or in EH pads (rarely executed) or is
17940 // terminated with unreachable instruction.
17941 if (BasicBlock *UserParent =
17942 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
17943 UserParent &&
17944 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
17945 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
17946 continue;
17947
17948 // We only add extract cost once for the same scalar.
17949 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
17950 !ExtractCostCalculated.insert(EU.Scalar).second)
17951 continue;
17952
17953 // No extract cost for vector "scalar" if REVEC is disabled
17954 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
17955 continue;
17956
17957 // If found user is an insertelement, do not calculate extract cost but try
17958 // to detect it as a final shuffled/identity match.
17959 // TODO: what if a user is insertvalue when REVEC is enabled?
17960 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
17961 VU && VU->getOperand(1) == EU.Scalar) {
17962 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
17963 if (!UsedInserts.insert(VU).second)
17964 continue;
17965 std::optional<unsigned> InsertIdx = getElementIndex(VU);
17966 if (InsertIdx) {
17967 const TreeEntry *ScalarTE = &EU.E;
17968 auto *It = find_if(
17969 ShuffledInserts,
17970 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
17971 // Checks if 2 insertelements are from the same buildvector.
17972 InsertElementInst *VecInsert = Data.InsertElements.front();
17974 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
17975 Value *Op0 = II->getOperand(0);
17976 if (isVectorized(II) && !isVectorized(Op0))
17977 return nullptr;
17978 return Op0;
17979 });
17980 });
17981 int VecId = -1;
17982 if (It == ShuffledInserts.end()) {
17983 auto &Data = ShuffledInserts.emplace_back();
17984 Data.InsertElements.emplace_back(VU);
17985 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
17986 VecId = ShuffledInserts.size() - 1;
17987 auto It = MinBWs.find(ScalarTE);
17988 if (It != MinBWs.end() &&
17989 VectorCasts
17990 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
17991 .second) {
17992 unsigned BWSz = It->second.first;
17993 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
17994 unsigned VecOpcode;
17995 if (DstBWSz < BWSz)
17996 VecOpcode = Instruction::Trunc;
17997 else
17998 VecOpcode =
17999 It->second.second ? Instruction::SExt : Instruction::ZExt;
18001 InstructionCost C = TTI->getCastInstrCost(
18002 VecOpcode, FTy,
18003 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
18004 FTy->getNumElements()),
18006 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
18007 << " for extending externally used vector with "
18008 "non-equal minimum bitwidth.\n");
18009 Cost += C;
18010 }
18011 } else {
18012 if (isFirstInsertElement(VU, It->InsertElements.front()))
18013 It->InsertElements.front() = VU;
18014 VecId = std::distance(ShuffledInserts.begin(), It);
18015 }
18016 int InIdx = *InsertIdx;
18017 SmallVectorImpl<int> &Mask =
18018 ShuffledInserts[VecId].ValueMasks[ScalarTE];
18019 if (Mask.empty())
18020 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
18021 Mask[InIdx] = EU.Lane;
18022 DemandedElts[VecId].setBit(InIdx);
18023 continue;
18024 }
18025 }
18026 }
18027
18029 // If we plan to rewrite the tree in a smaller type, we will need to sign
18030 // extend the extracted value back to the original type. Here, we account
18031 // for the extract and the added cost of the sign extend if needed.
18032 InstructionCost ExtraCost = TTI::TCC_Free;
18033 auto *ScalarTy = EU.Scalar->getType();
18034 const unsigned BundleWidth = EU.E.getVectorFactor();
18035 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
18036 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
18037 const TreeEntry *Entry = &EU.E;
18038 auto It = MinBWs.find(Entry);
18039 if (It != MinBWs.end()) {
18040 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
18041 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
18042 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
18043 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
18044 ? Instruction::ZExt
18045 : Instruction::SExt;
18046 VecTy = getWidenedType(MinTy, BundleWidth);
18047 ExtraCost =
18048 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
18049 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
18050 << ExtraCost << "\n");
18051 } else {
18052 ExtraCost =
18053 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
18054 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
18055 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
18056 << *VecTy << ": " << ExtraCost << "\n");
18057 }
18058 // Leave the scalar instructions as is if they are cheaper than extracts.
18059 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
18060 Entry->getOpcode() == Instruction::Load) {
18061 // Checks if the user of the external scalar is phi in loop body.
18062 auto IsPhiInLoop = [&](const ExternalUser &U) {
18063 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
18064 auto *I = cast<Instruction>(U.Scalar);
18065 const Loop *L = LI->getLoopFor(Phi->getParent());
18066 return L && (Phi->getParent() == I->getParent() ||
18067 L == LI->getLoopFor(I->getParent()));
18068 }
18069 return false;
18070 };
18071 if (!ValueToExtUses) {
18072 ValueToExtUses.emplace();
18073 for (const auto &P : enumerate(ExternalUses)) {
18074 // Ignore phis in loops.
18075 if (IsPhiInLoop(P.value()))
18076 continue;
18077
18078 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
18079 }
18080 }
18081 // Can use original instruction, if no operands vectorized or they are
18082 // marked as externally used already.
18083 auto *Inst = cast<Instruction>(EU.Scalar);
18084 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
18085 auto OperandIsScalar = [&](Value *V) {
18086 if (!isVectorized(V)) {
18087 // Some extractelements might be not vectorized, but
18088 // transformed into shuffle and removed from the function,
18089 // consider it here.
18090 if (auto *EE = dyn_cast<ExtractElementInst>(V))
18091 return !EE->hasOneUse() || !MustGather.contains(EE);
18092 return true;
18093 }
18094 return ValueToExtUses->contains(V);
18095 };
18096 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
18097 bool CanBeUsedAsScalarCast = false;
18098 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
18099 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
18100 Op && all_of(Op->operands(), OperandIsScalar)) {
18101 InstructionCost OpCost =
18102 (isVectorized(Op) && !ValueToExtUses->contains(Op))
18103 ? TTI->getInstructionCost(Op, CostKind)
18104 : 0;
18105 if (ScalarCost + OpCost <= ExtraCost) {
18106 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
18107 ScalarCost += OpCost;
18108 }
18109 }
18110 }
18111 if (CanBeUsedAsScalar) {
18112 bool KeepScalar = ScalarCost <= ExtraCost;
18113 // Try to keep original scalar if the user is the phi node from the same
18114 // block as the root phis, currently vectorized. It allows to keep
18115 // better ordering info of PHIs, being vectorized currently.
18116 bool IsProfitablePHIUser =
18117 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
18118 VectorizableTree.front()->Scalars.size() > 2)) &&
18119 VectorizableTree.front()->hasState() &&
18120 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
18121 !Inst->hasNUsesOrMore(UsesLimit) &&
18122 none_of(Inst->users(),
18123 [&](User *U) {
18124 auto *PHIUser = dyn_cast<PHINode>(U);
18125 return (!PHIUser ||
18126 PHIUser->getParent() !=
18127 cast<Instruction>(
18128 VectorizableTree.front()->getMainOp())
18129 ->getParent()) &&
18130 !isVectorized(U);
18131 }) &&
18132 count_if(Entry->Scalars, [&](Value *V) {
18133 return ValueToExtUses->contains(V);
18134 }) <= 2;
18135 if (IsProfitablePHIUser) {
18136 KeepScalar = true;
18137 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
18138 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
18139 (!GatheredLoadsEntriesFirst.has_value() ||
18140 Entry->Idx < *GatheredLoadsEntriesFirst)) {
18141 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
18142 return ValueToExtUses->contains(V);
18143 });
18144 auto It = ExtractsCount.find(Entry);
18145 if (It != ExtractsCount.end()) {
18146 assert(ScalarUsesCount >= It->getSecond().size() &&
18147 "Expected total number of external uses not less than "
18148 "number of scalar uses.");
18149 ScalarUsesCount -= It->getSecond().size();
18150 }
18151 // Keep original scalar if number of externally used instructions in
18152 // the same entry is not power of 2. It may help to do some extra
18153 // vectorization for now.
18154 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
18155 }
18156 if (KeepScalar) {
18157 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
18158 for (Value *V : Inst->operands()) {
18159 auto It = ValueToExtUses->find(V);
18160 if (It != ValueToExtUses->end()) {
18161 // Replace all uses to avoid compiler crash.
18162 ExternalUses[It->second].User = nullptr;
18163 }
18164 }
18165 ExtraCost = ScalarCost;
18166 if (!IsPhiInLoop(EU))
18167 ExtractsCount[Entry].insert(Inst);
18168 if (CanBeUsedAsScalarCast) {
18169 ScalarOpsFromCasts.insert(Inst->getOperand(0));
18170 // Update the users of the operands of the cast operand to avoid
18171 // compiler crash.
18172 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
18173 for (Value *V : IOp->operands()) {
18174 auto It = ValueToExtUses->find(V);
18175 if (It != ValueToExtUses->end()) {
18176 // Replace all uses to avoid compiler crash.
18177 ExternalUses[It->second].User = nullptr;
18178 }
18179 }
18180 }
18181 }
18182 }
18183 }
18184 }
18185
18186 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
18187 cast_or_null<Instruction>(EU.User));
18188
18189 ExtractCost += ExtraCost;
18190 }
18191 // Insert externals for extract of operands of casts to be emitted as scalars
18192 // instead of extractelement.
18193 for (Value *V : ScalarOpsFromCasts) {
18194 ExternalUsesAsOriginalScalar.insert(V);
18195 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
18196 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
18197 return TransformedToGatherNodes.contains(TE) ||
18198 DeletedNodes.contains(TE);
18199 });
18200 if (It != TEs.end()) {
18201 const TreeEntry *UserTE = *It;
18202 ExternalUses.emplace_back(V, nullptr, *UserTE,
18203 UserTE->findLaneForValue(V));
18204 }
18205 }
18206 }
18207 // Add reduced value cost, if resized.
18208 if (!VectorizedVals.empty()) {
18209 const TreeEntry &Root = *VectorizableTree.front();
18210 auto BWIt = MinBWs.find(&Root);
18211 if (BWIt != MinBWs.end()) {
18212 Type *DstTy = Root.Scalars.front()->getType();
18213 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
18214 unsigned SrcSz =
18215 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
18216 if (OriginalSz != SrcSz) {
18217 unsigned Opcode = Instruction::Trunc;
18218 if (OriginalSz > SrcSz)
18219 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
18220 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
18221 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
18222 assert(SLPReVec && "Only supported by REVEC.");
18223 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
18224 }
18225 InstructionCost CastCost =
18226 TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
18229 CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
18230 Cost += CastCost;
18231 }
18232 }
18233 }
18234
18235 // Buildvector with externally used scalars, which should remain as scalars,
18236 // should not be vectorized, the compiler may hang.
18237 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
18238 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
18239 VectorizableTree[1]->hasState() &&
18240 VectorizableTree[1]->State == TreeEntry::Vectorize &&
18241 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
18242 return ExternalUsesAsOriginalScalar.contains(V);
18243 }))
18245
18246 Cost += ExtractCost;
18247 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
18248 bool ForSingleMask) {
18249 InstructionCost C = 0;
18250 unsigned VF = Mask.size();
18251 unsigned VecVF = TE->getVectorFactor();
18252 bool HasLargeIndex =
18253 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
18254 if ((VF != VecVF && HasLargeIndex) ||
18256
18257 if (HasLargeIndex) {
18258 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
18259 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
18260 OrigMask.begin());
18262 getWidenedType(TE->getMainOp()->getType(), VecVF),
18263 OrigMask);
18264 LLVM_DEBUG(
18265 dbgs() << "SLP: Adding cost " << C
18266 << " for final shuffle of insertelement external users.\n";
18267 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
18268 Cost += C;
18269 return std::make_pair(TE, true);
18270 }
18271
18272 if (!ForSingleMask) {
18273 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18274 for (unsigned I = 0; I < VF; ++I) {
18275 if (Mask[I] != PoisonMaskElem)
18276 ResizeMask[Mask[I]] = Mask[I];
18277 }
18278 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
18281 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
18282 LLVM_DEBUG(
18283 dbgs() << "SLP: Adding cost " << C
18284 << " for final shuffle of insertelement external users.\n";
18285 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
18286
18287 Cost += C;
18288 }
18289 }
18290 return std::make_pair(TE, false);
18291 };
18292 // Calculate the cost of the reshuffled vectors, if any.
18293 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
18294 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
18295 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
18296 unsigned VF = 0;
18297 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
18299 assert((TEs.size() == 1 || TEs.size() == 2) &&
18300 "Expected exactly 1 or 2 tree entries.");
18301 if (TEs.size() == 1) {
18302 if (VF == 0)
18303 VF = TEs.front()->getVectorFactor();
18304 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
18305 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
18306 !all_of(enumerate(Mask), [=](const auto &Data) {
18307 return Data.value() == PoisonMaskElem ||
18308 (Data.index() < VF &&
18309 static_cast<int>(Data.index()) == Data.value());
18310 })) {
18313 C = ScaleCost(C, *TEs.front());
18314 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
18315 << " for final shuffle of insertelement "
18316 "external users.\n";
18317 TEs.front()->dump();
18318 dbgs() << "SLP: Current total cost = " << Cost << "\n");
18319 Cost += C;
18320 }
18321 } else {
18322 if (VF == 0) {
18323 if (TEs.front() &&
18324 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
18325 VF = TEs.front()->getVectorFactor();
18326 else
18327 VF = Mask.size();
18328 }
18329 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
18331 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
18332 C = ScaleCost(C, *TEs.back());
18333 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
18334 << " for final shuffle of vector node and external "
18335 "insertelement users.\n";
18336 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
18337 dbgs() << "SLP: Current total cost = " << Cost << "\n");
18338 Cost += C;
18339 }
18340 VF = Mask.size();
18341 return TEs.back();
18342 };
18344 MutableArrayRef(Vector.data(), Vector.size()), Base,
18345 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
18346 EstimateShufflesCost);
18347 InstructionCost InsertCost = TTI->getScalarizationOverhead(
18349 ShuffledInserts[I].InsertElements.front()->getType()),
18350 DemandedElts[I],
18351 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
18352 Cost -= InsertCost;
18353 }
18354
18355 // Add the cost for reduced value resize (if required).
18356 if (ReductionBitWidth != 0) {
18357 assert(UserIgnoreList && "Expected reduction tree.");
18358 const TreeEntry &E = *VectorizableTree.front();
18359 auto It = MinBWs.find(&E);
18360 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
18361 unsigned SrcSize = It->second.first;
18362 unsigned DstSize = ReductionBitWidth;
18363 unsigned Opcode = Instruction::Trunc;
18364 if (SrcSize < DstSize) {
18365 bool IsArithmeticExtendedReduction =
18366 all_of(*UserIgnoreList, [](Value *V) {
18367 auto *I = cast<Instruction>(V);
18368 return is_contained({Instruction::Add, Instruction::FAdd,
18369 Instruction::Mul, Instruction::FMul,
18370 Instruction::And, Instruction::Or,
18371 Instruction::Xor},
18372 I->getOpcode());
18373 });
18374 if (IsArithmeticExtendedReduction)
18375 Opcode =
18376 Instruction::BitCast; // Handle it by getExtendedReductionCost
18377 else
18378 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
18379 }
18380 if (Opcode != Instruction::BitCast) {
18381 auto *SrcVecTy =
18382 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
18383 auto *DstVecTy =
18384 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
18385 TTI::CastContextHint CCH = getCastContextHint(E);
18386 switch (E.getOpcode()) {
18387 case Instruction::SExt:
18388 case Instruction::ZExt:
18389 case Instruction::Trunc: {
18390 const TreeEntry *OpTE = getOperandEntry(&E, 0);
18391 CCH = getCastContextHint(*OpTE);
18392 break;
18393 }
18394 default:
18395 break;
18396 }
18397 InstructionCost CastCost =
18398 TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
18400 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
18401 /*Scalar=*/nullptr, ReductionRoot);
18402 Cost += CastCost;
18403 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
18404 << " for final resize for reduction from " << SrcVecTy
18405 << " to " << DstVecTy << "\n";
18406 dbgs() << "SLP: Current total cost = " << Cost << "\n");
18407 }
18408 }
18409 }
18410
18411 std::optional<InstructionCost> SpillCost;
18412 if (Cost < -SLPCostThreshold) {
18413 SpillCost = getSpillCost();
18414 Cost += *SpillCost;
18415 }
18416#ifndef NDEBUG
18417 SmallString<256> Str;
18418 {
18419 raw_svector_ostream OS(Str);
18420 OS << "SLP: Spill Cost = ";
18421 if (SpillCost)
18422 OS << *SpillCost;
18423 else
18424 OS << "<skipped>";
18425 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n";
18426 if (ReductionRoot)
18427 OS << "SLP: Reduction Cost = " << ReductionCost << ".\n";
18428 OS << "SLP: Total Cost = " << Cost << ".\n";
18429 }
18430 LLVM_DEBUG(dbgs() << Str);
18431 if (ViewSLPTree)
18432 ViewGraph(this, "SLP" + F->getName(), false, Str);
18433#endif
18434
18435 return Cost;
18436}
18437
/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan list of gathered scalars for extractelements that can be represented
  // as shuffles.
  // Maps each source vector operand to the lanes of \p VL that extract from
  // it, preserving insertion order (MapVector) for determinism.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  // Lanes whose extract is effectively undef/poison: a non-extract undef
  // scalar, an undefined or out-of-bounds extract index, or an extract from a
  // source vector that is undef everywhere else.
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      // A plain undef scalar can join any shuffle for free.
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    // Only extracts from fixed-width vectors with constant (or undef) indices
    // can be modeled as shuffle lanes.
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    // An out-of-bounds extract index yields poison - treat as an undef lane.
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    // If the source vector is undef in all elements other than the one being
    // extracted, the extract itself is undef-like.
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  // SingleMax: lanes covered by the most-used source vector (plus free undef
  // lanes). PairMax: lanes covered by the two most-used source vectors.
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if better to perform a shuffle of 2 vectors or just of a single
  // vector.
  // Keep a copy of VL so it can be restored if the shuffle match fails below.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  // Move the matched extracts from VL into GatheredExtracts, leaving poison
  // behind in VL; the swaps below perform that move lane by lane.
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    // A single source vector covers at least as many lanes as the best pair.
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    // Use the two most-used source vectors.
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from mask, if some of the extractelements were not
  // selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // A lane that is poison in the mask but holds a real (non-poison) undef
    // extract was not used by the shuffle - hand it back to VL.
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}
18539
18540/// Tries to find extractelement instructions with constant indices from fixed
18541/// vector type and gather such instructions into a bunch, which highly likely
18542/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
18543/// successful, the matched scalars are replaced by poison values in \p VL for
18544/// future analysis.
18546BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
18547 SmallVectorImpl<int> &Mask,
18548 unsigned NumParts) const {
18549 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
18550 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
18551 Mask.assign(VL.size(), PoisonMaskElem);
18552 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18553 for (unsigned Part : seq<unsigned>(NumParts)) {
18554 // Scan list of gathered scalars for extractelements that can be represented
18555 // as shuffles.
18556 const unsigned PartOffset = Part * SliceSize;
18557 const unsigned PartSize = getNumElems(VL.size(), SliceSize, Part);
18558 // It may happen in case of revec, need to check no access out of bounds.
18559 if (PartOffset + PartSize > VL.size())
18560 break;
18562 MutableArrayRef(VL).slice(PartOffset, PartSize);
18563 SmallVector<int> SubMask;
18564 std::optional<TTI::ShuffleKind> Res =
18565 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
18566 ShufflesRes[Part] = Res;
18567 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
18568 }
18569 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
18570 return Res.has_value();
18571 }))
18572 ShufflesRes.clear();
18573 return ShufflesRes;
18574}
18575
18576std::optional<TargetTransformInfo::ShuffleKind>
18577BoUpSLP::isGatherShuffledSingleRegisterEntry(
18578 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
18579 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
18580 Entries.clear();
18581 if (TE->Idx == 0)
18582 return std::nullopt;
18583 // TODO: currently checking only for Scalars in the tree entry, need to count
18584 // reused elements too for better cost estimation.
18585 auto GetUserEntry = [&](const TreeEntry *TE) {
18586 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18587 TE = TE->UserTreeIndex.UserTE;
18588 if (TE == VectorizableTree.front().get())
18589 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
18590 return TE->UserTreeIndex;
18591 };
18592 auto HasGatherUser = [&](const TreeEntry *TE) {
18593 while (TE->Idx != 0 && TE->UserTreeIndex) {
18594 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18595 return true;
18596 TE = TE->UserTreeIndex.UserTE;
18597 }
18598 return false;
18599 };
18600 const EdgeInfo TEUseEI = GetUserEntry(TE);
18601 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
18602 !TEUseEI.UserTE->hasState()))
18603 return std::nullopt;
18604 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
18605 const BasicBlock *TEInsertBlock = nullptr;
18606 // Main node of PHI entries keeps the correct order of operands/incoming
18607 // blocks.
18608 if (auto *PHI = dyn_cast_or_null<PHINode>(
18609 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
18610 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
18611 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
18612 TEInsertPt = TEInsertBlock->getTerminator();
18613 } else {
18614 TEInsertBlock = TEInsertPt->getParent();
18615 }
18616 if (!DT->isReachableFromEntry(TEInsertBlock))
18617 return std::nullopt;
18618 auto *NodeUI = DT->getNode(TEInsertBlock);
18619 assert(NodeUI && "Should only process reachable instructions");
18620 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
18621 auto CheckOrdering = [&](const Instruction *InsertPt) {
18622 // Argument InsertPt is an instruction where vector code for some other
18623 // tree entry (one that shares one or more scalars with TE) is going to be
18624 // generated. This lambda returns true if insertion point of vector code
18625 // for the TE dominates that point (otherwise dependency is the other way
18626 // around). The other node is not limited to be of a gather kind. Gather
18627 // nodes are not scheduled and their vector code is inserted before their
18628 // first user. If user is PHI, that is supposed to be at the end of a
18629 // predecessor block. Otherwise it is the last instruction among scalars of
18630 // the user node. So, instead of checking dependency between instructions
18631 // themselves, we check dependency between their insertion points for vector
18632 // code (since each scalar instruction ends up as a lane of a vector
18633 // instruction).
18634 const BasicBlock *InsertBlock = InsertPt->getParent();
18635 auto *NodeEUI = DT->getNode(InsertBlock);
18636 if (!NodeEUI)
18637 return false;
18638 assert((NodeUI == NodeEUI) ==
18639 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
18640 "Different nodes should have different DFS numbers");
18641 // Check the order of the gather nodes users.
18642 if (TEInsertPt->getParent() != InsertBlock &&
18643 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
18644 return false;
18645 if (TEInsertPt->getParent() == InsertBlock &&
18646 TEInsertPt->comesBefore(InsertPt))
18647 return false;
18648 return true;
18649 };
18650 // Find all tree entries used by the gathered values. If no common entries
18651 // found - not a shuffle.
18652 // Here we build a set of tree nodes for each gathered value and trying to
18653 // find the intersection between these sets. If we have at least one common
18654 // tree node for each gathered value - we have just a permutation of the
18655 // single vector. If we have 2 different sets, we're in situation where we
18656 // have a permutation of 2 input vectors.
18658 SmallDenseMap<Value *, int> UsedValuesEntry;
18659 SmallPtrSet<const Value *, 16> VisitedValue;
18660 bool IsReusedNodeFound = false;
18661 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
18662 // The node is reused - exit.
18663 if (IsReusedNodeFound)
18664 return false;
18665 if ((TEPtr->getVectorFactor() != VL.size() &&
18666 TEPtr->Scalars.size() != VL.size()) ||
18667 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
18668 return false;
18669 IsReusedNodeFound =
18670 equal(TE->Scalars, TEPtr->Scalars) &&
18671 equal(TE->ReorderIndices, TEPtr->ReorderIndices) &&
18672 equal(TE->ReuseShuffleIndices, TEPtr->ReuseShuffleIndices);
18673 UsedTEs.clear();
18674 UsedTEs.emplace_back().insert(TEPtr);
18675 for (Value *V : VL) {
18676 if (isConstant(V))
18677 continue;
18678 UsedValuesEntry.try_emplace(V, 0);
18679 }
18680 return true;
18681 };
18682 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
18683 unsigned EdgeIdx) {
18684 const TreeEntry *Ptr1 = User1;
18685 const TreeEntry *Ptr2 = User2;
18686 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
18687 while (Ptr2) {
18688 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
18689 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
18690 Ptr2 = Ptr2->UserTreeIndex.UserTE;
18691 }
18692 while (Ptr1) {
18693 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
18694 Ptr1 = Ptr1->UserTreeIndex.UserTE;
18695 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
18696 return Idx < It->second;
18697 }
18698 return false;
18699 };
18700 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
18701 Instruction *InsertPt) {
18702 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
18703 !TEUseEI.UserTE->isCopyableElement(
18704 const_cast<Instruction *>(TEInsertPt)) &&
18705 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
18706 InsertPt->getNextNode() == TEInsertPt &&
18707 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
18708 !isUsedOutsideBlock(InsertPt));
18709 };
18710 for (Value *V : VL) {
18711 if (isConstant(V) || !VisitedValue.insert(V).second)
18712 continue;
18713 // Build a list of tree entries where V is used.
18714 SmallPtrSet<const TreeEntry *, 4> VToTEs;
18716 ValueToGatherNodes.lookup(V).takeVector());
18717 if (TransformedToGatherNodes.contains(TE)) {
18718 for (TreeEntry *E : getSplitTreeEntries(V)) {
18719 if (TE == E || !TransformedToGatherNodes.contains(E) ||
18720 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18721 continue;
18722 GatherNodes.push_back(E);
18723 }
18724 for (TreeEntry *E : getTreeEntries(V)) {
18725 if (TE == E || !TransformedToGatherNodes.contains(E) ||
18726 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18727 continue;
18728 GatherNodes.push_back(E);
18729 }
18730 }
18731 for (const TreeEntry *TEPtr : GatherNodes) {
18732 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
18733 continue;
18734 assert(any_of(TEPtr->Scalars,
18735 [&](Value *V) { return GatheredScalars.contains(V); }) &&
18736 "Must contain at least single gathered value.");
18737 assert(TEPtr->UserTreeIndex &&
18738 "Expected only single user of a gather node.");
18739 if (any_of(TEPtr->CombinedEntriesWithIndices,
18740 [&](const auto &P) { return P.first == TE->Idx; }))
18741 continue;
18742 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
18743
18744 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
18745 UseEI.UserTE->hasState())
18746 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
18747 : nullptr;
18748 Instruction *InsertPt =
18749 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
18750 : &getLastInstructionInBundle(UseEI.UserTE);
18751 if (TEInsertPt == InsertPt) {
18752 // Check nodes, which might be emitted first.
18753 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18754 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
18755 TEUseEI.UserTE->isAltShuffle()) &&
18756 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
18757 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
18758 (UseEI.UserTE->hasState() &&
18759 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18760 !UseEI.UserTE->isAltShuffle()) ||
18761 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
18762 continue;
18763 }
18764
18765 // If the schedulable insertion point is used in multiple entries - just
18766 // exit, no known ordering at this point, available only after real
18767 // scheduling.
18768 if (!doesNotNeedToBeScheduled(InsertPt) &&
18769 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
18770 continue;
18771 // If the users are the PHI nodes with the same incoming blocks - skip.
18772 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18773 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
18774 UseEI.UserTE->State == TreeEntry::Vectorize &&
18775 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18776 TEUseEI.UserTE != UseEI.UserTE)
18777 continue;
18778 // If 2 gathers are operands of the same entry (regardless of whether
18779 // user is PHI or else), compare operands indices, use the earlier one
18780 // as the base.
18781 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
18782 continue;
18783 // If the user instruction is used for some reason in different
18784 // vectorized nodes - make it depend on index.
18785 if (TEUseEI.UserTE != UseEI.UserTE &&
18786 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
18787 HasGatherUser(TEUseEI.UserTE)))
18788 continue;
18789 // If the user node is the operand of the other user node - skip.
18790 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
18791 continue;
18792 }
18793
18794 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
18795 TEUseEI.UserTE->doesNotNeedToSchedule() !=
18796 UseEI.UserTE->doesNotNeedToSchedule() &&
18797 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
18798 continue;
18799 // Check if the user node of the TE comes after user node of TEPtr,
18800 // otherwise TEPtr depends on TE.
18801 if ((TEInsertBlock != InsertPt->getParent() ||
18802 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
18803 (!CheckOrdering(InsertPt) ||
18804 (UseEI.UserTE->hasCopyableElements() &&
18805 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
18806 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
18807 continue;
18808 // The node is reused - exit.
18809 if (CheckAndUseSameNode(TEPtr))
18810 break;
18811 // The parent node is copyable with last inst used outside? And the last
18812 // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to
18813 // preserve def-use chain.
18814 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
18815 continue;
18816 VToTEs.insert(TEPtr);
18817 }
18818 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
18819 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
18820 return MTE != TE && MTE != TEUseEI.UserTE &&
18821 !DeletedNodes.contains(MTE) &&
18822 !TransformedToGatherNodes.contains(MTE);
18823 });
18824 if (It != VTEs.end()) {
18825 const TreeEntry *VTE = *It;
18826 if (none_of(TE->CombinedEntriesWithIndices,
18827 [&](const auto &P) { return P.first == VTE->Idx; })) {
18828 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
18829 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
18830 continue;
18831 }
18832 // The node is reused - exit.
18833 if (CheckAndUseSameNode(VTE))
18834 break;
18835 VToTEs.insert(VTE);
18836 }
18837 }
18838 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
18839 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
18840 return TE != MainTE && !DeletedNodes.contains(TE) &&
18841 !TransformedToGatherNodes.contains(TE);
18842 });
18843 if (It != VTEs.end()) {
18844 const TreeEntry *VTE = *It;
18845 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
18846 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
18847 VTEs = VTEs.drop_front();
18848 // Iterate through all vectorized nodes.
18849 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
18850 return MTE->State == TreeEntry::Vectorize;
18851 });
18852 if (MIt == VTEs.end())
18853 continue;
18854 VTE = *MIt;
18855 }
18856 if (none_of(TE->CombinedEntriesWithIndices,
18857 [&](const auto &P) { return P.first == VTE->Idx; })) {
18858 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
18859 if (&LastBundleInst == TEInsertPt ||
18860 !CheckOrdering(&LastBundleInst) ||
18861 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18862 continue;
18863 }
18864 // The node is reused - exit.
18865 if (CheckAndUseSameNode(VTE))
18866 break;
18867 VToTEs.insert(VTE);
18868 }
18869 }
18870 if (IsReusedNodeFound)
18871 break;
18872 if (VToTEs.empty())
18873 continue;
18874 if (UsedTEs.empty()) {
18875 // The first iteration, just insert the list of nodes to vector.
18876 UsedTEs.push_back(VToTEs);
18877 UsedValuesEntry.try_emplace(V, 0);
18878 } else {
18879 // Need to check if there are any previously used tree nodes which use V.
18880 // If there are no such nodes, consider that we have another one input
18881 // vector.
18882 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18883 unsigned Idx = 0;
18884 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18885 // Do we have a non-empty intersection of previously listed tree entries
18886 // and tree entries using current V?
18887 set_intersect(VToTEs, Set);
18888 if (!VToTEs.empty()) {
18889 // Yes, write the new subset and continue analysis for the next
18890 // scalar.
18891 Set.swap(VToTEs);
18892 break;
18893 }
18894 VToTEs = SavedVToTEs;
18895 ++Idx;
18896 }
18897 // No non-empty intersection found - need to add a second set of possible
18898 // source vectors.
18899 if (Idx == UsedTEs.size()) {
18900 // If the number of input vectors is greater than 2 - not a permutation,
18901 // fallback to the regular gather.
18902 // TODO: support multiple reshuffled nodes.
18903 if (UsedTEs.size() == 2)
18904 continue;
18905 UsedTEs.push_back(SavedVToTEs);
18906 Idx = UsedTEs.size() - 1;
18907 }
18908 UsedValuesEntry.try_emplace(V, Idx);
18909 }
18910 }
18911
18912 if (UsedTEs.empty()) {
18913 Entries.clear();
18914 return std::nullopt;
18915 }
18916
18917 unsigned VF = 0;
18918 if (UsedTEs.size() == 1) {
18919 // Keep the order to avoid non-determinism.
18920 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
18921 UsedTEs.front().end());
18922 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
18923 return TE1->Idx < TE2->Idx;
18924 });
18925 // Try to find the perfect match in another gather node at first.
18926 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
18927 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
18928 });
18929 if (It != FirstEntries.end() &&
18930 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
18931 ((*It)->getVectorFactor() == TE->Scalars.size() &&
18932 TE->ReuseShuffleIndices.size() == VL.size() &&
18933 (*It)->isSame(TE->Scalars)))) {
18934 Entries.push_back(*It);
18935 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
18936 std::iota(std::next(Mask.begin(), Part * VL.size()),
18937 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
18938 } else {
18939 SmallVector<int> CommonMask = TE->getCommonMask();
18940 copy(CommonMask, Mask.begin());
18941 }
18942 // Clear undef scalars.
18943 for (unsigned I : seq<unsigned>(VL.size()))
18944 if (isa<PoisonValue>(VL[I]))
18945 Mask[Part * VL.size() + I] = PoisonMaskElem;
18947 }
18948 // No perfect match, just shuffle, so choose the first tree node from the
18949 // tree.
18950 Entries.push_back(FirstEntries.front());
18951 // Update mapping between values and corresponding tree entries.
18952 for (auto &P : UsedValuesEntry)
18953 P.second = 0;
18954 VF = FirstEntries.front()->getVectorFactor();
18955 } else {
18956 // Try to find nodes with the same vector factor.
18957 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
18958 // Keep the order of tree nodes to avoid non-determinism.
18959 DenseMap<int, const TreeEntry *> VFToTE;
18960 for (const TreeEntry *TE : UsedTEs.front()) {
18961 unsigned VF = TE->getVectorFactor();
18962 auto It = VFToTE.find(VF);
18963 if (It != VFToTE.end()) {
18964 if (It->second->Idx > TE->Idx)
18965 It->getSecond() = TE;
18966 continue;
18967 }
18968 VFToTE.try_emplace(VF, TE);
18969 }
18970 // Same, keep the order to avoid non-determinism.
18971 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
18972 UsedTEs.back().end());
18973 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
18974 return TE1->Idx < TE2->Idx;
18975 });
18976 for (const TreeEntry *TE : SecondEntries) {
18977 auto It = VFToTE.find(TE->getVectorFactor());
18978 if (It != VFToTE.end()) {
18979 VF = It->first;
18980 Entries.push_back(It->second);
18981 Entries.push_back(TE);
18982 break;
18983 }
18984 }
18985 // No 2 source vectors with the same vector factor - just choose 2 with max
18986 // index.
18987 if (Entries.empty()) {
18989 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
18990 return TE1->Idx < TE2->Idx;
18991 }));
18992 Entries.push_back(SecondEntries.front());
18993 VF = std::max(Entries.front()->getVectorFactor(),
18994 Entries.back()->getVectorFactor());
18995 } else {
18996 VF = Entries.front()->getVectorFactor();
18997 }
18998 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
18999 for (const TreeEntry *E : Entries)
19000 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
19001 E->Scalars.end());
19002 // Update mapping between values and corresponding tree entries.
19003 for (auto &P : UsedValuesEntry) {
19004 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
19005 if (ValuesToEntries[Idx].contains(P.first)) {
19006 P.second = Idx;
19007 break;
19008 }
19009 }
19010 }
19011
19012 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
19013 // Checks if the 2 PHIs are compatible in terms of high possibility to be
19014 // vectorized.
19015 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
19016 auto *PHI = cast<PHINode>(V);
19017 auto *PHI1 = cast<PHINode>(V1);
19018 // Check that all incoming values are compatible/from same parent (if they
19019 // are instructions).
19020 // The incoming values are compatible if they all are constants, or
19021 // instruction with the same/alternate opcodes from the same basic block.
19022 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
19023 Value *In = PHI->getIncomingValue(I);
19024 Value *In1 = PHI1->getIncomingValue(I);
19025 if (isConstant(In) && isConstant(In1))
19026 continue;
19027 if (!getSameOpcode({In, In1}, *TLI))
19028 return false;
19029 if (cast<Instruction>(In)->getParent() !=
19031 return false;
19032 }
19033 return true;
19034 };
19035 // Check if the value can be ignored during analysis for shuffled gathers.
19036 // We suppose it is better to ignore instruction, which do not form splats,
19037 // are not vectorized/not extractelements (these instructions will be handled
19038 // by extractelements processing) or may form vector node in future.
19039 auto MightBeIgnored = [=](Value *V) {
19040 auto *I = dyn_cast<Instruction>(V);
19041 return I && !IsSplatOrUndefs && !isVectorized(I) &&
19043 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
19044 };
19045 // Check that the neighbor instruction may form a full vector node with the
19046 // current instruction V. It is possible, if they have same/alternate opcode
19047 // and same parent basic block.
19048 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
19049 Value *V1 = VL[Idx];
19050 bool UsedInSameVTE = false;
19051 auto It = UsedValuesEntry.find(V1);
19052 if (It != UsedValuesEntry.end())
19053 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
19054 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
19055 getSameOpcode({V, V1}, *TLI) &&
19056 cast<Instruction>(V)->getParent() ==
19057 cast<Instruction>(V1)->getParent() &&
19058 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
19059 };
19060 // Build a shuffle mask for better cost estimation and vector emission.
19061 SmallBitVector UsedIdxs(Entries.size());
19063 for (int I = 0, E = VL.size(); I < E; ++I) {
19064 Value *V = VL[I];
19065 auto It = UsedValuesEntry.find(V);
19066 if (It == UsedValuesEntry.end())
19067 continue;
19068 // Do not try to shuffle scalars, if they are constants, or instructions
19069 // that can be vectorized as a result of the following vector build
19070 // vectorization.
19071 if (isConstant(V) || (MightBeIgnored(V) &&
19072 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
19073 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
19074 continue;
19075 unsigned Idx = It->second;
19076 EntryLanes.emplace_back(Idx, I);
19077 UsedIdxs.set(Idx);
19078 }
19079 // Iterate through all shuffled scalars and select entries, which can be used
19080 // for final shuffle.
19082 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
19083 if (!UsedIdxs.test(I))
19084 continue;
19085 // Fix the entry number for the given scalar. If it is the first entry, set
19086 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
19087 // These indices are used when calculating final shuffle mask as the vector
19088 // offset.
19089 for (std::pair<unsigned, int> &Pair : EntryLanes)
19090 if (Pair.first == I)
19091 Pair.first = TempEntries.size();
19092 TempEntries.push_back(Entries[I]);
19093 }
19094 Entries.swap(TempEntries);
19095 if (EntryLanes.size() == Entries.size() &&
19096 !VL.equals(ArrayRef(TE->Scalars)
19097 .slice(Part * VL.size(),
19098 std::min<int>(VL.size(), TE->Scalars.size())))) {
19099 // We may have here 1 or 2 entries only. If the number of scalars is equal
19100 // to the number of entries, no need to do the analysis, it is not very
19101 // profitable. Since VL is not the same as TE->Scalars, it means we already
19102 // have some shuffles before. Cut off not profitable case.
19103 Entries.clear();
19104 return std::nullopt;
19105 }
19106 // Build the final mask, check for the identity shuffle, if possible.
19107 bool IsIdentity = Entries.size() == 1;
19108 // Pair.first is the offset to the vector, while Pair.second is the index of
19109 // scalar in the list.
19110 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
19111 unsigned Idx = Part * VL.size() + Pair.second;
19112 Mask[Idx] =
19113 Pair.first * VF +
19114 (ForOrder ? std::distance(
19115 Entries[Pair.first]->Scalars.begin(),
19116 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
19117 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
19118 IsIdentity &= Mask[Idx] == Pair.second;
19119 }
19120 if (ForOrder || IsIdentity || Entries.empty()) {
19121 switch (Entries.size()) {
19122 case 1:
19123 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
19125 break;
19126 case 2:
19127 if (EntryLanes.size() > 2 || VL.size() <= 2)
19129 break;
19130 default:
19131 break;
19132 }
19133 } else if (!isa<VectorType>(VL.front()->getType()) &&
19134 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
19135 // Do the cost estimation if shuffle beneficial than buildvector.
19136 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
19137 std::next(Mask.begin(), (Part + 1) * VL.size()));
19138 int MinElement = SubMask.front(), MaxElement = SubMask.front();
19139 for (int Idx : SubMask) {
19140 if (Idx == PoisonMaskElem)
19141 continue;
19142 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
19143 MinElement = Idx;
19144 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
19145 MaxElement = Idx;
19146 }
19147 assert(MaxElement >= 0 && MinElement >= 0 &&
19148 MaxElement % VF >= MinElement % VF &&
19149 "Expected at least single element.");
19150 unsigned NewVF = std::max<unsigned>(
19151 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
19152 (MaxElement % VF) -
19153 (MinElement % VF) + 1));
19154 if (NewVF < VF) {
19155 for (int &Idx : SubMask) {
19156 if (Idx == PoisonMaskElem)
19157 continue;
19158 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
19159 (Idx >= static_cast<int>(VF) ? NewVF : 0);
19160 }
19161 } else {
19162 NewVF = VF;
19163 }
19164
19166 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
19167 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
19168 auto GetShuffleCost = [&,
19169 &TTI = *TTI](ArrayRef<int> Mask,
19171 VectorType *VecTy) -> InstructionCost {
19172 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
19174 Mask, Entries.front()->getInterleaveFactor()))
19175 return TTI::TCC_Free;
19176 return ::getShuffleCost(TTI,
19177 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
19179 VecTy, Mask, CostKind);
19180 };
19181 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
19182 InstructionCost FirstShuffleCost = 0;
19183 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
19184 if (Entries.size() == 1 || !Entries[0]->isGather()) {
19185 FirstShuffleCost = ShuffleCost;
19186 } else {
19187 // Transform mask to include only first entry.
19188 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
19189 bool IsIdentity = true;
19190 for (auto [I, Idx] : enumerate(FirstMask)) {
19191 if (Idx >= static_cast<int>(NewVF)) {
19192 Idx = PoisonMaskElem;
19193 } else {
19194 DemandedElts.clearBit(I);
19195 if (Idx != PoisonMaskElem)
19196 IsIdentity &= static_cast<int>(I) == Idx;
19197 }
19198 }
19199 if (!IsIdentity)
19200 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
19201 FirstShuffleCost += getScalarizationOverhead(
19202 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
19203 /*Extract=*/false, CostKind);
19204 }
19205 InstructionCost SecondShuffleCost = 0;
19206 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
19207 if (Entries.size() == 1 || !Entries[1]->isGather()) {
19208 SecondShuffleCost = ShuffleCost;
19209 } else {
19210 // Transform mask to include only first entry.
19211 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
19212 bool IsIdentity = true;
19213 for (auto [I, Idx] : enumerate(SecondMask)) {
19214 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
19215 Idx = PoisonMaskElem;
19216 } else {
19217 DemandedElts.clearBit(I);
19218 if (Idx != PoisonMaskElem) {
19219 Idx -= NewVF;
19220 IsIdentity &= static_cast<int>(I) == Idx;
19221 }
19222 }
19223 }
19224 if (!IsIdentity)
19225 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
19226 SecondShuffleCost += getScalarizationOverhead(
19227 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
19228 /*Extract=*/false, CostKind);
19229 }
19230 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
19231 for (auto [I, Idx] : enumerate(SubMask))
19232 if (Idx == PoisonMaskElem)
19233 DemandedElts.clearBit(I);
19234 InstructionCost BuildVectorCost = getScalarizationOverhead(
19235 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
19236 /*Extract=*/false, CostKind);
19237 const TreeEntry *BestEntry = nullptr;
19238 if (FirstShuffleCost < ShuffleCost) {
19239 std::for_each(std::next(Mask.begin(), Part * VL.size()),
19240 std::next(Mask.begin(), (Part + 1) * VL.size()),
19241 [&](int &Idx) {
19242 if (Idx >= static_cast<int>(VF))
19243 Idx = PoisonMaskElem;
19244 });
19245 BestEntry = Entries.front();
19246 ShuffleCost = FirstShuffleCost;
19247 }
19248 if (SecondShuffleCost < ShuffleCost) {
19249 std::for_each(std::next(Mask.begin(), Part * VL.size()),
19250 std::next(Mask.begin(), (Part + 1) * VL.size()),
19251 [&](int &Idx) {
19252 if (Idx < static_cast<int>(VF))
19253 Idx = PoisonMaskElem;
19254 else
19255 Idx -= VF;
19256 });
19257 BestEntry = Entries[1];
19258 ShuffleCost = SecondShuffleCost;
19259 }
19260 if (BuildVectorCost >= ShuffleCost) {
19261 if (BestEntry) {
19262 Entries.clear();
19263 Entries.push_back(BestEntry);
19264 }
19265 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
19267 }
19268 }
19269 Entries.clear();
19270 // Clear the corresponding mask elements.
19271 std::fill(std::next(Mask.begin(), Part * VL.size()),
19272 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
19273 return std::nullopt;
19274}
19275
// Tries to model the gather node \p TE (for the scalar list \p VL) as one
// shuffle per vector register ("part") of already-vectorized tree entries.
// On success, \p Mask holds the combined shuffle mask and \p Entries the
// 1-2 source entries chosen for each part; the result holds one optional
// shuffle kind per part (std::nullopt for parts that must stay gathered).
// NOTE(review): the return-type line of this definition
// (SmallVector<std::optional<TTI::ShuffleKind>>) appears to have been dropped
// by the extraction -- verify against upstream.
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  // Combined gather nodes (EdgeIdx == UINT_MAX) that are trivial to rebuild
  // (extractelements, splats, or duplicates of an existing entry) are not
  // worth modeling as shuffles of other entries.
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       (TE->hasState() &&
        getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  // NOTE(review): the declaration of `Res` (the per-part result vector)
  // appears to have been dropped by the extraction here -- verify upstream.
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Analyze each register-sized slice of VL independently.
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    // If a single full-width entry covers the whole node, use an identity
    // shuffle of that one entry for every part and finish early.
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
      // NOTE(review): the body of the if above (presumably
      // `Mask[I] = PoisonMaskElem;`) and the re-initialization of `Res`
      // appear to have been dropped by the extraction -- verify upstream.
      Entries.emplace_back(1, LocalSubEntries.front());
      return Res;
    }
  }
  // If no part could be modeled as a shuffle, report full failure.
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
19346
/// Estimates the cost of materializing the scalars \p VL as a vector of
/// ScalarTy x VL.size(). Non-undef constants are modeled as a constant vector
/// blended in with one shuffle (unless \p ForPoisonSrc), while the remaining
/// scalars are costed as element inserts (plus a trunc cast when a scalar is
/// wider than \p ScalarTy).
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt DemandedElements = APInt::getZero(VF);
  // NOTE(review): the declarations of `Cost` and `CostKind` appear to have
  // been dropped by the extraction here -- verify against upstream.
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    DemandedElements.setBit(I);
    // Scalars wider than ScalarTy require an extra trunc before insertion.
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
      // NOTE(review): the continuation of this call (cast context/cost kind
      // arguments) appears to have been dropped by the extraction.
  };
  // Identity mask; constant lanes are redirected to the second (constant)
  // source below.
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
      continue;

    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    EstimateInsertCost(I, V);
  }
  // FIXME: add a cost for constant vector materialization.
  bool IsAnyNonUndefConst =
      any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
  // 1. Shuffle input source vector and constant vector.
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    // NOTE(review): the shuffle-cost accumulation line opening this call
    // appears to have been dropped by the extraction -- verify upstream.
    ConstantShuffleMask);
  }

  // 2. Insert unique non-constants.
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true,
                                     /*Extract=*/false, CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
19393
/// Returns the instruction after which vectorized code for bundle \p E must be
/// emitted. Results are memoized in EntryToLastInstruction. Depending on the
/// node kind this is the program-order last (or, for gathered loads, first)
/// scalar of the bundle, or the last instruction of its schedule bundle.
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  // Memoized result from a previous query?
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *cast<Instruction>(It->second);
  Instruction *Res = nullptr;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads or copyables).
  Instruction *Front;
  unsigned Opcode;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    // Stateless node: derive the anchor from the first instruction scalar.
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
    Opcode = Front->getOpcode();
  }
  auto *BB = Front->getParent();
  assert(
      ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
        E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
       E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
       all_of(E->Scalars,
              [=](Value *V) -> bool {
                if (Opcode == Instruction::GetElementPtr &&
                    !isa<GetElementPtrInst>(V))
                  return true;
                auto *I = dyn_cast<Instruction>(V);
                return !I || !E->getMatchingMainOpOrAltOp(I) ||
                       I->getParent() == BB || isVectorLikeInstWithConstOps(I);
              })) &&
      "Expected gathered loads or GEPs or instructions from same basic "
      "block.");

  // Finds the program-order last non-copyable scalar of the bundle; for
  // scalars in different blocks, dominator-tree DFS-in numbers break the tie.
  // Also updates BB to the block of the returned instruction.
  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      // NOTE(review): two condition lines of this assert appear to have been
      // dropped by the extraction (it is unbalanced as shown) -- verify
      // against upstream.
      assert(((Opcode == Instruction::GetElementPtr &&
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // Prefer any reachable block over an unreachable one.
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  // Mirror of FindLastInst: finds the program-order first non-copyable scalar
  // (used for gathered-load entries, which are emitted at the first load).
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      // NOTE(review): condition lines of this assert appear to have been
      // dropped by the extraction (it is unbalanced as shown) -- verify
      // against upstream.
      assert(((Opcode == Instruction::GetElementPtr &&
              (isVectorLikeInstWithConstOps(FirstInst) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Split-vectorize nodes: also must not precede the vectorized position of
  // any entry that shares the found instruction.
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
      for (auto *E : Entries) {
        auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
        if (!I)
          I = &getLastInstructionInBundle(E);
        if (Res->getParent() == I->getParent() && Res->comesBefore(I))
          Res = I;
      }
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Set insertpoint for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    if (E->isGather())
      return nullptr;
    // Found previously that the instruction do not need to be scheduled.
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || isa<PHINode>(I) ||
          (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
        continue;
      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
      if (Bundles.empty())
        continue;
      // Pick the bundle that belongs to this very tree entry.
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    // Unscheduled vectorized node: choose last vs. first scalar by heuristics
    // (GEP nodes with non-GEP scalars, poison/copyable/out-of-block users...).
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        (all_of(E->Scalars,
                [&](Value *V) {
                  return isa<PoisonValue>(V) ||
                         (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                         E->isCopyableElement(V) ||
                         (!isVectorLikeInstWithConstOps(V) &&
                          isUsedOutsideBlock(V));
                }) &&
         (!E->doesNotNeedToSchedule() ||
          any_of(E->Scalars,
                 [&](Value *V) {
                   if (!isa<Instruction>(V) ||
                       (E->hasCopyableElements() && E->isCopyableElement(V)))
                     return false;
                   return !areAllOperandsNonInsts(V);
                 }) ||
          none_of(E->Scalars, [&](Value *V) {
            if (!isa<Instruction>(V) ||
                (E->hasCopyableElements() && E->isCopyableElement(V)))
              return false;
            return MustGather.contains(V);
          }))))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTreeRec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  //
  // If this happens, we can still find the last instruction by brute force. We
  // iterate forwards from Front (inclusive) until we either see all
  // instructions in the bundle or reach the end of the block. If Front is the
  // last instruction in program order, LastInst will be set to Front, and we
  // will visit all the remaining instructions in the block.
  //
  // One of the reasons we exit early from buildTreeRec is to place an upper
  // bound on compile-time. Thus, taking an additional compile-time hit here is
  // not ideal. However, this should be exceedingly rare since it requires that
  // we both exit early from buildTreeRec and that the bundle be out-of-order
  // (causing us to iterate all the way to the end of the block).
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
19626
19627void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
19628 auto *Front = E->getMainOp();
19629 Instruction *LastInst = &getLastInstructionInBundle(E);
19630 assert(LastInst && "Failed to find last instruction in bundle");
19631 BasicBlock::iterator LastInstIt = LastInst->getIterator();
19632 // If the instruction is PHI, set the insert point after all the PHIs.
19633 bool IsPHI = isa<PHINode>(LastInst);
19634 if (IsPHI) {
19635 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
19636 if (LastInstIt != LastInst->getParent()->end() &&
19637 LastInstIt->getParent()->isLandingPad())
19638 LastInstIt = std::next(LastInstIt);
19639 }
19640 if (IsPHI ||
19641 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
19642 (E->doesNotNeedToSchedule() ||
19643 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
19644 isUsedOutsideBlock(LastInst)))) ||
19645 (GatheredLoadsEntriesFirst.has_value() &&
19646 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
19647 E->getOpcode() == Instruction::Load)) {
19648 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
19649 } else {
19650 // Set the insertion point after the last instruction in the bundle. Set the
19651 // debug location to Front.
19652 Builder.SetInsertPoint(
19653 LastInst->getParent(),
19654 LastInst->getNextNode()->getIterator());
19655 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
19656 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
19657 } else {
19658 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
19659 PoisonValue::get(Builder.getPtrTy()),
19660 MaybeAlign());
19661 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
19662 eraseInstruction(Res);
19663 LastInstructionToPos.try_emplace(LastInst, Res);
19664 }
19665 }
19666 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
19667}
19668
/// Builds a vector from the scalars in \p VL via insertelement instructions
/// (optionally on top of \p Root, blended in with \p CreateShuffle).
/// Constants are inserted first; instructions that belong to the current
/// loop/block are postponed to the end to improve hoisting opportunities.
/// Also records external uses of vectorized scalars that are re-inserted here.
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  // NOTE(review): the declaration of `PostponedInsts` appears to have been
  // dropped by the extraction here -- verify against upstream.
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  // True if InstBB is reached by walking single predecessors from InsertBB
  // (i.e. the instruction's block straight-line-dominates the insert block).
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  // Collect the lanes whose insertion must be postponed.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  // Inserts scalar V into lane Pos of Vec (casting to Ty if needed) and
  // records the new user for later external-use extraction.
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    // Drop NUW from trunc to avoid incorrect codegen.
    Value *Trunced;
    if (match(Scalar, m_NUWTrunc(m_Value(Trunced))))
      cast<TruncInst>(Scalar)->setHasNoUnsignedWrap(/*B=*/false);
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      // If Scalar is itself a cast of a still-scalar value, cast from the
      // original operand instead to avoid a chain of casts.
      if (auto *CI = dyn_cast<CastInst>(Scalar);
      // NOTE(review): the continuation of this if-condition (and its opening
      // brace) appears to have been dropped by the extraction -- verify
      // against upstream.
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // Revectorization: the "scalar" is itself a vector; insert it as a
      // subvector.
      assert(SLPReVec && "FixedVectorType is not expected.");
      Vec =
          createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
      auto *II = dyn_cast<Instruction>(Vec);
      if (!II)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
      const auto *It = find_if(Entries, [&](const TreeEntry *E) {
        return !TransformedToGatherNodes.contains(E) &&
               !DeletedNodes.contains(E);
      });
      if (It != Entries.end()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          // A cast was emitted above; that cast is the user of V.
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
                SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
              // Find shufflevector, caused by resize.
              auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
                if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                  if (SV->getOperand(0) == V)
                    return SV;
                  if (SV->getOperand(1) == V)
                    return SV;
                }
                return nullptr;
              };
              InsElt = nullptr;
              if (Instruction *User = FindOperand(SV->getOperand(0), V))
                InsElt = User;
              else if (Instruction *User = FindOperand(SV->getOperand(1), V))
                InsElt = User;
              assert(InsElt &&
                     "Failed to find shufflevector, caused by resize.");
            }
          }
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = (*It)->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  // Look through a poison-padded single-source shuffle of the root and take
  // its mask as the starting mask.
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // Redirect this lane to the freshly inserted constant (second source).
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      // No constants were inserted: reuse the root as-is.
      Vec = OriginalRoot;
    } else {
      // Blend root and the constants; drop the now-dead original root if no
      // tree entry still refers to it.
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->use_empty() &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
19827
19828/// Merges shuffle masks and emits final shuffle instruction, if required. It
19829/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
19830/// when the actual shuffle instruction is generated only if this is actually
19831/// required. Otherwise, the shuffle instruction emission is delayed till the
19832/// end of the process, to reduce the number of emitted instructions and further
19833/// analysis/transformations.
19834/// The class also will look through the previously emitted shuffle instructions
19835/// and properly mark indices in mask as undef.
19836/// For example, given the code
19837/// \code
19838/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
19839/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
19840/// \endcode
19841/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
19842/// look through %s1 and %s2 and emit
19843/// \code
19844/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19845/// \endcode
19846/// instead.
19847/// If 2 operands are of different size, the smallest one will be resized and
19848/// the mask recalculated properly.
19849/// For example, given the code
19850/// \code
19851/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
19852/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
19853/// \endcode
19854/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
19855/// look through %s1 and %s2 and emit
19856/// \code
19857/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19858/// \endcode
19859/// instead.
19860class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
19861 bool IsFinalized = false;
19862 /// Combined mask for all applied operands and masks. It is built during
19863 /// analysis and actual emission of shuffle vector instructions.
19864 SmallVector<int> CommonMask;
19865 /// List of operands for the shuffle vector instruction. It hold at max 2
19866 /// operands, if the 3rd is going to be added, the first 2 are combined into
19867 /// shuffle with \p CommonMask mask, the first operand sets to be the
19868 /// resulting shuffle and the second operand sets to be the newly added
19869 /// operand. The \p CommonMask is transformed in the proper way after that.
19870 SmallVector<Value *, 2> InVectors;
19871 IRBuilderBase &Builder;
19872 BoUpSLP &R;
19873
19874 class ShuffleIRBuilder {
19875 IRBuilderBase &Builder;
19876 /// Holds all of the instructions that we gathered.
19877 SetVector<Instruction *> &GatherShuffleExtractSeq;
19878 /// A list of blocks that we are going to CSE.
19879 DenseSet<BasicBlock *> &CSEBlocks;
19880 /// Data layout.
19881 const DataLayout &DL;
19882
19883 public:
19884 ShuffleIRBuilder(IRBuilderBase &Builder,
19885 SetVector<Instruction *> &GatherShuffleExtractSeq,
19886 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
19887 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19888 CSEBlocks(CSEBlocks), DL(DL) {}
19889 ~ShuffleIRBuilder() = default;
19890 /// Creates shufflevector for the 2 operands with the given mask.
19891 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
19892 if (V1->getType() != V2->getType()) {
19894 V1->getType()->isIntOrIntVectorTy() &&
19895 "Expected integer vector types only.");
19896 if (V1->getType() != V2->getType()) {
19897 if (cast<VectorType>(V2->getType())
19898 ->getElementType()
19899 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
19900 ->getElementType()
19901 ->getIntegerBitWidth())
19902 V2 = Builder.CreateIntCast(
19903 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
19904 else
19905 V1 = Builder.CreateIntCast(
19906 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
19907 }
19908 }
19909 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19910 if (auto *I = dyn_cast<Instruction>(Vec)) {
19911 GatherShuffleExtractSeq.insert(I);
19912 CSEBlocks.insert(I->getParent());
19913 }
19914 return Vec;
19915 }
19916 /// Creates permutation of the single vector operand with the given mask, if
19917 /// it is not identity mask.
19918 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
19919 if (Mask.empty())
19920 return V1;
19921 unsigned VF = Mask.size();
19922 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
19923 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
19924 return V1;
19925 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
19926 if (auto *I = dyn_cast<Instruction>(Vec)) {
19927 GatherShuffleExtractSeq.insert(I);
19928 CSEBlocks.insert(I->getParent());
19929 }
19930 return Vec;
19931 }
19932 Value *createIdentity(Value *V) { return V; }
19933 Value *createPoison(Type *Ty, unsigned VF) {
19934 return PoisonValue::get(getWidenedType(Ty, VF));
19935 }
19936 /// Resizes 2 input vector to match the sizes, if the they are not equal
19937 /// yet. The smallest vector is resized to the size of the larger vector.
19938 void resizeToMatch(Value *&V1, Value *&V2) {
19939 if (V1->getType() == V2->getType())
19940 return;
19941 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
19942 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
19943 int VF = std::max(V1VF, V2VF);
19944 int MinVF = std::min(V1VF, V2VF);
19945 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
19946 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
19947 0);
19948 Value *&Op = MinVF == V1VF ? V1 : V2;
19949 Op = Builder.CreateShuffleVector(Op, IdentityMask);
19950 if (auto *I = dyn_cast<Instruction>(Op)) {
19951 GatherShuffleExtractSeq.insert(I);
19952 CSEBlocks.insert(I->getParent());
19953 }
19954 if (MinVF == V1VF)
19955 V1 = Op;
19956 else
19957 V2 = Op;
19958 }
19959 };
19960
19961 /// Smart shuffle instruction emission, walks through shuffles trees and
19962 /// tries to find the best matching vector for the actual shuffle
19963 /// instruction.
19964 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
19965 assert(V1 && "Expected at least one vector value.");
19966 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
19967 R.CSEBlocks, *R.DL);
19968 return BaseShuffleAnalysis::createShuffle<Value *>(
19969 V1, V2, Mask, ShuffleBuilder, ScalarTy);
19970 }
19971
19972 /// Cast value \p V to the vector type with the same number of elements, but
19973 /// the base type \p ScalarTy.
19974 Value *castToScalarTyElem(Value *V,
19975 std::optional<bool> IsSigned = std::nullopt) {
19976 auto *VecTy = cast<VectorType>(V->getType());
19977 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
19978 if (VecTy->getElementType() == ScalarTy->getScalarType())
19979 return V;
19980 return Builder.CreateIntCast(
19981 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
19982 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
19983 }
19984
19985 Value *getVectorizedValue(const TreeEntry &E) {
19986 Value *Vec = E.VectorizedValue;
19987 if (!Vec->getType()->isIntOrIntVectorTy())
19988 return Vec;
19989 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
19990 return !isa<PoisonValue>(V) &&
19991 !isKnownNonNegative(
19992 V, SimplifyQuery(*R.DL));
19993 }));
19994 }
19995
public:
  // NOTE(review): the constructor signature line appears truncated in this
  // excerpt — TODO confirm against the full file. The initializer list below
  // seeds the base analysis with ScalarTy and captures the IR builder and
  // the owning BoUpSLP instance.
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them: rewrites \p Mask to index
  /// the (vectorized) source vectors directly, erases extractelements whose
  /// only use was vectorized, and, for multi-register cases, joins per-part
  /// shuffles into one long virtual vector. Returns the single base vector
  /// (cast to the scalar element type) when one suffices, otherwise the
  /// joined vector; \p UseVecBaseAsInput reports which case happened.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    // Work on the scalars in their reordered (emission) order.
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Collect unique source vectors and erase extracts that are fully
    // subsumed by the vectorized tree.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
           !R.isVectorized(EI) &&
           count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
               count_if(E->UserTreeIndex.UserTE->Scalars,
                        [&](Value *V) { return V == EI; })) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   any_of(UTEs,
                          [&](const TreeEntry *TE) {
                            return R.DeletedNodes.contains(TE) ||
                                   R.TransformedToGatherNodes.contains(TE);
                          }) ||
                   // NOTE(review): one line of this disjunction is missing
                   // from this excerpt (the parenthesization below does not
                   // balance without it) — verify against the full file.
                   !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   (!UTEs.empty() &&
                    count_if(R.VectorizableTree,
                             [&](const std::unique_ptr<TreeEntry> &TE) {
                               return TE->UserTreeIndex.UserTE ==
                                          UTEs.front() &&
                                      is_contained(VL, EI);
                             }) != 1);
          }))
        continue;
      R.eraseInstruction(EI);
    }
    // Single part or a single unique source: the base vector itself is the
    // shuffle input.
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining them into a single virtual
    // long vector.
    // Need to shuffle each part independently and then insert all this parts
    // into a long virtual vector register, forming the original vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      // VF of this part = widest source vector referenced by its mask.
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      // Bucket the (at most two) distinct source vectors by mask index.
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        // First non-trivial part becomes the accumulator.
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> SubMask =
                                          Mask.slice(P * SliceSize,
                                                     getNumElems(Mask.size(),
                                                                 SliceSize, P));
                                      return all_of(SubMask, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    // Publish the final (identity-transformed) mask back to the caller.
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  /// \returns std::nullopt when all dependencies already have vectorized
  /// values; otherwise a placeholder value of the final vector type.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              // NOTE(review): the dependency-list parameter line is missing
              // from this excerpt; the body iterates it (as "Deps") as ranges
              // of TreeEntry pointers — verify against the full file.
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    // Placeholder: a load from a poison pointer of the right vector type,
    // replaced once the dependencies have been emitted.
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
  /// Reset the builder to handle perfect diamond match.
  // NOTE(review): the function signature line (resetForSameNode) is missing
  // from this excerpt — verify against the full file. The body discards all
  // accumulated state so the builder can be reused for the matched node.
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
  }
20169 /// Adds 2 input vectors (in form of tree entries) and the mask for their
20170 /// shuffling.
20171 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
20172 Value *V1 = getVectorizedValue(E1);
20173 Value *V2 = getVectorizedValue(E2);
20174 add(V1, V2, Mask);
20175 }
20176 /// Adds single input vector (in form of tree entry) and the mask for its
20177 /// shuffling.
20178 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
20179 Value *V1 = getVectorizedValue(E1);
20180 add(V1, Mask);
20181 }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    // NOTE(review): the opening lines of a second assert are missing from
    // this excerpt; only its message string survives below — verify against
    // the full file.
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      // First pair: just remember the operands and the common mask.
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    // Fold any previously accumulated vectors into a single one first.
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    // Combine the new pair, then point the common mask's touched lanes at
    // the second operand slot.
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    // NOTE(review): the opening line of an assert is missing from this
    // excerpt; only its message string survives below — verify against the
    // full file.
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      // V1 is a new vector. If both slots are taken or the types differ,
      // fold the accumulated vectors first, then append V1.
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if second vector is required if the used elements are already
      // used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    // Merge the new mask into the common mask, offsetting by the widest VF
    // when V1 occupies the second slot.
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Adds another one input vector and the mask for the shuffling.
  // NOTE(review): the signature line (taking \p V1 and a reorder \p Order)
  // is missing from this excerpt — verify against the full file. The body
  // converts the order into a shuffle mask and delegates to add(V1, Mask).
    SmallVector<int> NewMask;
    inversePermutation(Order, NewMask);
    add(V1, NewMask);
  }
20275 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
20276 Value *Root = nullptr) {
20277 return R.gather(VL, Root, ScalarTy,
20278 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20279 return createShuffle(V1, V2, Mask);
20280 });
20281 }
20282 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before final applying of
  /// the \p ExtMask mask.
  // NOTE(review): the declaration head (return type, name and the
  // function_ref type of \p Action) is missing from this excerpt — verify
  // against the full file; only the trailing parameters are visible below.
              ArrayRef<int> ExtMask,
              ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
              ArrayRef<int> SubVectorsMask, unsigned VF = 0,
              Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Collapse accumulated vectors to one, resize it to VF if narrower,
      // then let the caller-provided action post-process it.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      // Collapse to a single vector and splice in the requested subvectors.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
          // Use scalar version of the SCalarType to correctly handle shuffles
          // for revectorization. The revectorization mode operates by the
          // vectors, but here we need to operate on the scalars, because the
          // masks were already transformed for the vector elements and we don't
          // need doing this transformation again.
          Type *OrigScalarTy = ScalarTy;
          ScalarTy = ScalarTy->getScalarType();
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
                        _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        // Blend the inserted subvectors with the existing lanes per the
        // subvectors mask.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        // Compose the external mask on top of the accumulated common mask.
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }
20390
  // NOTE(review): the destructor signature line (~ShuffleInstructionBuilder)
  // is missing from this excerpt — verify against the full file. On
  // destruction we check that finalize() was called whenever a common mask
  // was accumulated.
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
20396
20397Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
20398 return vectorizeTree(getOperandEntry(E, NodeIdx));
20399}
20400
20401template <typename BVTy, typename ResTy, typename... Args>
20402ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
20403 Args &...Params) {
20404 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
20405 "Expected gather node.");
20406 unsigned VF = E->getVectorFactor();
20407
20408 bool NeedFreeze = false;
20409 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
20410 // Do not process split vectorize node, marked to be gathers/buildvectors.
20412 E->CombinedEntriesWithIndices.size());
20413 if (E->State == TreeEntry::SplitVectorize &&
20414 TransformedToGatherNodes.contains(E)) {
20415 SubVectors.clear();
20416 } else {
20417 // Clear values, to be replaced by insertvector instructions.
20418 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
20419 for_each(MutableArrayRef(GatheredScalars)
20420 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
20421 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
20422 transform(
20423 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
20424 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20425 });
20426 }
20427 // Build a mask out of the reorder indices and reorder scalars per this
20428 // mask.
20429 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
20430 E->ReorderIndices.end());
20431 if (!ReorderMask.empty())
20432 reorderScalars(GatheredScalars, ReorderMask);
20433 SmallVector<int> SubVectorsMask;
20434 inversePermutation(E->ReorderIndices, SubVectorsMask);
20435 // Transform non-clustered elements in the mask to poison (-1).
20436 // "Clustered" operations will be reordered using this mask later.
20437 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
20438 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
20439 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
20440 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
20441 } else {
20442 SubVectorsMask.clear();
20443 }
20444 SmallVector<Value *> StoredGS(GatheredScalars);
20445 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
20446 unsigned I, unsigned SliceSize,
20447 bool IsNotPoisonous) {
20448 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
20449 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
20450 }))
20451 return false;
20452 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
20453 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
20454 if (UserTE->getNumOperands() != 2)
20455 return false;
20456 if (!IsNotPoisonous) {
20457 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
20458 [=](const std::unique_ptr<TreeEntry> &TE) {
20459 return TE->UserTreeIndex.UserTE == UserTE &&
20460 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
20461 });
20462 if (It == VectorizableTree.end())
20463 return false;
20464 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
20465 if (!(*It)->ReorderIndices.empty()) {
20466 inversePermutation((*It)->ReorderIndices, ReorderMask);
20467 reorderScalars(GS, ReorderMask);
20468 }
20469 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
20470 Value *V0 = std::get<0>(P);
20471 Value *V1 = std::get<1>(P);
20472 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
20473 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
20474 is_contained(E->Scalars, V1));
20475 }))
20476 return false;
20477 }
20478 int Idx;
20479 if ((Mask.size() < InputVF &&
20480 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
20481 Idx == 0) ||
20482 (Mask.size() == InputVF &&
20483 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
20484 std::iota(
20485 std::next(Mask.begin(), I * SliceSize),
20486 std::next(Mask.begin(),
20487 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
20488 0);
20489 } else {
20490 unsigned IVal =
20491 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
20492 std::fill(
20493 std::next(Mask.begin(), I * SliceSize),
20494 std::next(Mask.begin(),
20495 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
20496 IVal);
20497 }
20498 return true;
20499 };
20500 BVTy ShuffleBuilder(ScalarTy, Params...);
20501 ResTy Res = ResTy();
20502 SmallVector<int> Mask;
20503 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
20505 Value *ExtractVecBase = nullptr;
20506 bool UseVecBaseAsInput = false;
20509 Type *OrigScalarTy = GatheredScalars.front()->getType();
20510 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
20511 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
20512 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
20513 // Check for gathered extracts.
20514 bool Resized = false;
20515 ExtractShuffles =
20516 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
20517 if (!ExtractShuffles.empty()) {
20518 SmallVector<const TreeEntry *> ExtractEntries;
20519 for (auto [Idx, I] : enumerate(ExtractMask)) {
20520 if (I == PoisonMaskElem)
20521 continue;
20522 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
20523 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
20524 !TEs.empty())
20525 ExtractEntries.append(TEs.begin(), TEs.end());
20526 }
20527 if (std::optional<ResTy> Delayed =
20528 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
20529 // Delay emission of gathers which are not ready yet.
20530 PostponedGathers.insert(E);
20531 // Postpone gather emission, will be emitted after the end of the
20532 // process to keep correct order.
20533 return *Delayed;
20534 }
20535 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
20536 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
20537 ExtractVecBase = VecBase;
20538 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
20539 if (VF == VecBaseTy->getNumElements() &&
20540 GatheredScalars.size() != VF) {
20541 Resized = true;
20542 GatheredScalars.append(VF - GatheredScalars.size(),
20543 PoisonValue::get(OrigScalarTy));
20544 NumParts =
20545 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
20546 }
20547 }
20548 }
20549 // Gather extracts after we check for full matched gathers only.
20550 if (!ExtractShuffles.empty() || !E->hasState() ||
20551 E->getOpcode() != Instruction::Load ||
20552 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
20553 any_of(E->Scalars, IsaPred<LoadInst>)) &&
20554 any_of(E->Scalars,
20555 [this](Value *V) {
20556 return isa<LoadInst>(V) && isVectorized(V);
20557 })) ||
20558 (E->hasState() && E->isAltShuffle()) ||
20559 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
20560 isSplat(E->Scalars) ||
20561 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
20562 GatherShuffles =
20563 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
20564 }
20565 if (!GatherShuffles.empty()) {
20566 if (std::optional<ResTy> Delayed =
20567 ShuffleBuilder.needToDelay(E, Entries)) {
20568 // Delay emission of gathers which are not ready yet.
20569 PostponedGathers.insert(E);
20570 // Postpone gather emission, will be emitted after the end of the
20571 // process to keep correct order.
20572 return *Delayed;
20573 }
20574 if (GatherShuffles.size() == 1 &&
20575 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
20576 Entries.front().front()->isSame(E->Scalars)) {
20577 // Perfect match in the graph, will reuse the previously vectorized
20578 // node. Cost is 0.
20579 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
20580 << shortBundleName(E->Scalars, E->Idx) << ".\n");
20581 // Restore the mask for previous partially matched values.
20582 Mask.resize(E->Scalars.size());
20583 const TreeEntry *FrontTE = Entries.front().front();
20584 if (FrontTE->ReorderIndices.empty() &&
20585 ((FrontTE->ReuseShuffleIndices.empty() &&
20586 E->Scalars.size() == FrontTE->Scalars.size()) ||
20587 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
20588 std::iota(Mask.begin(), Mask.end(), 0);
20589 } else {
20590 for (auto [I, V] : enumerate(E->Scalars)) {
20591 if (isa<PoisonValue>(V)) {
20592 Mask[I] = PoisonMaskElem;
20593 continue;
20594 }
20595 Mask[I] = FrontTE->findLaneForValue(V);
20596 }
20597 }
20598 // Reset the builder(s) to correctly handle perfect diamond matched
20599 // nodes.
20600 ShuffleBuilder.resetForSameNode();
20601 // Full matched entry found, no need to insert subvectors.
20602 if (equal(E->Scalars, FrontTE->Scalars) &&
20603 equal(E->ReorderIndices, FrontTE->ReorderIndices) &&
20604 equal(E->ReuseShuffleIndices, FrontTE->ReuseShuffleIndices)) {
20605 Mask.resize(FrontTE->getVectorFactor());
20606 std::iota(Mask.begin(), Mask.end(), 0);
20607 ShuffleBuilder.add(*FrontTE, Mask);
20608 Res = ShuffleBuilder.finalize({}, {}, {});
20609 } else {
20610 ShuffleBuilder.add(*FrontTE, Mask);
20611 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
20612 }
20613 return Res;
20614 }
20615 if (!Resized) {
20616 if (GatheredScalars.size() != VF &&
20617 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
20618 return any_of(TEs, [&](const TreeEntry *TE) {
20619 return TE->getVectorFactor() == VF;
20620 });
20621 }))
20622 GatheredScalars.append(VF - GatheredScalars.size(),
20623 PoisonValue::get(OrigScalarTy));
20624 }
20625 // Remove shuffled elements from list of gathers.
20626 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
20627 if (Mask[I] != PoisonMaskElem)
20628 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
20629 }
20630 }
20631 }
20632 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
20633 SmallVectorImpl<int> &ReuseMask,
20634 bool IsRootPoison) {
20635 // For splats with can emit broadcasts instead of gathers, so try to find
20636 // such sequences.
20637 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
20638 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
20639 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
20640 SmallVector<int> UndefPos;
20641 DenseMap<Value *, unsigned> UniquePositions;
20642 // Gather unique non-const values and all constant values.
20643 // For repeated values, just shuffle them.
20644 int NumNonConsts = 0;
20645 int SinglePos = 0;
20646 for (auto [I, V] : enumerate(Scalars)) {
20647 if (isa<UndefValue>(V)) {
20648 if (!isa<PoisonValue>(V)) {
20649 ReuseMask[I] = I;
20650 UndefPos.push_back(I);
20651 }
20652 continue;
20653 }
20654 if (isConstant(V)) {
20655 ReuseMask[I] = I;
20656 continue;
20657 }
20658 ++NumNonConsts;
20659 SinglePos = I;
20660 Value *OrigV = V;
20661 Scalars[I] = PoisonValue::get(OrigScalarTy);
20662 if (IsSplat) {
20663 Scalars.front() = OrigV;
20664 ReuseMask[I] = 0;
20665 } else {
20666 const auto Res = UniquePositions.try_emplace(OrigV, I);
20667 Scalars[Res.first->second] = OrigV;
20668 ReuseMask[I] = Res.first->second;
20669 }
20670 }
20671 if (NumNonConsts == 1) {
20672 // Restore single insert element.
20673 if (IsSplat) {
20674 ReuseMask.assign(VF, PoisonMaskElem);
20675 std::swap(Scalars.front(), Scalars[SinglePos]);
20676 if (!UndefPos.empty() && UndefPos.front() == 0)
20677 Scalars.front() = UndefValue::get(OrigScalarTy);
20678 }
20679 ReuseMask[SinglePos] = SinglePos;
20680 } else if (!UndefPos.empty() && IsSplat) {
20681 // For undef values, try to replace them with the simple broadcast.
20682 // We can do it if the broadcasted value is guaranteed to be
20683 // non-poisonous, or by freezing the incoming scalar value first.
20684 auto *It = find_if(Scalars, [this, E](Value *V) {
20685 return !isa<UndefValue>(V) &&
20687 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
20688 // Check if the value already used in the same operation in
20689 // one of the nodes already.
20690 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
20691 is_contained(E->UserTreeIndex.UserTE->Scalars,
20692 U.getUser());
20693 })));
20694 });
20695 if (It != Scalars.end()) {
20696 // Replace undefs by the non-poisoned scalars and emit broadcast.
20697 int Pos = std::distance(Scalars.begin(), It);
20698 for (int I : UndefPos) {
20699 // Set the undef position to the non-poisoned scalar.
20700 ReuseMask[I] = Pos;
20701 // Replace the undef by the poison, in the mask it is replaced by
20702 // non-poisoned scalar already.
20703 if (I != Pos)
20704 Scalars[I] = PoisonValue::get(OrigScalarTy);
20705 }
20706 } else {
20707 // Replace undefs by the poisons, emit broadcast and then emit
20708 // freeze.
20709 for (int I : UndefPos) {
20710 ReuseMask[I] = PoisonMaskElem;
20711 if (isa<UndefValue>(Scalars[I]))
20712 Scalars[I] = PoisonValue::get(OrigScalarTy);
20713 }
20714 NeedFreeze = true;
20715 }
20716 }
20717 };
20718 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
20719 bool IsNonPoisoned = true;
20720 bool IsUsedInExpr = true;
20721 Value *Vec1 = nullptr;
20722 if (!ExtractShuffles.empty()) {
20723 // Gather of extractelements can be represented as just a shuffle of
20724 // a single/two vectors the scalars are extracted from.
20725 // Find input vectors.
20726 Value *Vec2 = nullptr;
20727 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
20728 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
20729 ExtractMask[I] = PoisonMaskElem;
20730 }
20731 if (UseVecBaseAsInput) {
20732 Vec1 = ExtractVecBase;
20733 } else {
20734 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
20735 if (ExtractMask[I] == PoisonMaskElem)
20736 continue;
20737 if (isa<UndefValue>(StoredGS[I]))
20738 continue;
20739 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
20740 Value *VecOp = EI->getVectorOperand();
20741 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
20742 !TEs.empty() && TEs.front()->VectorizedValue)
20743 VecOp = TEs.front()->VectorizedValue;
20744 if (!Vec1) {
20745 Vec1 = VecOp;
20746 } else if (Vec1 != VecOp) {
20747 assert((!Vec2 || Vec2 == VecOp) &&
20748 "Expected only 1 or 2 vectors shuffle.");
20749 Vec2 = VecOp;
20750 }
20751 }
20752 }
20753 if (Vec2) {
20754 IsUsedInExpr = false;
20755 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
20756 isGuaranteedNotToBePoison(Vec2, AC);
20757 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
20758 } else if (Vec1) {
20759 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
20760 IsUsedInExpr &= FindReusedSplat(
20761 ExtractMask,
20762 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
20763 ExtractMask.size(), IsNotPoisonedVec);
20764 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
20765 IsNonPoisoned &= IsNotPoisonedVec;
20766 } else {
20767 IsUsedInExpr = false;
20768 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
20769 /*ForExtracts=*/true);
20770 }
20771 }
20772 if (!GatherShuffles.empty()) {
20773 unsigned SliceSize =
20774 getPartNumElems(E->Scalars.size(),
20775 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
20776 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
20777 for (const auto [I, TEs] : enumerate(Entries)) {
20778 if (TEs.empty()) {
20779 assert(!GatherShuffles[I] &&
20780 "No shuffles with empty entries list expected.");
20781 continue;
20782 }
20783 assert((TEs.size() == 1 || TEs.size() == 2) &&
20784 "Expected shuffle of 1 or 2 entries.");
20785 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
20786 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
20787 VecMask.assign(VecMask.size(), PoisonMaskElem);
20788 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
20789 if (TEs.size() == 1) {
20790 bool IsNotPoisonedVec =
20791 TEs.front()->VectorizedValue
20792 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
20793 : true;
20794 IsUsedInExpr &=
20795 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
20796 SliceSize, IsNotPoisonedVec);
20797 ShuffleBuilder.add(*TEs.front(), VecMask);
20798 IsNonPoisoned &= IsNotPoisonedVec;
20799 } else {
20800 IsUsedInExpr = false;
20801 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
20802 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
20803 IsNonPoisoned &=
20804 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
20805 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
20806 }
20807 }
20808 }
20809 // Try to figure out best way to combine values: build a shuffle and insert
20810 // elements or just build several shuffles.
20811 // Insert non-constant scalars.
20812 SmallVector<Value *> NonConstants(GatheredScalars);
20813 int EMSz = ExtractMask.size();
20814 int MSz = Mask.size();
20815 // Try to build constant vector and shuffle with it only if currently we
20816 // have a single permutation and more than 1 scalar constants.
20817 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
20818 bool IsIdentityShuffle =
20819 ((UseVecBaseAsInput ||
20820 all_of(ExtractShuffles,
20821 [](const std::optional<TTI::ShuffleKind> &SK) {
20822 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
20824 })) &&
20825 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
20826 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
20827 (!GatherShuffles.empty() &&
20828 all_of(GatherShuffles,
20829 [](const std::optional<TTI::ShuffleKind> &SK) {
20830 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
20832 }) &&
20833 none_of(Mask, [&](int I) { return I >= MSz; }) &&
20835 bool EnoughConstsForShuffle =
20836 IsSingleShuffle &&
20837 (none_of(GatheredScalars,
20838 [](Value *V) {
20839 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
20840 }) ||
20841 any_of(GatheredScalars,
20842 [](Value *V) {
20843 return isa<Constant>(V) && !isa<UndefValue>(V);
20844 })) &&
20845 (!IsIdentityShuffle ||
20846 (GatheredScalars.size() == 2 &&
20847 any_of(GatheredScalars,
20848 [](Value *V) { return !isa<UndefValue>(V); })) ||
20849 count_if(GatheredScalars, [](Value *V) {
20850 return isa<Constant>(V) && !isa<PoisonValue>(V);
20851 }) > 1);
20852 // NonConstants array contains just non-constant values, GatheredScalars
20853 // contains only constant to build final vector and then shuffle.
20854 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
20855 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
20856 NonConstants[I] = PoisonValue::get(OrigScalarTy);
20857 else
20858 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
20859 }
20860 // Generate constants for final shuffle and build a mask for them.
20861 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
20862 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
20863 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
20864 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
20865 ShuffleBuilder.add(BV, BVMask);
20866 }
20867 if (all_of(NonConstants, [=](Value *V) {
20868 return isa<PoisonValue>(V) ||
20869 (IsSingleShuffle && ((IsIdentityShuffle &&
20870 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
20871 }))
20872 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20873 SubVectorsMask);
20874 else
20875 Res = ShuffleBuilder.finalize(
20876 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
20877 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
20878 bool IsSplat = isSplat(NonConstants);
20879 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
20880 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
20881 auto CheckIfSplatIsProfitable = [&]() {
20882 // Estimate the cost of splatting + shuffle and compare with
20883 // insert + shuffle.
20884 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20885 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
20886 if (isa<ExtractElementInst>(V) || isVectorized(V))
20887 return false;
20888 InstructionCost SplatCost = TTI->getVectorInstrCost(
20889 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
20890 PoisonValue::get(VecTy), V);
20891 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20892 for (auto [Idx, I] : enumerate(BVMask))
20893 if (I != PoisonMaskElem)
20894 NewMask[Idx] = Mask.size();
20895 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
20896 NewMask, CostKind);
20897 InstructionCost BVCost = TTI->getVectorInstrCost(
20898 Instruction::InsertElement, VecTy, CostKind,
20899 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
20900 // Shuffle required?
20901 if (count(BVMask, PoisonMaskElem) <
20902 static_cast<int>(BVMask.size() - 1)) {
20903 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20904 for (auto [Idx, I] : enumerate(BVMask))
20905 if (I != PoisonMaskElem)
20906 NewMask[Idx] = I;
20907 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
20908 VecTy, NewMask, CostKind);
20909 }
20910 return SplatCost <= BVCost;
20911 };
20912 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
20913 for (auto [Idx, I] : enumerate(BVMask))
20914 if (I != PoisonMaskElem)
20915 Mask[Idx] = I;
20916 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
20917 } else {
20918 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
20919 SmallVector<Value *> Values(NonConstants.size(),
20920 PoisonValue::get(ScalarTy));
20921 Values[0] = V;
20922 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
20923 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
20924 transform(BVMask, SplatMask.begin(), [](int I) {
20925 return I == PoisonMaskElem ? PoisonMaskElem : 0;
20926 });
20927 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
20928 BV = CreateShuffle(BV, nullptr, SplatMask);
20929 for (auto [Idx, I] : enumerate(BVMask))
20930 if (I != PoisonMaskElem)
20931 Mask[Idx] = BVMask.size() + Idx;
20932 Vec = CreateShuffle(Vec, BV, Mask);
20933 for (auto [Idx, I] : enumerate(Mask))
20934 if (I != PoisonMaskElem)
20935 Mask[Idx] = Idx;
20936 }
20937 });
20938 } else if (!allConstant(GatheredScalars)) {
20939 // Gather unique scalars and all constants.
20940 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
20941 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
20942 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
20943 ShuffleBuilder.add(BV, ReuseMask);
20944 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20945 SubVectorsMask);
20946 } else {
20947 // Gather all constants.
20948 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
20949 for (auto [I, V] : enumerate(GatheredScalars)) {
20950 if (!isa<PoisonValue>(V))
20951 Mask[I] = I;
20952 }
20953 Value *BV = ShuffleBuilder.gather(GatheredScalars);
20954 ShuffleBuilder.add(BV, Mask);
20955 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20956 SubVectorsMask);
20957 }
20958
20959 if (NeedFreeze)
20960 Res = ShuffleBuilder.createFreeze(Res);
20961 return Res;
20962}
20963
20964Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
20965 // Do not do this for split vectorize node, marked to be gathers/buildvectors.
20966 if (E->State != TreeEntry::SplitVectorize ||
20967 !TransformedToGatherNodes.contains(E)) {
20968 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
20969 (void)vectorizeTree(VectorizableTree[EIdx].get());
20970 }
20971 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
20972 Builder, *this);
20973}
20974
20975/// \returns \p I after propagating metadata from \p VL only for instructions in
20976/// \p VL.
20979 for (Value *V : VL)
20980 if (isa<Instruction>(V))
20981 Insts.push_back(V);
20982 return llvm::propagateMetadata(Inst, Insts);
20983}
20984
20986 if (DebugLoc DL = PN.getDebugLoc())
20987 return DL;
20988 return DebugLoc::getUnknown();
20989}
20990
20991Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
20992 IRBuilderBase::InsertPointGuard Guard(Builder);
20993
20994 Value *V = E->Scalars.front();
20995 Type *ScalarTy = V->getType();
20996 if (!isa<CmpInst>(V))
20997 ScalarTy = getValueType(V);
20998 auto It = MinBWs.find(E);
20999 if (It != MinBWs.end()) {
21000 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
21001 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
21002 if (VecTy)
21003 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
21004 }
21005 if (E->VectorizedValue)
21006 return E->VectorizedValue;
21007 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
21008 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
21009 // Set insert point for non-reduction initial nodes.
21010 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
21011 setInsertPointAfterBundle(E);
21012 Value *Vec = createBuildVector(E, ScalarTy);
21013 E->VectorizedValue = Vec;
21014 return Vec;
21015 }
21016 if (E->State == TreeEntry::SplitVectorize) {
21017 assert(E->CombinedEntriesWithIndices.size() == 2 &&
21018 "Expected exactly 2 combined entries.");
21019 setInsertPointAfterBundle(E);
21020 TreeEntry &OpTE1 =
21021 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
21022 assert(OpTE1.isSame(
21023 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
21024 "Expected same first part of scalars.");
21025 Value *Op1 = vectorizeTree(&OpTE1);
21026 TreeEntry &OpTE2 =
21027 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
21028 assert(
21029 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
21030 "Expected same second part of scalars.");
21031 Value *Op2 = vectorizeTree(&OpTE2);
21032 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
21033 bool IsSigned = false;
21034 auto It = MinBWs.find(OpE);
21035 if (It != MinBWs.end())
21036 IsSigned = It->second.second;
21037 else
21038 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
21039 if (isa<PoisonValue>(V))
21040 return false;
21041 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21042 });
21043 return IsSigned;
21044 };
21045 if (cast<VectorType>(Op1->getType())->getElementType() !=
21046 ScalarTy->getScalarType()) {
21047 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
21048 Op1 = Builder.CreateIntCast(
21049 Op1,
21051 ScalarTy,
21052 cast<FixedVectorType>(Op1->getType())->getNumElements()),
21053 GetOperandSignedness(&OpTE1));
21054 }
21055 if (cast<VectorType>(Op2->getType())->getElementType() !=
21056 ScalarTy->getScalarType()) {
21057 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
21058 Op2 = Builder.CreateIntCast(
21059 Op2,
21061 ScalarTy,
21062 cast<FixedVectorType>(Op2->getType())->getNumElements()),
21063 GetOperandSignedness(&OpTE2));
21064 }
21065 if (E->ReorderIndices.empty()) {
21066 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
21067 std::iota(
21068 Mask.begin(),
21069 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
21070 0);
21071 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
21072 if (ScalarTyNumElements != 1) {
21073 assert(SLPReVec && "Only supported by REVEC.");
21074 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
21075 }
21076 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
21077 Vec = createInsertVector(Builder, Vec, Op2,
21078 E->CombinedEntriesWithIndices.back().second *
21079 ScalarTyNumElements);
21080 E->VectorizedValue = Vec;
21081 return Vec;
21082 }
21083 unsigned CommonVF =
21084 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
21085 const unsigned Scale = getNumElements(ScalarTy);
21086 CommonVF *= Scale;
21087 if (getNumElements(Op1->getType()) != CommonVF) {
21088 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
21089 copy(createReplicatedMask(Scale, OpTE1.getVectorFactor() * Scale),
21090 Mask.begin());
21091 Op1 = Builder.CreateShuffleVector(Op1, Mask);
21092 }
21093 if (getNumElements(Op2->getType()) != CommonVF) {
21094 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
21095 copy(createReplicatedMask(Scale, OpTE2.getVectorFactor() * Scale),
21096 Mask.begin());
21097 Op2 = Builder.CreateShuffleVector(Op2, Mask);
21098 }
21099 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
21100 E->VectorizedValue = Vec;
21101 return Vec;
21102 }
21103
21104 bool IsReverseOrder =
21105 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
21106 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
21107 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
21108 if (E->getOpcode() == Instruction::Store &&
21109 E->State == TreeEntry::Vectorize) {
21110 ArrayRef<int> Mask =
21111 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
21112 E->ReorderIndices.size());
21113 ShuffleBuilder.add(V, Mask);
21114 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
21115 E->State == TreeEntry::CompressVectorize) {
21116 ShuffleBuilder.addOrdered(V, {});
21117 } else {
21118 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
21119 }
21121 E->CombinedEntriesWithIndices.size());
21122 transform(
21123 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
21124 return std::make_pair(VectorizableTree[P.first].get(), P.second);
21125 });
21126 assert(
21127 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
21128 "Expected either combined subnodes or reordering");
21129 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
21130 };
21131
21132 assert(!E->isGather() && "Unhandled state");
21133 unsigned ShuffleOrOp =
21134 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
21135 if (!E->isAltShuffle()) {
21136 switch (E->CombinedOp) {
21137 case TreeEntry::ReducedBitcast:
21138 case TreeEntry::ReducedBitcastBSwap:
21139 case TreeEntry::ReducedBitcastLoads:
21140 case TreeEntry::ReducedBitcastBSwapLoads:
21141 case TreeEntry::ReducedCmpBitcast:
21142 ShuffleOrOp = E->CombinedOp;
21143 break;
21144 default:
21145 break;
21146 }
21147 }
21148 Instruction *VL0 = E->getMainOp();
21149 auto GetOperandSignedness = [&](unsigned Idx) {
21150 const TreeEntry *OpE = getOperandEntry(E, Idx);
21151 bool IsSigned = false;
21152 auto It = MinBWs.find(OpE);
21153 if (It != MinBWs.end())
21154 IsSigned = It->second.second;
21155 else
21156 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
21157 if (isa<PoisonValue>(V))
21158 return false;
21159 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21160 });
21161 return IsSigned;
21162 };
21163 switch (ShuffleOrOp) {
21164 case Instruction::PHI: {
21165 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
21166 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
21167 "PHI reordering is free.");
21168 auto *PH = cast<PHINode>(VL0);
21169 Builder.SetInsertPoint(PH->getParent(),
21170 PH->getParent()->getFirstNonPHIIt());
21171 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
21172 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
21173 Value *V = NewPhi;
21174
21175 // Adjust insertion point once all PHI's have been generated.
21176 Builder.SetInsertPoint(PH->getParent(),
21177 PH->getParent()->getFirstInsertionPt());
21178 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
21179
21180 V = FinalShuffle(V, E);
21181
21182 E->VectorizedValue = V;
21183 // If phi node is fully emitted - exit.
21184 if (NewPhi->getNumIncomingValues() != 0)
21185 return NewPhi;
21186
21187 // PHINodes may have multiple entries from the same block. We want to
21188 // visit every block once.
21189 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
21190 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
21191 BasicBlock *IBB = PH->getIncomingBlock(I);
21192
21193 // Stop emission if all incoming values are generated.
21194 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
21195 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
21196 return NewPhi;
21197 }
21198
21199 auto Res = VisitedBBs.try_emplace(IBB, I);
21200 if (!Res.second) {
21201 TreeEntry *OpTE = getOperandEntry(E, I);
21202 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
21203 TransformedToGatherNodes.contains(OpTE)) {
21204 Value *VecOp = NewPhi->getIncomingValue(Res.first->getSecond());
21205 NewPhi->addIncoming(VecOp, IBB);
21206 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
21207 OpTE->VectorizedValue = VecOp;
21208 continue;
21209 }
21210 }
21211
21212 Builder.SetInsertPoint(IBB->getTerminator());
21213 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
21214 Value *Vec = vectorizeOperand(E, I);
21215 if (VecTy != Vec->getType()) {
21216 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
21217 MinBWs.contains(getOperandEntry(E, I))) &&
21218 "Expected item in MinBWs.");
21219 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
21220 }
21221 NewPhi->addIncoming(Vec, IBB);
21222 }
21223
21224 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
21225 "Invalid number of incoming values");
21226 assert(E->VectorizedValue && "Expected vectorized value.");
21227 return E->VectorizedValue;
21228 }
21229
21230 case Instruction::ExtractElement: {
21231 Value *V = E->getSingleOperand(0);
21232 setInsertPointAfterBundle(E);
21233 V = FinalShuffle(V, E);
21234 E->VectorizedValue = V;
21235 return V;
21236 }
21237 case Instruction::ExtractValue: {
21238 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
21239 Builder.SetInsertPoint(LI);
21240 Value *Ptr = LI->getPointerOperand();
21241 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
21242 Value *NewV = ::propagateMetadata(V, E->Scalars);
21243 NewV = FinalShuffle(NewV, E);
21244 E->VectorizedValue = NewV;
21245 return NewV;
21246 }
21247 case Instruction::InsertElement: {
21248 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
21249 if (const TreeEntry *OpE = getOperandEntry(E, 1);
21250 OpE && !OpE->isGather() && OpE->hasState() &&
21251 !OpE->hasCopyableElements())
21252 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
21253 else
21254 setInsertPointAfterBundle(E);
21255 Value *V = vectorizeOperand(E, 1);
21256 ArrayRef<Value *> Op = E->getOperand(1);
21257 Type *ScalarTy = Op.front()->getType();
21258 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
21259 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
21260 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
21261 assert(Res.first > 0 && "Expected item in MinBWs.");
21262 V = Builder.CreateIntCast(
21263 V,
21265 ScalarTy,
21266 cast<FixedVectorType>(V->getType())->getNumElements()),
21267 Res.second);
21268 }
21269
21270 // Create InsertVector shuffle if necessary
21271 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
21272 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
21273 }));
21274 const unsigned NumElts =
21275 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
21276 const unsigned NumScalars = E->Scalars.size();
21277
21278 unsigned Offset = *getElementIndex(VL0);
21279 assert(Offset < NumElts && "Failed to find vector index offset");
21280
21281 // Create shuffle to resize vector
21282 SmallVector<int> Mask;
21283 if (!E->ReorderIndices.empty()) {
21284 inversePermutation(E->ReorderIndices, Mask);
21285 Mask.append(NumElts - NumScalars, PoisonMaskElem);
21286 } else {
21287 Mask.assign(NumElts, PoisonMaskElem);
21288 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
21289 }
21290 // Create InsertVector shuffle if necessary
21291 bool IsIdentity = true;
21292 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
21293 Mask.swap(PrevMask);
21294 for (unsigned I = 0; I < NumScalars; ++I) {
21295 Value *Scalar = E->Scalars[PrevMask[I]];
21296 unsigned InsertIdx = *getElementIndex(Scalar);
21297 IsIdentity &= InsertIdx - Offset == I;
21298 Mask[InsertIdx - Offset] = I;
21299 }
21300 if (!IsIdentity || NumElts != NumScalars) {
21301 Value *V2 = nullptr;
21302 bool IsVNonPoisonous =
21304 SmallVector<int> InsertMask(Mask);
21305 if (NumElts != NumScalars && Offset == 0) {
21306 // Follow all insert element instructions from the current buildvector
21307 // sequence.
21308 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
21309 do {
21310 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
21311 if (!InsertIdx)
21312 break;
21313 if (InsertMask[*InsertIdx] == PoisonMaskElem)
21314 InsertMask[*InsertIdx] = *InsertIdx;
21315 if (!Ins->hasOneUse())
21316 break;
21319 } while (Ins);
21320 SmallBitVector UseMask =
21321 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
21322 SmallBitVector IsFirstPoison =
21323 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
21324 SmallBitVector IsFirstUndef =
21325 isUndefVector(FirstInsert->getOperand(0), UseMask);
21326 if (!IsFirstPoison.all()) {
21327 unsigned Idx = 0;
21328 for (unsigned I = 0; I < NumElts; I++) {
21329 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
21330 IsFirstUndef.test(I)) {
21331 if (IsVNonPoisonous) {
21332 InsertMask[I] = I < NumScalars ? I : 0;
21333 continue;
21334 }
21335 if (!V2)
21336 V2 = UndefValue::get(V->getType());
21337 if (Idx >= NumScalars)
21338 Idx = NumScalars - 1;
21339 InsertMask[I] = NumScalars + Idx;
21340 ++Idx;
21341 } else if (InsertMask[I] != PoisonMaskElem &&
21342 Mask[I] == PoisonMaskElem) {
21343 InsertMask[I] = PoisonMaskElem;
21344 }
21345 }
21346 } else {
21347 InsertMask = Mask;
21348 }
21349 }
21350 if (!V2)
21351 V2 = PoisonValue::get(V->getType());
21352 V = Builder.CreateShuffleVector(V, V2, InsertMask);
21353 if (auto *I = dyn_cast<Instruction>(V)) {
21354 GatherShuffleExtractSeq.insert(I);
21355 CSEBlocks.insert(I->getParent());
21356 }
21357 }
21358
21359 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
21360 for (unsigned I = 0; I < NumElts; I++) {
21361 if (Mask[I] != PoisonMaskElem)
21362 InsertMask[Offset + I] = I;
21363 }
21364 SmallBitVector UseMask =
21365 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
21366 SmallBitVector IsFirstUndef =
21367 isUndefVector(FirstInsert->getOperand(0), UseMask);
21368 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
21369 NumElts != NumScalars) {
21370 if (IsFirstUndef.all()) {
21371 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
21372 SmallBitVector IsFirstPoison =
21373 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
21374 if (!IsFirstPoison.all()) {
21375 for (unsigned I = 0; I < NumElts; I++) {
21376 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
21377 InsertMask[I] = I + NumElts;
21378 }
21379 }
21380 V = Builder.CreateShuffleVector(
21381 V,
21382 IsFirstPoison.all() ? PoisonValue::get(V->getType())
21383 : FirstInsert->getOperand(0),
21384 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
21385 if (auto *I = dyn_cast<Instruction>(V)) {
21386 GatherShuffleExtractSeq.insert(I);
21387 CSEBlocks.insert(I->getParent());
21388 }
21389 }
21390 } else {
21391 SmallBitVector IsFirstPoison =
21392 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
21393 for (unsigned I = 0; I < NumElts; I++) {
21394 if (InsertMask[I] == PoisonMaskElem)
21395 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
21396 else
21397 InsertMask[I] += NumElts;
21398 }
21399 V = Builder.CreateShuffleVector(
21400 FirstInsert->getOperand(0), V, InsertMask,
21401 cast<Instruction>(E->Scalars.back())->getName());
21402 if (auto *I = dyn_cast<Instruction>(V)) {
21403 GatherShuffleExtractSeq.insert(I);
21404 CSEBlocks.insert(I->getParent());
21405 }
21406 }
21407 }
21408
21409 ++NumVectorInstructions;
21410 E->VectorizedValue = V;
21411 return V;
21412 }
21413 case Instruction::ZExt:
21414 case Instruction::SExt:
21415 case Instruction::FPToUI:
21416 case Instruction::FPToSI:
21417 case Instruction::FPExt:
21418 case Instruction::PtrToInt:
21419 case Instruction::IntToPtr:
21420 case Instruction::SIToFP:
21421 case Instruction::UIToFP:
21422 case Instruction::Trunc:
21423 case Instruction::FPTrunc:
21424 case Instruction::BitCast: {
21425 setInsertPointAfterBundle(E);
21426
21427 Value *InVec = vectorizeOperand(E, 0);
21428
21429 auto *CI = cast<CastInst>(VL0);
21430 Instruction::CastOps VecOpcode = CI->getOpcode();
21431 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
21432 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
21433 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
21434 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
21435 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
21436 // Check if the values are candidates to demote.
21437 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
21438 if (SrcIt != MinBWs.end())
21439 SrcBWSz = SrcIt->second.first;
21440 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
21441 if (BWSz == SrcBWSz) {
21442 VecOpcode = Instruction::BitCast;
21443 } else if (BWSz < SrcBWSz) {
21444 VecOpcode = Instruction::Trunc;
21445 } else if (It != MinBWs.end()) {
21446 assert(BWSz > SrcBWSz && "Invalid cast!");
21447 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
21448 } else if (SrcIt != MinBWs.end()) {
21449 assert(BWSz > SrcBWSz && "Invalid cast!");
21450 VecOpcode =
21451 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
21452 }
21453 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
21454 !SrcIt->second.second) {
21455 VecOpcode = Instruction::UIToFP;
21456 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
21457 ScalarTy->isFPOrFPVectorTy()) {
21458 Type *OrigSrcScalarTy = CI->getSrcTy();
21459 auto *OrigSrcVectorTy =
21460 getWidenedType(OrigSrcScalarTy, E->Scalars.size());
21461 InVec =
21462 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
21463 }
21464 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
21465 ? InVec
21466 : Builder.CreateCast(VecOpcode, InVec, VecTy);
21467 V = FinalShuffle(V, E);
21468
21469 E->VectorizedValue = V;
21470 ++NumVectorInstructions;
21471 return V;
21472 }
21473 case Instruction::FCmp:
21474 case Instruction::ICmp: {
21475 setInsertPointAfterBundle(E);
21476
21477 Value *L = vectorizeOperand(E, 0);
21478 Value *R = vectorizeOperand(E, 1);
21479 if (L->getType() != R->getType()) {
21480 assert((getOperandEntry(E, 0)->isGather() ||
21481 getOperandEntry(E, 1)->isGather() ||
21482 MinBWs.contains(getOperandEntry(E, 0)) ||
21483 MinBWs.contains(getOperandEntry(E, 1))) &&
21484 "Expected item in MinBWs.");
21485 const unsigned LBW = cast<VectorType>(L->getType())
21486 ->getElementType()
21487 ->getIntegerBitWidth();
21488 const unsigned RBW = cast<VectorType>(R->getType())
21489 ->getElementType()
21490 ->getIntegerBitWidth();
21491 if ((LBW < RBW && !allConstant(E->getOperand(1))) ||
21492 (LBW > RBW && allConstant(E->getOperand(0)))) {
21493 Type *CastTy = R->getType();
21494 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
21495 } else {
21496 Type *CastTy = L->getType();
21497 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
21498 }
21499 }
21500
21501 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
21502 Value *V = Builder.CreateCmp(P0, L, R);
21503 propagateIRFlags(V, E->Scalars, VL0);
21504 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
21505 ICmp->setSameSign(/*B=*/false);
21506 // Do not cast for cmps.
21507 VecTy = cast<FixedVectorType>(V->getType());
21508 V = FinalShuffle(V, E);
21509
21510 E->VectorizedValue = V;
21511 ++NumVectorInstructions;
21512 return V;
21513 }
21514 case Instruction::Select: {
21515 setInsertPointAfterBundle(E);
21516
21517 Value *Cond = vectorizeOperand(E, 0);
21518 Value *True = vectorizeOperand(E, 1);
21519 Value *False = vectorizeOperand(E, 2);
21520 if (True->getType() != VecTy || False->getType() != VecTy) {
21521 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
21522 getOperandEntry(E, 2)->isGather() ||
21523 MinBWs.contains(getOperandEntry(E, 1)) ||
21524 MinBWs.contains(getOperandEntry(E, 2))) &&
21525 "Expected item in MinBWs.");
21526 if (True->getType() != VecTy)
21527 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
21528 if (False->getType() != VecTy)
21529 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
21530 }
21531
21532 unsigned CondNumElements = getNumElements(Cond->getType());
21533 unsigned TrueNumElements = getNumElements(True->getType());
21534 assert(TrueNumElements >= CondNumElements &&
21535 TrueNumElements % CondNumElements == 0 &&
21536 "Cannot vectorize Instruction::Select");
21537 assert(TrueNumElements == getNumElements(False->getType()) &&
21538 "Cannot vectorize Instruction::Select");
21539 if (CondNumElements != TrueNumElements) {
21540 // When the return type is i1 but the source is fixed vector type, we
21541 // need to duplicate the condition value.
21542 Cond = Builder.CreateShuffleVector(
21543 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
21544 CondNumElements));
21545 }
21546 assert(getNumElements(Cond->getType()) == TrueNumElements &&
21547 "Cannot vectorize Instruction::Select");
21548 Value *V =
21549 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
21550 V = FinalShuffle(V, E);
21551
21552 E->VectorizedValue = V;
21553 ++NumVectorInstructions;
21554 return V;
21555 }
21556 case Instruction::FNeg: {
21557 setInsertPointAfterBundle(E);
21558
21559 Value *Op = vectorizeOperand(E, 0);
21560
21561 Value *V = Builder.CreateUnOp(
21562 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
21563 propagateIRFlags(V, E->Scalars, VL0);
21564 if (auto *I = dyn_cast<Instruction>(V))
21565 V = ::propagateMetadata(I, E->Scalars);
21566
21567 V = FinalShuffle(V, E);
21568
21569 E->VectorizedValue = V;
21570 ++NumVectorInstructions;
21571
21572 return V;
21573 }
21574 case Instruction::Freeze: {
21575 setInsertPointAfterBundle(E);
21576
21577 Value *Op = vectorizeOperand(E, 0);
21578
21579 if (Op->getType() != VecTy) {
21580 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21581 MinBWs.contains(getOperandEntry(E, 0))) &&
21582 "Expected item in MinBWs.");
21583 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
21584 }
21585 Value *V = Builder.CreateFreeze(Op);
21586 V = FinalShuffle(V, E);
21587
21588 E->VectorizedValue = V;
21589 ++NumVectorInstructions;
21590
21591 return V;
21592 }
21593 case Instruction::Add:
21594 case Instruction::FAdd:
21595 case Instruction::Sub:
21596 case Instruction::FSub:
21597 case Instruction::Mul:
21598 case Instruction::FMul:
21599 case Instruction::UDiv:
21600 case Instruction::SDiv:
21601 case Instruction::FDiv:
21602 case Instruction::URem:
21603 case Instruction::SRem:
21604 case Instruction::FRem:
21605 case Instruction::Shl:
21606 case Instruction::LShr:
21607 case Instruction::AShr:
21608 case Instruction::And:
21609 case Instruction::Or:
21610 case Instruction::Xor: {
21611 setInsertPointAfterBundle(E);
21612
21613 Value *LHS = vectorizeOperand(E, 0);
21614 Value *RHS = vectorizeOperand(E, 1);
21615 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
21616 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
21617 ArrayRef<Value *> Ops = E->getOperand(I);
21618 if (all_of(Ops, [&](Value *Op) {
21619 auto *CI = dyn_cast<ConstantInt>(Op);
21620 return CI && CI->getValue().countr_one() >= It->second.first;
21621 })) {
21622 V = FinalShuffle(I == 0 ? RHS : LHS, E);
21623 E->VectorizedValue = V;
21624 ++NumVectorInstructions;
21625 return V;
21626 }
21627 }
21628 }
21629 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
21630 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21631 getOperandEntry(E, 1)->isGather() ||
21632 MinBWs.contains(getOperandEntry(E, 0)) ||
21633 MinBWs.contains(getOperandEntry(E, 1))) &&
21634 "Expected item in MinBWs.");
21635 if (LHS->getType() != VecTy)
21636 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
21637 if (RHS->getType() != VecTy)
21638 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
21639 }
21640
21641 Value *V = Builder.CreateBinOp(
21642 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
21643 RHS);
21644 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
21645 if (auto *I = dyn_cast<Instruction>(V)) {
21646 V = ::propagateMetadata(I, E->Scalars);
21647 // Drop nuw flags for abs(sub(commutative), true).
21648 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
21649 any_of(E->Scalars, [E](Value *V) {
21650 return isa<PoisonValue>(V) ||
21651 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
21652 isCommutative(cast<Instruction>(V));
21653 }))
21654 I->setHasNoUnsignedWrap(/*b=*/false);
21655 }
21656
21657 V = FinalShuffle(V, E);
21658
21659 E->VectorizedValue = V;
21660 ++NumVectorInstructions;
21661
21662 return V;
21663 }
21664 case Instruction::Load: {
21665 // Loads are inserted at the head of the tree because we don't want to
21666 // sink them all the way down past store instructions.
21667 setInsertPointAfterBundle(E);
21668
21669 LoadInst *LI = cast<LoadInst>(VL0);
21670 Instruction *NewLI;
21671 FixedVectorType *StridedLoadTy = nullptr;
21672 Value *PO = LI->getPointerOperand();
21673 if (E->State == TreeEntry::Vectorize) {
21674 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
21675 } else if (E->State == TreeEntry::CompressVectorize) {
21676 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
21677 CompressEntryToData.at(E);
21678 Align CommonAlignment = LI->getAlign();
21679 if (IsMasked) {
21680 unsigned VF = getNumElements(LoadVecTy);
21681 SmallVector<Constant *> MaskValues(
21682 VF / getNumElements(LI->getType()),
21683 ConstantInt::getFalse(VecTy->getContext()));
21684 for (int I : CompressMask)
21685 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
21686 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
21687 assert(SLPReVec && "Only supported by REVEC.");
21688 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
21689 }
21690 Constant *MaskValue = ConstantVector::get(MaskValues);
21691 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
21692 MaskValue);
21693 } else {
21694 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
21695 }
21696 NewLI = ::propagateMetadata(NewLI, E->Scalars);
21697 // TODO: include this cost into CommonCost.
21698 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
21699 assert(SLPReVec && "FixedVectorType is not expected.");
21700 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
21701 CompressMask);
21702 }
21703 NewLI =
21704 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
21705 } else if (E->State == TreeEntry::StridedVectorize) {
21706 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
21707 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
21708 PO = IsReverseOrder ? PtrN : Ptr0;
21709 Type *StrideTy = DL->getIndexType(PO->getType());
21710 Value *StrideVal;
21711 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
21712 StridedLoadTy = SPtrInfo.Ty;
21713 assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
21714 unsigned StridedLoadEC =
21715 StridedLoadTy->getElementCount().getKnownMinValue();
21716
21717 Value *Stride = SPtrInfo.StrideVal;
21718 if (!Stride) {
21719 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
21720 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
21721 SCEVExpander Expander(*SE, "strided-load-vec");
21722 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
21723 &*Builder.GetInsertPoint());
21724 }
21725 Value *NewStride =
21726 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
21727 StrideVal = Builder.CreateMul(
21728 NewStride, ConstantInt::getSigned(
21729 StrideTy, (IsReverseOrder ? -1 : 1) *
21730 static_cast<int>(
21731 DL->getTypeAllocSize(ScalarTy))));
21732 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
21733 auto *Inst = Builder.CreateIntrinsic(
21734 Intrinsic::experimental_vp_strided_load,
21735 {StridedLoadTy, PO->getType(), StrideTy},
21736 {PO, StrideVal,
21737 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
21738 Builder.getInt32(StridedLoadEC)});
21739 Inst->addParamAttr(
21740 /*ArgNo=*/0,
21741 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
21742 NewLI = Inst;
21743 } else {
21744 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
21745 Value *VecPtr = vectorizeOperand(E, 0);
21746 if (isa<FixedVectorType>(ScalarTy)) {
21747 assert(SLPReVec && "FixedVectorType is not expected.");
21748 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
21749 // to expand VecPtr if ScalarTy is a vector type.
21750 unsigned ScalarTyNumElements =
21751 cast<FixedVectorType>(ScalarTy)->getNumElements();
21752 unsigned VecTyNumElements =
21753 cast<FixedVectorType>(VecTy)->getNumElements();
21754 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
21755 "Cannot expand getelementptr.");
21756 unsigned VF = VecTyNumElements / ScalarTyNumElements;
21757 SmallVector<Constant *> Indices(VecTyNumElements);
21758 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
21759 return Builder.getInt64(I % ScalarTyNumElements);
21760 });
21761 VecPtr = Builder.CreateGEP(
21762 VecTy->getElementType(),
21763 Builder.CreateShuffleVector(
21764 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
21765 ConstantVector::get(Indices));
21766 }
21767 // Use the minimum alignment of the gathered loads.
21768 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
21769 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
21770 }
21771 Value *V = E->State == TreeEntry::CompressVectorize
21772 ? NewLI
21773 : ::propagateMetadata(NewLI, E->Scalars);
21774
21775 if (StridedLoadTy != VecTy)
21776 V = Builder.CreateBitOrPointerCast(V, VecTy);
21777 V = FinalShuffle(V, E);
21778 E->VectorizedValue = V;
21779 ++NumVectorInstructions;
21780 return V;
21781 }
21782 case Instruction::Store: {
21783 auto *SI = cast<StoreInst>(VL0);
21784
21785 setInsertPointAfterBundle(E);
21786
21787 Value *VecValue = vectorizeOperand(E, 0);
21788 if (VecValue->getType() != VecTy)
21789 VecValue =
21790 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
21791 VecValue = FinalShuffle(VecValue, E);
21792
21793 Value *Ptr = SI->getPointerOperand();
21794 Instruction *ST;
21795 if (E->State == TreeEntry::Vectorize) {
21796 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
21797 } else {
21798 assert(E->State == TreeEntry::StridedVectorize &&
21799 "Expected either strided or consecutive stores.");
21800 if (!E->ReorderIndices.empty()) {
21801 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
21802 Ptr = SI->getPointerOperand();
21803 }
21804 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
21805 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
21806 auto *Inst = Builder.CreateIntrinsic(
21807 Intrinsic::experimental_vp_strided_store,
21808 {VecTy, Ptr->getType(), StrideTy},
21809 {VecValue, Ptr,
21811 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
21812 Builder.getAllOnesMask(VecTy->getElementCount()),
21813 Builder.getInt32(E->Scalars.size())});
21814 Inst->addParamAttr(
21815 /*ArgNo=*/1,
21816 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
21817 ST = Inst;
21818 }
21819
21820 Value *V = ::propagateMetadata(ST, E->Scalars);
21821
21822 E->VectorizedValue = V;
21823 ++NumVectorInstructions;
21824 return V;
21825 }
21826 case Instruction::GetElementPtr: {
21827 auto *GEP0 = cast<GetElementPtrInst>(VL0);
21828 setInsertPointAfterBundle(E);
21829
21830 Value *Op0 = vectorizeOperand(E, 0);
21831
21832 SmallVector<Value *> OpVecs;
21833 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
21834 Value *OpVec = vectorizeOperand(E, J);
21835 OpVecs.push_back(OpVec);
21836 }
21837
21838 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
21839 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
21841 for (Value *V : E->Scalars) {
21843 GEPs.push_back(V);
21844 }
21845 V = ::propagateMetadata(I, GEPs);
21846 }
21847
21848 V = FinalShuffle(V, E);
21849
21850 E->VectorizedValue = V;
21851 ++NumVectorInstructions;
21852
21853 return V;
21854 }
21855 case Instruction::Call: {
21856 CallInst *CI = cast<CallInst>(VL0);
21857 setInsertPointAfterBundle(E);
21858
21860
21862 CI, ID, VecTy->getNumElements(),
21863 It != MinBWs.end() ? It->second.first : 0, TTI);
21864 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
21865 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
21866 VecCallCosts.first <= VecCallCosts.second;
21867
21868 Value *ScalarArg = nullptr;
21869 SmallVector<Value *> OpVecs;
21870 SmallVector<Type *, 2> TysForDecl;
21871 // Add return type if intrinsic is overloaded on it.
21872 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
21873 TysForDecl.push_back(VecTy);
21874 auto *CEI = cast<CallInst>(VL0);
21875 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
21876 // Some intrinsics have scalar arguments. This argument should not be
21877 // vectorized.
21878 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
21879 ScalarArg = CEI->getArgOperand(I);
21880 // if decided to reduce bitwidth of abs intrinsic, it second argument
21881 // must be set false (do not return poison, if value issigned min).
21882 if (ID == Intrinsic::abs && It != MinBWs.end() &&
21883 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
21884 ScalarArg = Builder.getFalse();
21885 OpVecs.push_back(ScalarArg);
21887 TysForDecl.push_back(ScalarArg->getType());
21888 continue;
21889 }
21890
21891 Value *OpVec = vectorizeOperand(E, I);
21892 ScalarArg = CEI->getArgOperand(I);
21893 if (cast<VectorType>(OpVec->getType())->getElementType() !=
21894 ScalarArg->getType()->getScalarType() &&
21895 It == MinBWs.end()) {
21896 auto *CastTy =
21897 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
21898 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
21899 } else if (It != MinBWs.end()) {
21900 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
21901 }
21902 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
21903 OpVecs.push_back(OpVec);
21904 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
21905 TysForDecl.push_back(OpVec->getType());
21906 }
21907
21908 Function *CF;
21909 if (!UseIntrinsic) {
21910 VFShape Shape =
21912 ElementCount::getFixed(VecTy->getNumElements()),
21913 false /*HasGlobalPred*/);
21914 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21915 } else {
21916 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
21917 }
21918
21920 CI->getOperandBundlesAsDefs(OpBundles);
21921 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
21922
21923 propagateIRFlags(V, E->Scalars, VL0);
21924 cast<CallInst>(V)->setCallingConv(CF->getCallingConv());
21925 V = FinalShuffle(V, E);
21926
21927 E->VectorizedValue = V;
21928 ++NumVectorInstructions;
21929 return V;
21930 }
21931 case Instruction::ShuffleVector: {
21932 Value *V;
21933 if (SLPReVec && !E->isAltShuffle()) {
21934 setInsertPointAfterBundle(E);
21935 Value *Src = vectorizeOperand(E, 0);
21936 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
21937 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
21938 SmallVector<int> NewMask(ThisMask.size());
21939 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
21940 return SVSrc->getShuffleMask()[Mask];
21941 });
21942 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
21943 SVSrc->getOperand(1), NewMask);
21944 } else {
21945 V = Builder.CreateShuffleVector(Src, ThisMask);
21946 }
21947 propagateIRFlags(V, E->Scalars, VL0);
21948 if (auto *I = dyn_cast<Instruction>(V))
21949 V = ::propagateMetadata(I, E->Scalars);
21950 V = FinalShuffle(V, E);
21951 } else {
21952 assert(E->isAltShuffle() &&
21953 ((Instruction::isBinaryOp(E->getOpcode()) &&
21954 Instruction::isBinaryOp(E->getAltOpcode())) ||
21955 (Instruction::isCast(E->getOpcode()) &&
21956 Instruction::isCast(E->getAltOpcode())) ||
21957 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
21958 "Invalid Shuffle Vector Operand");
21959
21960 Value *LHS = nullptr, *RHS = nullptr;
21961 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
21962 setInsertPointAfterBundle(E);
21963 LHS = vectorizeOperand(E, 0);
21964 RHS = vectorizeOperand(E, 1);
21965 } else {
21966 setInsertPointAfterBundle(E);
21967 LHS = vectorizeOperand(E, 0);
21968 }
21969 if (LHS && RHS &&
21970 ((Instruction::isBinaryOp(E->getOpcode()) &&
21971 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
21972 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
21973 assert((It != MinBWs.end() ||
21974 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
21975 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
21976 MinBWs.contains(getOperandEntry(E, 0)) ||
21977 MinBWs.contains(getOperandEntry(E, 1))) &&
21978 "Expected item in MinBWs.");
21979 Type *CastTy = VecTy;
21980 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
21982 ->getElementType()
21983 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
21984 ->getElementType()
21985 ->getIntegerBitWidth())
21986 CastTy = RHS->getType();
21987 else
21988 CastTy = LHS->getType();
21989 }
21990 if (LHS->getType() != CastTy)
21991 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
21992 if (RHS->getType() != CastTy)
21993 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
21994 }
21995
21996 Value *V0, *V1;
21997 if (Instruction::isBinaryOp(E->getOpcode())) {
21998 V0 = Builder.CreateBinOp(
21999 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
22000 V1 = Builder.CreateBinOp(
22001 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
22002 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
22003 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
22004 auto *AltCI = cast<CmpInst>(E->getAltOp());
22005 CmpInst::Predicate AltPred = AltCI->getPredicate();
22006 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
22007 } else {
22008 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
22009 unsigned SrcBWSz = DL->getTypeSizeInBits(
22010 cast<VectorType>(LHS->getType())->getElementType());
22011 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
22012 if (BWSz <= SrcBWSz) {
22013 if (BWSz < SrcBWSz)
22014 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
22015 assert(LHS->getType() == VecTy &&
22016 "Expected same type as operand.");
22017 if (auto *I = dyn_cast<Instruction>(LHS))
22018 LHS = ::propagateMetadata(I, E->Scalars);
22019 LHS = FinalShuffle(LHS, E);
22020 E->VectorizedValue = LHS;
22021 ++NumVectorInstructions;
22022 return LHS;
22023 }
22024 }
22025 V0 = Builder.CreateCast(
22026 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
22027 V1 = Builder.CreateCast(
22028 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
22029 }
22030 // Add V0 and V1 to later analysis to try to find and remove matching
22031 // instruction, if any.
22032 for (Value *V : {V0, V1}) {
22033 if (auto *I = dyn_cast<Instruction>(V)) {
22034 GatherShuffleExtractSeq.insert(I);
22035 CSEBlocks.insert(I->getParent());
22036 }
22037 }
22038
22039 // Create shuffle to take alternate operations from the vector.
22040 // Also, gather up main and alt scalar ops to propagate IR flags to
22041 // each vector operation.
22042 ValueList OpScalars, AltScalars;
22043 SmallVector<int> Mask;
22044 E->buildAltOpShuffleMask(
22045 [E, this](Instruction *I) {
22046 assert(E->getMatchingMainOpOrAltOp(I) &&
22047 "Unexpected main/alternate opcode");
22048 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
22049 *TLI);
22050 },
22051 Mask, &OpScalars, &AltScalars);
22052
22053 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
22054 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
22055 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
22056 // Drop nuw flags for abs(sub(commutative), true).
22057 if (auto *I = dyn_cast<Instruction>(Vec);
22058 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
22059 any_of(E->Scalars, [E](Value *V) {
22060 if (isa<PoisonValue>(V))
22061 return false;
22062 if (E->hasCopyableElements() && E->isCopyableElement(V))
22063 return false;
22064 auto *IV = cast<Instruction>(V);
22065 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
22066 }))
22067 I->setHasNoUnsignedWrap(/*b=*/false);
22068 };
22069 DropNuwFlag(V0, E->getOpcode());
22070 DropNuwFlag(V1, E->getAltOpcode());
22071
22072 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
22073 assert(SLPReVec && "FixedVectorType is not expected.");
22074 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
22075 }
22076 V = Builder.CreateShuffleVector(V0, V1, Mask);
22077 if (auto *I = dyn_cast<Instruction>(V)) {
22078 V = ::propagateMetadata(I, E->Scalars);
22079 GatherShuffleExtractSeq.insert(I);
22080 CSEBlocks.insert(I->getParent());
22081 }
22082 }
22083
22084 E->VectorizedValue = V;
22085 ++NumVectorInstructions;
22086
22087 return V;
22088 }
22089 case TreeEntry::ReducedBitcast:
22090 case TreeEntry::ReducedBitcastBSwap: {
22091 assert(UserIgnoreList && "Expected reduction operations only.");
22092 setInsertPointAfterBundle(E);
22093 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
22094 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
22095 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
22096 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
22097 Const->VectorizedValue = PoisonValue::get(getWidenedType(
22098 Const->Scalars.front()->getType(), Const->getVectorFactor()));
22099 Value *Op = vectorizeOperand(ZExt, 0);
22100 auto *SrcType = IntegerType::get(
22101 Op->getContext(),
22102 DL->getTypeSizeInBits(cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
22103 E->getVectorFactor());
22104 auto *OrigScalarTy = ScalarTy;
22105 // Set the scalar type properly to avoid casting to the extending type.
22106 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
22107 Op = FinalShuffle(Op, E);
22108 auto *V = Builder.CreateBitCast(Op, SrcType);
22109 ++NumVectorInstructions;
22110 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
22111 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
22112 ++NumVectorInstructions;
22113 }
22114 if (SrcType != OrigScalarTy) {
22115 V = Builder.CreateIntCast(V, OrigScalarTy, /*isSigned=*/false);
22116 ++NumVectorInstructions;
22117 }
22118 E->VectorizedValue = V;
22119 return V;
22120 }
22121 case TreeEntry::ReducedBitcastLoads:
22122 case TreeEntry::ReducedBitcastBSwapLoads: {
22123 assert(UserIgnoreList && "Expected reduction operations only.");
22124 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
22125 TreeEntry *Load = getOperandEntry(ZExt, /*Idx=*/0);
22126 setInsertPointAfterBundle(Load);
22127 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
22128 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
22129 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
22130 Const->VectorizedValue = PoisonValue::get(getWidenedType(
22131 Const->Scalars.front()->getType(), Const->getVectorFactor()));
22132 Load->VectorizedValue = PoisonValue::get(getWidenedType(
22133 Load->getMainOp()->getType(), Load->getVectorFactor()));
22134 LoadInst *LI = cast<LoadInst>(Load->getMainOp());
22135 Value *PO = LI->getPointerOperand();
22136 auto *SrcTy = IntegerType::get(
22137 ScalarTy->getContext(),
22138 DL->getTypeSizeInBits(cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
22139 E->getVectorFactor());
22140 auto *OrigScalarTy = ScalarTy;
22141 ScalarTy = ZExt->getMainOp()->getType();
22142 Value *V = Builder.CreateAlignedLoad(SrcTy, PO, LI->getAlign());
22143 ++NumVectorInstructions;
22144 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
22145 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
22146 ++NumVectorInstructions;
22147 }
22148 if (SrcTy != OrigScalarTy) {
22149 V = Builder.CreateIntCast(V, OrigScalarTy, /*isSigned=*/false);
22150 ++NumVectorInstructions;
22151 }
22152 E->VectorizedValue = V;
22153 return V;
22154 }
22155 case TreeEntry::ReducedCmpBitcast: {
22156 assert(UserIgnoreList && "Expected reduction operations only.");
22157 setInsertPointAfterBundle(E);
22158 TreeEntry *Op1TE = getOperandEntry(E, /*Idx=*/1);
22159 TreeEntry *Op2TE = getOperandEntry(E, /*Idx=*/2);
22160 Op1TE->VectorizedValue =
22161 PoisonValue::get(getWidenedType(ScalarTy, Op1TE->getVectorFactor()));
22162 Op2TE->VectorizedValue =
22163 PoisonValue::get(getWidenedType(ScalarTy, Op2TE->getVectorFactor()));
22164 Value *Cmp = vectorizeOperand(E, /*NodeIdx=*/0);
22165 // Set the scalar type properly to avoid casting to the extending type.
22166 auto *DstTy =
22167 IntegerType::getIntNTy(ScalarTy->getContext(), E->getVectorFactor());
22168 auto *V = Builder.CreateBitCast(Cmp, DstTy);
22169 ++NumVectorInstructions;
22170 if (DstTy != ScalarTy) {
22171 V = Builder.CreateIntCast(V, ScalarTy, /*isSigned=*/false);
22172 ++NumVectorInstructions;
22173 }
22174 E->VectorizedValue = V;
22175 return V;
22176 }
22177 default:
22178 llvm_unreachable("unknown inst");
22179 }
22180 return nullptr;
22181}
22182
22184 ExtraValueToDebugLocsMap ExternallyUsedValues;
22185 return vectorizeTree(ExternallyUsedValues);
22186}
22187
22189 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
22190 Instruction *ReductionRoot,
22191 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
22192 VectorValuesAndScales) {
22193 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
22194 // need to rebuild it.
22195 EntryToLastInstruction.clear();
22196 // All blocks must be scheduled before any instructions are inserted.
22197 for (auto &BSIter : BlocksSchedules)
22198 scheduleBlock(*this, BSIter.second.get());
22199 // Cache last instructions for the nodes to avoid side effects, which may
22200 // appear during vectorization, like extra uses, etc.
22201 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22202 // Need to generate insertion point for loads nodes of the bitcast/bswap
22203 // ops.
22204 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
22205 (TE->State == TreeEntry::CombinedVectorize &&
22206 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
22207 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22208 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22209 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22210 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
22211 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
22212 continue;
22213 (void)getLastInstructionInBundle(TE.get());
22214 }
22215
22216 if (ReductionRoot)
22217 Builder.SetInsertPoint(ReductionRoot->getParent(),
22218 ReductionRoot->getIterator());
22219 else
22220 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
22221
22222 // Vectorize gather operands of the nodes with the external uses only.
22224 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22225 if (DeletedNodes.contains(TE.get()))
22226 continue;
22227 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
22228 TE->UserTreeIndex.UserTE->hasState() &&
22229 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
22230 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
22231 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
22232 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
22233 all_of(TE->UserTreeIndex.UserTE->Scalars,
22234 [](Value *V) { return isUsedOutsideBlock(V); })) {
22235 Instruction &LastInst =
22236 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
22237 GatherEntries.emplace_back(TE.get(), &LastInst);
22238 }
22239 }
22240 for (auto &Entry : GatherEntries) {
22241 IRBuilderBase::InsertPointGuard Guard(Builder);
22242 Builder.SetInsertPoint(Entry.second);
22243 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
22244 (void)vectorizeTree(Entry.first);
22245 }
22246 // Emit gathered loads first to emit better code for the users of those
22247 // gathered loads.
22248 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22249 if (DeletedNodes.contains(TE.get()))
22250 continue;
22251 if (GatheredLoadsEntriesFirst.has_value() &&
22252 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
22253 (!TE->isGather() || TE->UserTreeIndex)) {
22254 assert((TE->UserTreeIndex ||
22255 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
22256 "Expected gathered load node.");
22257 (void)vectorizeTree(TE.get());
22258 }
22259 }
22260 (void)vectorizeTree(VectorizableTree[0].get());
22261 // Run through the list of postponed gathers and emit them, replacing the temp
22262 // emitted allocas with actual vector instructions.
22263 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
22265 for (const TreeEntry *E : PostponedNodes) {
22266 auto *TE = const_cast<TreeEntry *>(E);
22267 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
22268 TE->VectorizedValue = nullptr;
22269 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
22270 // If user is a PHI node, its vector code have to be inserted right before
22271 // block terminator. Since the node was delayed, there were some unresolved
22272 // dependencies at the moment when stab instruction was emitted. In a case
22273 // when any of these dependencies turn out an operand of another PHI, coming
22274 // from this same block, position of a stab instruction will become invalid.
22275 // The is because source vector that supposed to feed this gather node was
22276 // inserted at the end of the block [after stab instruction]. So we need
22277 // to adjust insertion point again to the end of block.
22278 if (isa<PHINode>(UserI) ||
22279 (TE->UserTreeIndex.UserTE->hasState() &&
22280 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22281 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
22282 // Insert before all users.
22283 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
22284 for (User *U : PrevVec->users()) {
22285 if (U == UserI)
22286 continue;
22287 auto *UI = dyn_cast<Instruction>(U);
22288 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
22289 continue;
22290 if (UI->comesBefore(InsertPt))
22291 InsertPt = UI;
22292 }
22293 Builder.SetInsertPoint(InsertPt);
22294 } else {
22295 Builder.SetInsertPoint(PrevVec);
22296 }
22297 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
22298 Value *Vec = vectorizeTree(TE);
22299 if (auto *VecI = dyn_cast<Instruction>(Vec);
22300 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
22301 Builder.GetInsertPoint()->comesBefore(VecI))
22302 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
22303 Builder.GetInsertPoint());
22304 if (Vec->getType() != PrevVec->getType()) {
22305 assert(Vec->getType()->isIntOrIntVectorTy() &&
22306 PrevVec->getType()->isIntOrIntVectorTy() &&
22307 "Expected integer vector types only.");
22308 std::optional<bool> IsSigned;
22309 for (Value *V : TE->Scalars) {
22310 if (isVectorized(V)) {
22311 for (const TreeEntry *MNTE : getTreeEntries(V)) {
22312 auto It = MinBWs.find(MNTE);
22313 if (It != MinBWs.end()) {
22314 IsSigned = IsSigned.value_or(false) || It->second.second;
22315 if (*IsSigned)
22316 break;
22317 }
22318 }
22319 if (IsSigned.value_or(false))
22320 break;
22321 // Scan through gather nodes.
22322 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
22323 auto It = MinBWs.find(BVE);
22324 if (It != MinBWs.end()) {
22325 IsSigned = IsSigned.value_or(false) || It->second.second;
22326 if (*IsSigned)
22327 break;
22328 }
22329 }
22330 if (IsSigned.value_or(false))
22331 break;
22332 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
22333 IsSigned =
22334 IsSigned.value_or(false) ||
22335 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
22336 continue;
22337 }
22338 if (IsSigned.value_or(false))
22339 break;
22340 }
22341 }
22342 if (IsSigned.value_or(false)) {
22343 // Final attempt - check user node.
22344 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
22345 if (It != MinBWs.end())
22346 IsSigned = It->second.second;
22347 }
22348 assert(IsSigned &&
22349 "Expected user node or perfect diamond match in MinBWs.");
22350 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
22351 }
22352 PrevVec->replaceAllUsesWith(Vec);
22353 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
22354 // Replace the stub vector node, if it was used before for one of the
22355 // buildvector nodes already.
22356 auto It = PostponedValues.find(PrevVec);
22357 if (It != PostponedValues.end()) {
22358 for (TreeEntry *VTE : It->getSecond())
22359 VTE->VectorizedValue = Vec;
22360 }
22361 eraseInstruction(PrevVec);
22362 }
22363
22364 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
22365 << " values .\n");
22366
22368 // Maps vector instruction to original insertelement instruction
22369 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
22370 // Maps extract Scalar to the corresponding extractelement instruction in the
22371 // basic block. Only one extractelement per block should be emitted.
22373 ScalarToEEs;
22374 SmallDenseSet<Value *, 4> UsedInserts;
22376 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
22378 // Extract all of the elements with the external uses.
22379 for (const auto &ExternalUse : ExternalUses) {
22380 Value *Scalar = ExternalUse.Scalar;
22381 llvm::User *User = ExternalUse.User;
22382
22383 // Skip users that we already RAUW. This happens when one instruction
22384 // has multiple uses of the same value.
22385 if (User && !is_contained(Scalar->users(), User))
22386 continue;
22387 const TreeEntry *E = &ExternalUse.E;
22388 assert(E && "Invalid scalar");
22389 assert(!E->isGather() && "Extracting from a gather list");
22390 // Non-instruction pointers are not deleted, just skip them.
22391 if (E->getOpcode() == Instruction::GetElementPtr &&
22392 !isa<GetElementPtrInst>(Scalar))
22393 continue;
22394
22395 Value *Vec = E->VectorizedValue;
22396 assert(Vec && "Can't find vectorizable value");
22397
22398 Value *Lane = Builder.getInt32(ExternalUse.Lane);
22399 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
22400 if (Scalar->getType() != Vec->getType()) {
22401 Value *Ex = nullptr;
22402 Value *ExV = nullptr;
22403 auto *Inst = dyn_cast<Instruction>(Scalar);
22404 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
22405 auto It = ScalarToEEs.find(Scalar);
22406 if (It != ScalarToEEs.end()) {
22407 // No need to emit many extracts, just move the only one in the
22408 // current block.
22409 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
22410 : Builder.GetInsertBlock());
22411 if (EEIt != It->second.end()) {
22412 Value *PrevV = EEIt->second.first;
22413 if (auto *I = dyn_cast<Instruction>(PrevV);
22414 I && !ReplaceInst &&
22415 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
22416 Builder.GetInsertPoint()->comesBefore(I)) {
22417 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
22418 Builder.GetInsertPoint());
22419 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
22420 CI->moveAfter(I);
22421 }
22422 Ex = PrevV;
22423 ExV = EEIt->second.second ? EEIt->second.second : Ex;
22424 }
22425 }
22426 if (!Ex) {
22427 // "Reuse" the existing extract to improve final codegen.
22428 if (ReplaceInst) {
22429 // Leave the instruction as is, if it cheaper extracts and all
22430 // operands are scalar.
22431 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
22432 IgnoredExtracts.insert(EE);
22433 Ex = EE;
22434 } else {
22435 auto *CloneInst = Inst->clone();
22436 CloneInst->insertBefore(Inst->getIterator());
22437 if (Inst->hasName())
22438 CloneInst->takeName(Inst);
22439 Ex = CloneInst;
22440 }
22441 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
22442 ES && isa<Instruction>(Vec)) {
22443 Value *V = ES->getVectorOperand();
22444 auto *IVec = cast<Instruction>(Vec);
22445 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
22446 V = ETEs.front()->VectorizedValue;
22447 if (auto *IV = dyn_cast<Instruction>(V);
22448 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
22449 IV->comesBefore(IVec))
22450 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
22451 else
22452 Ex = Builder.CreateExtractElement(Vec, Lane);
22453 } else if (auto *VecTy =
22454 dyn_cast<FixedVectorType>(Scalar->getType())) {
22455 assert(SLPReVec && "FixedVectorType is not expected.");
22456 unsigned VecTyNumElements = VecTy->getNumElements();
22457 // When REVEC is enabled, we need to extract a vector.
22458 // Note: The element size of Scalar may be different from the
22459 // element size of Vec.
22460 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
22461 ExternalUse.Lane * VecTyNumElements);
22462 } else {
22463 Ex = Builder.CreateExtractElement(Vec, Lane);
22464 }
22465 // If necessary, sign-extend or zero-extend ScalarRoot
22466 // to the larger type.
22467 ExV = Ex;
22468 if (Scalar->getType() != Ex->getType())
22469 ExV = Builder.CreateIntCast(
22470 Ex, Scalar->getType(),
22471 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
22472 auto *I = dyn_cast<Instruction>(Ex);
22473 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
22474 : &F->getEntryBlock(),
22475 std::make_pair(Ex, ExV));
22476 }
22477 // The then branch of the previous if may produce constants, since 0
22478 // operand might be a constant.
22479 if (auto *ExI = dyn_cast<Instruction>(Ex);
22480 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
22481 GatherShuffleExtractSeq.insert(ExI);
22482 CSEBlocks.insert(ExI->getParent());
22483 }
22484 return ExV;
22485 }
22486 assert(isa<FixedVectorType>(Scalar->getType()) &&
22487 isa<InsertElementInst>(Scalar) &&
22488 "In-tree scalar of vector type is not insertelement?");
22489 auto *IE = cast<InsertElementInst>(Scalar);
22490 VectorToInsertElement.try_emplace(Vec, IE);
22491 return Vec;
22492 };
22493 // If User == nullptr, the Scalar remains as scalar in vectorized
22494 // instructions or is used as extra arg. Generate ExtractElement instruction
22495 // and update the record for this scalar in ExternallyUsedValues.
22496 if (!User) {
22497 if (!ScalarsWithNullptrUser.insert(Scalar).second)
22498 continue;
22499 assert(
22500 (ExternallyUsedValues.count(Scalar) ||
22501 ExternalUsesWithNonUsers.count(Scalar) ||
22502 ExternalUsesAsOriginalScalar.contains(Scalar) ||
22503 any_of(
22504 Scalar->users(),
22505 [&, TTI = TTI](llvm::User *U) {
22506 if (ExternalUsesAsOriginalScalar.contains(U))
22507 return true;
22508 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
22509 return !UseEntries.empty() &&
22510 (E->State == TreeEntry::Vectorize ||
22511 E->State == TreeEntry::StridedVectorize ||
22512 E->State == TreeEntry::CompressVectorize) &&
22513 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
22514 return (UseEntry->State == TreeEntry::Vectorize ||
22515 UseEntry->State ==
22516 TreeEntry::StridedVectorize ||
22517 UseEntry->State ==
22518 TreeEntry::CompressVectorize) &&
22519 doesInTreeUserNeedToExtract(
22520 Scalar, getRootEntryInstruction(*UseEntry),
22521 TLI, TTI);
22522 });
22523 })) &&
22524 "Scalar with nullptr User must be registered in "
22525 "ExternallyUsedValues map or remain as scalar in vectorized "
22526 "instructions");
22527 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
22528 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
22529 if (PHI->getParent()->isLandingPad())
22530 Builder.SetInsertPoint(
22531 PHI->getParent(),
22532 std::next(
22533 PHI->getParent()->getLandingPadInst()->getIterator()));
22534 else
22535 Builder.SetInsertPoint(PHI->getParent(),
22536 PHI->getParent()->getFirstNonPHIIt());
22537 } else {
22538 Builder.SetInsertPoint(VecI->getParent(),
22539 std::next(VecI->getIterator()));
22540 }
22541 } else {
22542 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
22543 }
22544 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22545 // Required to update internally referenced instructions.
22546 if (Scalar != NewInst) {
22547 assert((!isa<ExtractElementInst>(Scalar) ||
22548 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
22549 "Extractelements should not be replaced.");
22550 Scalar->replaceAllUsesWith(NewInst);
22551 }
22552 continue;
22553 }
22554
22555 if (auto *VU = dyn_cast<InsertElementInst>(User);
22556 VU && VU->getOperand(1) == Scalar) {
22557 // Skip if the scalar is another vector op or Vec is not an instruction.
22558 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
22559 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
22560 if (!UsedInserts.insert(VU).second)
22561 continue;
22562 // Need to use original vector, if the root is truncated.
22563 auto BWIt = MinBWs.find(E);
22564 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
22565 auto *ScalarTy = FTy->getElementType();
22566 auto Key = std::make_pair(Vec, ScalarTy);
22567 auto VecIt = VectorCasts.find(Key);
22568 if (VecIt == VectorCasts.end()) {
22569 IRBuilderBase::InsertPointGuard Guard(Builder);
22570 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
22571 if (IVec->getParent()->isLandingPad())
22572 Builder.SetInsertPoint(IVec->getParent(),
22573 std::next(IVec->getParent()
22574 ->getLandingPadInst()
22575 ->getIterator()));
22576 else
22577 Builder.SetInsertPoint(
22578 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
22579 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
22580 Builder.SetInsertPoint(IVec->getNextNode());
22581 }
22582 Vec = Builder.CreateIntCast(
22583 Vec,
22585 ScalarTy,
22586 cast<FixedVectorType>(Vec->getType())->getNumElements()),
22587 BWIt->second.second);
22588 VectorCasts.try_emplace(Key, Vec);
22589 } else {
22590 Vec = VecIt->second;
22591 }
22592 }
22593
22594 std::optional<unsigned> InsertIdx = getElementIndex(VU);
22595 if (InsertIdx) {
22596 auto *It = find_if(
22597 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
22598 // Checks if 2 insertelements are from the same buildvector.
22599 InsertElementInst *VecInsert = Data.InsertElements.front();
22601 VU, VecInsert,
22602 [](InsertElementInst *II) { return II->getOperand(0); });
22603 });
22604 unsigned Idx = *InsertIdx;
22605 if (It == ShuffledInserts.end()) {
22606 (void)ShuffledInserts.emplace_back();
22607 It = std::next(ShuffledInserts.begin(),
22608 ShuffledInserts.size() - 1);
22609 }
22610 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
22611 if (Mask.empty())
22612 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
22613 Mask[Idx] = ExternalUse.Lane;
22614 It->InsertElements.push_back(cast<InsertElementInst>(User));
22615 continue;
22616 }
22617 }
22618 }
22619 }
22620
22621 // Generate extracts for out-of-tree users.
22622 // Find the insertion point for the extractelement lane.
22623 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
22624 if (PHINode *PH = dyn_cast<PHINode>(User)) {
22625 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
22626 if (PH->getIncomingValue(I) == Scalar) {
22627 Instruction *IncomingTerminator =
22628 PH->getIncomingBlock(I)->getTerminator();
22629 if (isa<CatchSwitchInst>(IncomingTerminator)) {
22630 Builder.SetInsertPoint(VecI->getParent(),
22631 std::next(VecI->getIterator()));
22632 } else {
22633 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
22634 }
22635 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22636 PH->setOperand(I, NewInst);
22637 }
22638 }
22639 } else {
22640 Builder.SetInsertPoint(cast<Instruction>(User));
22641 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22642 User->replaceUsesOfWith(Scalar, NewInst);
22643 }
22644 } else {
22645 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
22646 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22647 User->replaceUsesOfWith(Scalar, NewInst);
22648 }
22649
22650 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
22651 }
22652
22653 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
22654 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
22655 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
22656 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
22657 for (int I = 0, E = Mask.size(); I < E; ++I) {
22658 if (Mask[I] < VF)
22659 CombinedMask1[I] = Mask[I];
22660 else
22661 CombinedMask2[I] = Mask[I] - VF;
22662 }
22663 ShuffleInstructionBuilder ShuffleBuilder(
22664 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
22665 ShuffleBuilder.add(V1, CombinedMask1);
22666 if (V2)
22667 ShuffleBuilder.add(V2, CombinedMask2);
22668 return ShuffleBuilder.finalize({}, {}, {});
22669 };
22670
22671 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
22672 bool ForSingleMask) {
22673 unsigned VF = Mask.size();
22674 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
22675 if (VF != VecVF) {
22676 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
22677 Vec = CreateShuffle(Vec, nullptr, Mask);
22678 return std::make_pair(Vec, true);
22679 }
22680 if (!ForSingleMask) {
22681 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
22682 for (unsigned I = 0; I < VF; ++I) {
22683 if (Mask[I] != PoisonMaskElem)
22684 ResizeMask[Mask[I]] = Mask[I];
22685 }
22686 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
22687 }
22688 }
22689
22690 return std::make_pair(Vec, false);
22691 };
22692 // Perform shuffling of the vectorize tree entries for better handling of
22693 // external extracts.
22694 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
22695 // Find the first and the last instruction in the list of insertelements.
22696 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
22697 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
22698 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
22699 Builder.SetInsertPoint(LastInsert);
22700 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
22702 MutableArrayRef(Vector.data(), Vector.size()),
22703 FirstInsert->getOperand(0),
22704 [](Value *Vec) {
22705 return cast<VectorType>(Vec->getType())
22706 ->getElementCount()
22707 .getKnownMinValue();
22708 },
22709 ResizeToVF,
22710 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
22711 ArrayRef<Value *> Vals) {
22712 assert((Vals.size() == 1 || Vals.size() == 2) &&
22713 "Expected exactly 1 or 2 input values.");
22714 if (Vals.size() == 1) {
22715 // Do not create shuffle if the mask is a simple identity
22716 // non-resizing mask.
22717 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
22718 ->getNumElements() ||
22719 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
22720 return CreateShuffle(Vals.front(), nullptr, Mask);
22721 return Vals.front();
22722 }
22723 return CreateShuffle(Vals.front() ? Vals.front()
22724 : FirstInsert->getOperand(0),
22725 Vals.back(), Mask);
22726 });
22727 auto It = ShuffledInserts[I].InsertElements.rbegin();
22728 // Rebuild buildvector chain.
22729 InsertElementInst *II = nullptr;
22730 if (It != ShuffledInserts[I].InsertElements.rend())
22731 II = *It;
22733 while (It != ShuffledInserts[I].InsertElements.rend()) {
22734 assert(II && "Must be an insertelement instruction.");
22735 if (*It == II)
22736 ++It;
22737 else
22738 Inserts.push_back(cast<Instruction>(II));
22739 II = dyn_cast<InsertElementInst>(II->getOperand(0));
22740 }
22741 for (Instruction *II : reverse(Inserts)) {
22742 II->replaceUsesOfWith(II->getOperand(0), NewInst);
22743 if (auto *NewI = dyn_cast<Instruction>(NewInst))
22744 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
22745 II->moveAfter(NewI);
22746 NewInst = II;
22747 }
22748 LastInsert->replaceAllUsesWith(NewInst);
22749 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
22750 IE->replaceUsesOfWith(IE->getOperand(0),
22751 PoisonValue::get(IE->getOperand(0)->getType()));
22752 IE->replaceUsesOfWith(IE->getOperand(1),
22753 PoisonValue::get(IE->getOperand(1)->getType()));
22754 eraseInstruction(IE);
22755 }
22756 CSEBlocks.insert(LastInsert->getParent());
22757 }
22758
22759 SmallVector<Instruction *> RemovedInsts;
22760 // For each vectorized value:
22761 for (auto &TEPtr : VectorizableTree) {
22762 TreeEntry *Entry = TEPtr.get();
22763
22764 // No need to handle users of gathered values.
22765 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
22766 DeletedNodes.contains(Entry) ||
22767 TransformedToGatherNodes.contains(Entry))
22768 continue;
22769
22770 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
22771 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22772 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22773 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22774 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
22775 // Skip constant node
22776 if (!Entry->hasState()) {
22777 assert(allConstant(Entry->Scalars) && "Expected constants only.");
22778 continue;
22779 }
22780 for (Value *Scalar : Entry->Scalars) {
22781 auto *I = dyn_cast<Instruction>(Scalar);
22782
22783 if (!I || Entry->isCopyableElement(I))
22784 continue;
22785 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
22786 RemovedInsts.push_back(I);
22787 }
22788 continue;
22789 }
22790
22791 assert(Entry->VectorizedValue && "Can't find vectorizable value");
22792
22793 // For each lane:
22794 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
22795 Value *Scalar = Entry->Scalars[Lane];
22796
22797 if (Entry->getOpcode() == Instruction::GetElementPtr &&
22798 !isa<GetElementPtrInst>(Scalar))
22799 continue;
22800 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
22801 EE && IgnoredExtracts.contains(EE))
22802 continue;
22803 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
22804 continue;
22805#ifndef NDEBUG
22806 Type *Ty = Scalar->getType();
22807 if (!Ty->isVoidTy()) {
22808 for (User *U : Scalar->users()) {
22809 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
22810
22811 // It is legal to delete users in the ignorelist.
22812 assert((isVectorized(U) ||
22813 (UserIgnoreList && UserIgnoreList->contains(U)) ||
22816 "Deleting out-of-tree value");
22817 }
22818 }
22819#endif
22820 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
22821 auto *I = cast<Instruction>(Scalar);
22822 RemovedInsts.push_back(I);
22823 }
22824 }
22825
22826 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
22827 // new vector instruction.
22828 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
22829 V->mergeDIAssignID(RemovedInsts);
22830
22831 // Clear up reduction references, if any.
22832 if (UserIgnoreList) {
22833 for (Instruction *I : RemovedInsts) {
22834 const TreeEntry *IE = getTreeEntries(I).front();
22835 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(I);
22836 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
22837 IE = SplitEntries.front();
22838 if (IE->Idx != 0 &&
22839 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
22840 (ValueToGatherNodes.lookup(I).contains(
22841 VectorizableTree.front().get()) ||
22842 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
22843 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
22844 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
22845 IE->UserTreeIndex &&
22846 is_contained(VectorizableTree.front()->Scalars, I)) &&
22847 !(GatheredLoadsEntriesFirst.has_value() &&
22848 IE->Idx >= *GatheredLoadsEntriesFirst &&
22849 VectorizableTree.front()->isGather() &&
22850 is_contained(VectorizableTree.front()->Scalars, I)) &&
22851 !(!VectorizableTree.front()->isGather() &&
22852 VectorizableTree.front()->isCopyableElement(I)))
22853 continue;
22854 SmallVector<SelectInst *> LogicalOpSelects;
22855 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
22856 // Do not replace condition of the logical op in form select <cond>.
22857 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
22858 (match(U.getUser(), m_LogicalAnd()) ||
22859 match(U.getUser(), m_LogicalOr())) &&
22860 U.getOperandNo() == 0;
22861 if (IsPoisoningLogicalOp) {
22862 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
22863 return false;
22864 }
22865 return UserIgnoreList->contains(U.getUser());
22866 });
22867 // Replace conditions of the poisoning logical ops with the non-poison
22868 // constant value.
22869 for (SelectInst *SI : LogicalOpSelects)
22870 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
22871 }
22872 }
22873 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
22874 // cache correctness.
22875 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
22876 // - instructions are not deleted until later.
22877 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
22878
22879 Builder.ClearInsertionPoint();
22880 InstrElementSize.clear();
22881
22882 const TreeEntry &RootTE = *VectorizableTree.front();
22883 Value *Vec = RootTE.VectorizedValue;
22884 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
22885 It != MinBWs.end() &&
22886 ReductionBitWidth != It->second.first) {
22887 IRBuilder<>::InsertPointGuard Guard(Builder);
22888 Builder.SetInsertPoint(ReductionRoot->getParent(),
22889 ReductionRoot->getIterator());
22891 Vec = Builder.CreateIntCast(Vec, Builder.getIntNTy(ReductionBitWidth),
22892 It->second.second);
22893
22894 } else {
22895 Vec = Builder.CreateIntCast(
22896 Vec,
22897 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
22898 cast<VectorType>(Vec->getType())->getElementCount()),
22899 It->second.second);
22900 }
22901 }
22902 return Vec;
22903}
22904
// NOTE(review): the enclosing function header (presumably
// `void BoUpSLP::optimizeGatherSequence()`, original line 22905) was elided
// by the doc extraction. The body below does two things: (1) hoists
// loop-invariant gather/shuffle/extract-sequence instructions into loop
// preheaders (a limited LICM), and (2) performs a dominance-ordered CSE over
// the gather/shuffle sequences, replacing "less defined" shuffles with more
// defined identical copies. Confirm against upstream SLPVectorizer.cpp.
22906 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
22907 << " gather sequences instructions.\n");
22908 // LICM InsertElementInst sequences.
22909 for (Instruction *I : GatherShuffleExtractSeq) {
22910 if (isDeleted(I))
22911 continue;
22912
22913 // Check if this block is inside a loop.
22914 Loop *L = LI->getLoopFor(I->getParent());
22915 if (!L)
22916 continue;
22917
22918 // Check if it has a preheader.
22919 BasicBlock *PreHeader = L->getLoopPreheader();
22920 if (!PreHeader)
22921 continue;
22922
22923 // If the vector or the element that we insert into it are
22924 // instructions that are defined in this basic block then we can't
22925 // hoist this instruction.
22926 if (any_of(I->operands(), [L](Value *V) {
22927 auto *OpI = dyn_cast<Instruction>(V);
22928 return OpI && L->contains(OpI);
22929 }))
22930 continue;
22931
22932 // We can hoist this instruction. Move it to the pre-header.
22933 I->moveBefore(PreHeader->getTerminator()->getIterator());
22934 CSEBlocks.insert(PreHeader);
22935 }
22936
22937 // Make a list of all reachable blocks in our CSE queue.
// NOTE(review): original line 22938 — apparently the declaration of
// CSEWorkList (a worklist of DomTreeNode*) — was dropped by the extraction.
22939 CSEWorkList.reserve(CSEBlocks.size());
22940 for (BasicBlock *BB : CSEBlocks)
22941 if (DomTreeNode *N = DT->getNode(BB)) {
22942 assert(DT->isReachableFromEntry(N));
22943 CSEWorkList.push_back(N);
22944 }
22945
22946 // Sort blocks by domination. This ensures we visit a block after all blocks
22947 // dominating it are visited.
22948 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
22949 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
22950 "Different nodes should have different DFS numbers");
22951 return A->getDFSNumIn() < B->getDFSNumIn();
22952 });
22953
22954 // Less defined shuffles can be replaced by the more defined copies.
22955 // Between two shuffles one is less defined if it has the same vector operands
22956 // and its mask indices are the same as in the first one or undefs. E.g.
22957 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
22958 // poison, <0, 0, 0, 0>.
22959 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
22960 Instruction *I2,
22961 SmallVectorImpl<int> &NewMask) {
22962 if (I1->getType() != I2->getType())
22963 return false;
22964 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
22965 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
22966 if (!SI1 || !SI2)
22967 return I1->isIdenticalTo(I2);
22968 if (SI1->isIdenticalTo(SI2))
22969 return true;
22970 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
22971 if (SI1->getOperand(I) != SI2->getOperand(I))
22972 return false;
22973 // Check if the second instruction is more defined than the first one.
22974 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
22975 ArrayRef<int> SM1 = SI1->getShuffleMask();
22976 // Count trailing undefs in the mask to check the final number of used
22977 // registers.
22978 unsigned LastUndefsCnt = 0;
22979 for (int I = 0, E = NewMask.size(); I < E; ++I) {
22980 if (SM1[I] == PoisonMaskElem)
22981 ++LastUndefsCnt;
22982 else
22983 LastUndefsCnt = 0;
22984 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
22985 NewMask[I] != SM1[I])
22986 return false;
22987 if (NewMask[I] == PoisonMaskElem)
22988 NewMask[I] = SM1[I];
22989 }
22990 // Check if the last undefs actually change the final number of used vector
22991 // registers.
22992 return SM1.size() - LastUndefsCnt > 1 &&
22993 ::getNumberOfParts(*TTI, SI1->getType()) ==
// NOTE(review): original line 22994 — presumably the opening
// `::getNumberOfParts(` of the right-hand comparison — was dropped by the
// extraction.
22995 *TTI, getWidenedType(SI1->getType()->getElementType(),
22996 SM1.size() - LastUndefsCnt));
22997 };
22998 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
22999 // instructions. TODO: We can further optimize this scan if we split the
23000 // instructions into different buckets based on the insert lane.
// NOTE(review): original line 23001 — apparently the declaration of the
// `Visited` instruction list used below — was dropped by the extraction.
23002 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
23003 assert(*I &&
23004 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
23005 "Worklist not sorted properly!");
23006 BasicBlock *BB = (*I)->getBlock();
23007 // For all instructions in blocks containing gather sequences:
23008 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
23009 if (isDeleted(&In))
23010 continue;
// NOTE(review): original line 23011 — the opening `if (` condition paired
// with the `!GatherShuffleExtractSeq.contains(&In)` clause below — was
// dropped by the extraction.
23012 !GatherShuffleExtractSeq.contains(&In))
23013 continue;
23014
23015 // Check if we can replace this instruction with any of the
23016 // visited instructions.
23017 bool Replaced = false;
// Note: V is bound by reference-to-pointer so the visited slot itself can be
// rebound to &In in the second replacement case below.
23018 for (Instruction *&V : Visited) {
23019 SmallVector<int> NewMask;
23020 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
23021 DT->dominates(V->getParent(), In.getParent())) {
23022 In.replaceAllUsesWith(V);
23023 eraseInstruction(&In);
23024 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
23025 if (!NewMask.empty())
23026 SI->setShuffleMask(NewMask);
23027 Replaced = true;
23028 break;
23029 }
// NOTE(review): original line 23030 — the opening `if (` of the reverse
// replacement case (replace the visited copy with In) — was dropped by the
// extraction.
23031 GatherShuffleExtractSeq.contains(V) &&
23032 IsIdenticalOrLessDefined(V, &In, NewMask) &&
23033 DT->dominates(In.getParent(), V->getParent())) {
23034 In.moveAfter(V);
23035 V->replaceAllUsesWith(&In);
// NOTE(review): original line 23036 — likely the erasure of the replaced
// instruction V — was dropped by the extraction; confirm upstream.
23037 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
23038 if (!NewMask.empty())
23039 SI->setShuffleMask(NewMask);
23040 V = &In;
23041 Replaced = true;
23042 break;
23043 }
23044 }
23045 if (!Replaced) {
23046 assert(!is_contained(Visited, &In));
23047 Visited.push_back(&In);
23048 }
23049 }
23050 }
23051 CSEBlocks.clear();
23052 GatherShuffleExtractSeq.clear();
23053}
23054
23055BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
23056 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
23057 auto &BundlePtr =
23058 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
23059 for (Value *V : VL) {
23060 if (S.isNonSchedulable(V))
23061 continue;
23062 auto *I = cast<Instruction>(V);
23063 if (S.isCopyableElement(V)) {
23064 // Add a copyable element model.
23065 ScheduleCopyableData &SD =
23066 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
23067 // Group the instructions to a bundle.
23068 BundlePtr->add(&SD);
23069 continue;
23070 }
23071 ScheduleData *BundleMember = getScheduleData(V);
23072 assert(BundleMember && "no ScheduleData for bundle member "
23073 "(maybe not in same basic block)");
23074 // Group the instructions to a bundle.
23075 BundlePtr->add(BundleMember);
23076 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
23077 BundlePtr.get());
23078 }
23079 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
23080 return *BundlePtr;
23081}
23082
23083// Groups the instructions to a bundle (which is then a single scheduling entity)
23084// and schedules instructions until the bundle gets ready.
23085std::optional<BoUpSLP::ScheduleBundle *>
23086BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
23087 const InstructionsState &S,
23088 const EdgeInfo &EI) {
23089 // No need to schedule PHIs, insertelement, extractelement and extractvalue
23090 // instructions.
23091 if (isa<PHINode>(S.getMainOp()) ||
23092 isVectorLikeInstWithConstOps(S.getMainOp()))
23093 return nullptr;
23094 // If the parent node is non-schedulable and the current node is copyable, and
23095 // any of parent instructions are used outside several basic blocks or in
23096 // bin-op node - cancel scheduling, it may cause wrong def-use deps in
23097 // analysis, leading to a crash.
23098 // Non-scheduled nodes may not have related ScheduleData model, which may lead
23099 // to a skipped dep analysis.
23100 bool HasCopyables = S.areInstructionsWithCopyableElements();
23101 bool DoesNotRequireScheduling =
23102 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
23103 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); });
23104 if (!DoesNotRequireScheduling && S.areInstructionsWithCopyableElements() &&
23105 EI && EI.UserTE->hasState() && EI.UserTE->doesNotNeedToSchedule() &&
23106 EI.UserTE->getOpcode() != Instruction::PHI &&
23107 EI.UserTE->getOpcode() != Instruction::InsertElement &&
23108 any_of(EI.UserTE->Scalars, [](Value *V) {
23109 auto *I = dyn_cast<Instruction>(V);
23110 if (!I)
23111 return false;
23112 for (User *U : I->users()) {
23113 auto *UI = cast<Instruction>(U);
23114 if (isa<BinaryOperator>(UI))
23115 return true;
23116 }
23117 return false;
23118 }))
23119 return std::nullopt;
23120 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
23121 EI.UserTE->hasCopyableElements() &&
23122 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
23123 all_of(VL, [&](Value *V) {
23124 if (S.isCopyableElement(V))
23125 return true;
23126 return isUsedOutsideBlock(V);
23127 }))
23128 return std::nullopt;
23129 // If any instruction is used outside block only and its operand is placed
23130 // immediately before it, do not schedule, it may cause wrong def-use chain.
23131 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
23132 if (isa<PoisonValue>(V) || S.isCopyableElement(V))
23133 return false;
23134 if (isUsedOutsideBlock(V)) {
23135 for (Value *Op : cast<Instruction>(V)->operands()) {
23136 auto *I = dyn_cast<Instruction>(Op);
23137 if (!I)
23138 continue;
23139 return SLP->isVectorized(I) && I->getNextNode() == V;
23140 }
23141 }
23142 return false;
23143 }))
23144 return std::nullopt;
23145 if (S.areInstructionsWithCopyableElements() && EI) {
23146 bool IsNonSchedulableWithParentPhiNode =
23147 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
23148 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
23149 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
23150 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
23151 if (IsNonSchedulableWithParentPhiNode) {
23152 SmallSet<std::pair<Value *, Value *>, 4> Values;
23153 for (const auto [Idx, V] :
23154 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
23155 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
23156 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
23157 auto *I = dyn_cast<Instruction>(Op);
23158 if (!I || !isCommutative(I))
23159 continue;
23160 if (!Values.insert(std::make_pair(V, Op)).second)
23161 return std::nullopt;
23162 }
23163 } else {
23164 // If any of the parent requires scheduling - exit, complex dep between
23165 // schedulable/non-schedulable parents.
23166 if (any_of(EI.UserTE->Scalars, [&](Value *V) {
23167 if (EI.UserTE->hasCopyableElements() &&
23168 EI.UserTE->isCopyableElement(V))
23169 return false;
23170 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
23171 return any_of(Entries, [](const TreeEntry *TE) {
23172 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
23173 TE->UserTreeIndex.UserTE->hasState() &&
23174 TE->UserTreeIndex.UserTE->State !=
23175 TreeEntry::SplitVectorize &&
23176 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
23177 });
23178 }))
23179 return std::nullopt;
23180 }
23181 }
23182 if (DoesNotRequireScheduling) {
23183 // If all operands were replaced by copyables, the operands of this node
23184 // might be not, so need to recalculate dependencies for schedule data,
23185 // replaced by copyable schedule data.
23186 for (Value *V : VL) {
23187 auto *I = dyn_cast<Instruction>(V);
23188 if (!I || (HasCopyables && S.isCopyableElement(V)))
23189 continue;
23190 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
23191 for (const Use &U : I->operands()) {
23192 unsigned &NumOps =
23193 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
23194 .first->getSecond();
23195 ++NumOps;
23196 if (auto *Op = dyn_cast<Instruction>(U.get());
23197 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
23198 if (ScheduleData *OpSD = getScheduleData(Op);
23199 OpSD && OpSD->hasValidDependencies())
23200 // TODO: investigate how to improve it instead of early exiting.
23201 return std::nullopt;
23202 }
23203 }
23204 }
23205 return nullptr;
23206 }
23207
23208 // Any schedulable copyable with split vectorize parent - skip, not supported
23209 // currently.
23210 // TODO: investigate fix for this early exit.
23211 if (S.areInstructionsWithCopyableElements() && EI.UserTE &&
23212 EI.UserTE->State == TreeEntry::SplitVectorize &&
23213 any_of(VL, [&](Value *V) {
23214 return !S.isNonSchedulable(V) && S.isCopyableElement(V);
23215 }))
23216 return std::nullopt;
23217
23218 // Initialize the instruction bundle.
23219 Instruction *OldScheduleEnd = ScheduleEnd;
23220 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
23221
23222 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
23223 // Clear deps or recalculate the region, if the memory instruction is a
23224 // copyable. It may have memory deps, which must be recalculated.
23225 SmallVector<ScheduleData *> ControlDependentMembers;
23226 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
23227 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
23228 for (ScheduleEntity *SE : Bundle.getBundle()) {
23229 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
23230 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
23231 BundleMember && BundleMember->hasValidDependencies()) {
23232 BundleMember->clearDirectDependencies();
23233 if (RegionHasStackSave ||
23235 BundleMember->getInst()))
23236 ControlDependentMembers.push_back(BundleMember);
23237 }
23238 continue;
23239 }
23240 auto *SD = cast<ScheduleData>(SE);
23241 if (SD->hasValidDependencies() &&
23242 (!S.areInstructionsWithCopyableElements() ||
23243 !S.isCopyableElement(SD->getInst())) &&
23244 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
23245 EI.UserTE->hasState() &&
23246 (!EI.UserTE->hasCopyableElements() ||
23247 !EI.UserTE->isCopyableElement(SD->getInst())))
23248 SD->clearDirectDependencies();
23249 for (const Use &U : SD->getInst()->operands()) {
23250 unsigned &NumOps =
23251 UserOpToNumOps
23252 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
23253 .first->getSecond();
23254 ++NumOps;
23255 if (auto *Op = dyn_cast<Instruction>(U.get());
23256 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
23257 *SLP, NumOps)) {
23258 if (ScheduleData *OpSD = getScheduleData(Op);
23259 OpSD && OpSD->hasValidDependencies()) {
23260 OpSD->clearDirectDependencies();
23261 if (RegionHasStackSave ||
23263 ControlDependentMembers.push_back(OpSD);
23264 }
23265 }
23266 }
23267 }
23268 };
23269 // The scheduling region got new instructions at the lower end (or it is a
23270 // new region for the first bundle). This makes it necessary to
23271 // recalculate all dependencies.
23272 // It is seldom that this needs to be done a second time after adding the
23273 // initial bundle to the region.
23274 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
23275 for_each(ScheduleDataMap, [&](auto &P) {
23276 if (BB != P.first->getParent())
23277 return;
23278 ScheduleData *SD = P.second;
23279 if (isInSchedulingRegion(*SD))
23280 SD->clearDependencies();
23281 });
23282 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
23283 for_each(P.second, [&](ScheduleCopyableData *SD) {
23284 if (isInSchedulingRegion(*SD))
23285 SD->clearDependencies();
23286 });
23287 });
23288 ReSchedule = true;
23289 }
23290 // Check if the bundle data has deps for copyable elements already. In
23291 // this case need to reset deps and recalculate it.
23292 if (Bundle && !Bundle.getBundle().empty()) {
23293 if (S.areInstructionsWithCopyableElements() ||
23294 !ScheduleCopyableDataMap.empty())
23295 CheckIfNeedToClearDeps(Bundle);
23296 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
23297 << BB->getName() << "\n");
23298 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
23299 ControlDependentMembers);
23300 } else if (!ControlDependentMembers.empty()) {
23301 ScheduleBundle Invalid = ScheduleBundle::invalid();
23302 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
23303 ControlDependentMembers);
23304 }
23305
23306 if (ReSchedule) {
23307 resetSchedule();
23308 initialFillReadyList(ReadyInsts);
23309 }
23310
23311 // Now try to schedule the new bundle or (if no bundle) just calculate
23312 // dependencies. As soon as the bundle is "ready" it means that there are no
23313 // cyclic dependencies and we can schedule it. Note that's important that we
23314 // don't "schedule" the bundle yet.
23315 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
23316 !ReadyInsts.empty()) {
23317 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
23318 assert(Picked->isReady() && "must be ready to schedule");
23319 schedule(*SLP, S, EI, Picked, ReadyInsts);
23320 if (Picked == &Bundle)
23321 break;
23322 }
23323 };
23324
23325 // Make sure that the scheduling region contains all
23326 // instructions of the bundle.
23327 for (Value *V : VL) {
23328 if (S.isNonSchedulable(V))
23329 continue;
23330 if (!extendSchedulingRegion(V, S)) {
23331 // If the scheduling region got new instructions at the lower end (or it
23332 // is a new region for the first bundle). This makes it necessary to
23333 // recalculate all dependencies.
23334 // Otherwise the compiler may crash trying to incorrectly calculate
23335 // dependencies and emit instruction in the wrong order at the actual
23336 // scheduling.
23337 ScheduleBundle Invalid = ScheduleBundle::invalid();
23338 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
23339 return std::nullopt;
23340 }
23341 }
23342
23343 bool ReSchedule = false;
23344 for (Value *V : VL) {
23345 if (S.isNonSchedulable(V))
23346 continue;
23348 getScheduleCopyableData(cast<Instruction>(V));
23349 if (!CopyableData.empty()) {
23350 for (ScheduleCopyableData *SD : CopyableData)
23351 ReadyInsts.remove(SD);
23352 }
23353 ScheduleData *BundleMember = getScheduleData(V);
23354 assert((BundleMember || S.isCopyableElement(V)) &&
23355 "no ScheduleData for bundle member (maybe not in same basic block)");
23356 if (!BundleMember)
23357 continue;
23358
23359 // Make sure we don't leave the pieces of the bundle in the ready list when
23360 // whole bundle might not be ready.
23361 ReadyInsts.remove(BundleMember);
23362 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
23363 !Bundles.empty()) {
23364 for (ScheduleBundle *B : Bundles)
23365 ReadyInsts.remove(B);
23366 }
23367
23368 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
23369 continue;
23370 // A bundle member was scheduled as single instruction before and now
23371 // needs to be scheduled as part of the bundle. We just get rid of the
23372 // existing schedule.
23373 // A bundle member has deps calculated before it was copyable element - need
23374 // to reschedule.
23375 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
23376 << " was already scheduled\n");
23377 ReSchedule = true;
23378 }
23379
23380 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
23381 TryScheduleBundleImpl(ReSchedule, Bundle);
23382 if (!Bundle.isReady()) {
23383 for (ScheduleEntity *BD : Bundle.getBundle()) {
23384 // Copyable data scheduling is just removed.
23386 continue;
23387 if (BD->isReady()) {
23388 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
23389 if (Bundles.empty()) {
23390 ReadyInsts.insert(BD);
23391 continue;
23392 }
23393 for (ScheduleBundle *B : Bundles)
23394 if (B->isReady())
23395 ReadyInsts.insert(B);
23396 }
23397 }
23398 ScheduledBundlesList.pop_back();
23399 SmallVector<ScheduleData *> ControlDependentMembers;
23400 for (Value *V : VL) {
23401 if (S.isNonSchedulable(V))
23402 continue;
23403 auto *I = cast<Instruction>(V);
23404 if (S.isCopyableElement(I)) {
23405 // Remove the copyable data from the scheduling region and restore
23406 // previous mappings.
23407 auto KV = std::make_pair(EI, I);
23408 assert(ScheduleCopyableDataMap.contains(KV) &&
23409 "no ScheduleCopyableData for copyable element");
23410 ScheduleCopyableData *SD =
23411 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
23412 ScheduleCopyableDataMapByUsers[I].remove(SD);
23413 if (EI.UserTE) {
23414 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
23415 const auto *It = find(Op, I);
23416 assert(It != Op.end() && "Lane not set");
23417 SmallPtrSet<Instruction *, 4> Visited;
23418 do {
23419 int Lane = std::distance(Op.begin(), It);
23420 assert(Lane >= 0 && "Lane not set");
23421 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
23422 !EI.UserTE->ReorderIndices.empty())
23423 Lane = EI.UserTE->ReorderIndices[Lane];
23424 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
23425 "Couldn't find extract lane");
23426 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
23427 if (!Visited.insert(In).second) {
23428 It = find(make_range(std::next(It), Op.end()), I);
23429 break;
23430 }
23431 ScheduleCopyableDataMapByInstUser
23432 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
23433 .pop_back();
23434 It = find(make_range(std::next(It), Op.end()), I);
23435 } while (It != Op.end());
23436 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
23437 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
23438 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
23439 }
23440 if (ScheduleCopyableDataMapByUsers[I].empty())
23441 ScheduleCopyableDataMapByUsers.erase(I);
23442 ScheduleCopyableDataMap.erase(KV);
23443 // Need to recalculate dependencies for the actual schedule data.
23444 if (ScheduleData *OpSD = getScheduleData(I);
23445 OpSD && OpSD->hasValidDependencies()) {
23446 OpSD->clearDirectDependencies();
23447 if (RegionHasStackSave ||
23449 ControlDependentMembers.push_back(OpSD);
23450 }
23451 continue;
23452 }
23453 ScheduledBundles.find(I)->getSecond().pop_back();
23454 }
23455 if (!ControlDependentMembers.empty()) {
23456 ScheduleBundle Invalid = ScheduleBundle::invalid();
23457 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
23458 ControlDependentMembers);
23459 }
23460 return std::nullopt;
23461 }
23462 return &Bundle;
23463}
23464
23465BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
23466 // Allocate a new ScheduleData for the instruction.
23467 if (ChunkPos >= ChunkSize) {
23468 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
23469 ChunkPos = 0;
23470 }
23471 return &(ScheduleDataChunks.back()[ChunkPos++]);
23472}
23473
// Grow the current scheduling region so that it covers the instruction for
// value \p V (which must live in block BB). Returns true on success; returns
// false when the walk would exceed ScheduleRegionSizeLimit, in which case the
// caller must give up on scheduling this bundle.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
  // Already inside the region: nothing to do.
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
  // against the budget. Otherwise debug info could affect codegen.
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  // Skip assume-like intrinsics before starting the two-direction walk.
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    // Each step in either direction counts against the region-size budget.
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  // I was found above the current region (either directly by the upward walk,
  // or indirectly because the downward walk ran off the block end): extend the
  // region start upwards to include it.
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  // Otherwise I lies below the current region: extend the region end past it.
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
23539
// Create (or re-initialize) ScheduleData for every instruction in the
// half-open range [FromI, ToI), and splice memory-accessing instructions into
// the region's singly-linked load/store chain between \p PrevLoadStore and
// \p NextLoadStore.
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  // Tail of the load/store chain built so far.
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (isa<PHINode>(I))
      continue;
    // Reuse an existing ScheduleData object (from a previous region) if one
    // was already allocated for this instruction; otherwise take a slot from
    // the chunk allocator.
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    auto CanIgnoreLoad = [](const Instruction *I) {
      const auto *LI = dyn_cast<LoadInst>(I);
      // If there is a simple load marked as invariant, we can ignore it.
      // But, in the (unlikely) case of non-simple invariant load,
      // we should not ignore it.
      return LI && LI->isSimple() &&
             LI->getMetadata(LLVMContext::MD_invariant_load);
    };

    if (I->mayReadOrWriteMemory() &&
        // Simple InvariantLoad does not depend on other memory accesses.
        !CanIgnoreLoad(I) &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

      // NOTE(review): guard condition for this flag (presumably a
      // stacksave/stackrestore match on I) is truncated in this excerpt —
      // confirm against upstream before relying on it.
      RegionHasStackSave = true;
  }
  // Connect the newly built chain to the rest of the region, or record the
  // new chain tail if we extended past the previous last memory instruction.
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
23594
// Compute (or refresh) the dependency graph for every schedule entity
// reachable from \p Bundle and the extra \p ControlDeps seeds: def-use
// dependencies, control dependencies (early exits, stacksave/stackrestore,
// allocas) and memory dependencies (aliasing loads/stores). Entities that
// become ready are optionally pushed into the ready list when
// \p InsertInReadyList is set.
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  // Recalculates dependencies for a single entity and queues its (transitive)
  // users for processing.
  auto ProcessNode = [&](ScheduleEntity *SE) {
    // Copyable schedule data: dependencies come from the lanes of the user
    // tree entry rather than from IR def-use chains.
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        // Visit every lane of the user operand that holds this instruction.
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            // We may have not have related copyable scheduling data, if the
            // instruction is non-schedulable.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = find(make_range(std::next(It), Op.end()), CD->getInst());
        } while (It != Op.end());
        if (CD->isReady() && CD->getDependencies() == 0 &&
            (EI.UserTE->hasState() &&
             (EI.UserTE->getMainOp()->getParent() !=
                  CD->getInst()->getParent() ||
              (isa<PHINode>(EI.UserTE->getMainOp()) &&
               (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
                any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
                  auto *IU = dyn_cast<Instruction>(U);
                  if (!IU)
                    return true;
                  return IU->getParent() == EI.UserTE->getMainOp()->getParent();
                })))))) {
          // If no uses in the block - mark as having pseudo-use, which cannot
          // be scheduled.
          // Prevents incorrect def-use tracking between external user and
          // actual instruction.
          CD->incDependencies();
          CD->incrementUnscheduledDeps(1);
        }
      }
      return;
    }
    // Regular ScheduleData: dependencies come from IR users plus control and
    // memory ordering constraints.
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (isa<PHINode>(U))
        continue;
      if (ScheduleData *UseSD = getScheduleData(U)) {
        // The operand is a copyable element - skip.
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    // Copyable users of this instruction also count as dependencies.
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    // Records that I must be scheduled after BundleMember (control edge).
    auto MakeControlDependent = [&](Instruction *I) {
      // Do not mark control dependent twice.
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control depend on any early exit or non-willreturn call
    // which proceeds it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
          continue;

        // Add the dependency
        MakeControlDependent(I);

        // Everything past here must be control dependent on I.
        break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloc alloca instruction, it needs to be scheduled
      // after any preceeding stacksave. We also need to prevent any alloca
      // from reordering above a preceeding stackrestore.
      // NOTE(review): the second match() arm below is truncated in this
      // excerpt (presumably m_Intrinsic<Intrinsic::stackrestore>()).
      if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependend on BundleMember->Inst.
            break;

          if (!isa<AllocaInst>(I))
            continue;

          // Add the dependency
          MakeControlDependent(I);
        }
      }

      // In addition to the cases handle just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below stackrestore is currently
      // thought to be conservatism. Moving loads/stores below a stackrestore
      // can lead to incorrect code.
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
            continue;

          // Add the dependency
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    // Walk the load/store chain after SrcInst and add memory dependencies to
    // every access that may conflict with it.
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");

      // We have two limits to reduce the complexity:
      // 1) AliasedCheckLimit: It's a small limit to reduce calls to
      //    SLP->isAliased (which is the expensive part in this loop).
      // 2) MaxMemDepDistance: It's for very large blocks and it aborts
      //    the whole loop (even if the loop is fast, it's quadratic).
      //    It's important for the loop break condition (see below) to
      //    check this limit even between two read-only instructions.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {

        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        NumAliased++;

        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }

      // Example, explaining the loop break condition: Let's assume our
      // starting instruction is i0 and MaxMemDepDistance = 3.
      //
      //                      +--------v--v--v
      //             i0,i1,i2,i3,i4,i5,i6,i7,i8
      //             +--------^--^--^
      //
      // MaxMemDepDistance let us stop alias-checking at i3 and we add
      // dependencies from i0 to i3,i4,.. (even if they are not aliased).
      // Previously we already added dependencies from i3 to i6,i7,i8
      // (because of MaxMemDepDistance). As we added a dependency from
      // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
      // and we can abort this loop at i6.
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;
      DistToSrc++;
    }
  };

  // Seed the worklist with the bundle's first member and any explicit
  // control-dependency seeds, then drain it.
  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    // Copyable data always carries its own single bundle; plain data may be
    // part of several bundles.
    // NOTE(review): the declaration of Bundles is truncated in this excerpt.
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    // Process all members of every not-yet-visited bundle containing SD.
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
23887
23888void BoUpSLP::BlockScheduling::resetSchedule() {
23889 assert(ScheduleStart &&
23890 "tried to reset schedule on block which has not been scheduled");
23891 for_each(ScheduleDataMap, [&](auto &P) {
23892 if (BB != P.first->getParent())
23893 return;
23894 ScheduleData *SD = P.second;
23895 if (isInSchedulingRegion(*SD)) {
23896 SD->setScheduled(/*Scheduled=*/false);
23897 SD->resetUnscheduledDeps();
23898 }
23899 });
23900 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
23901 for_each(P.second, [&](ScheduleCopyableData *SD) {
23902 if (isInSchedulingRegion(*SD)) {
23903 SD->setScheduled(/*Scheduled=*/false);
23904 SD->resetUnscheduledDeps();
23905 }
23906 });
23907 });
23908 for_each(ScheduledBundles, [&](auto &P) {
23909 for_each(P.second, [&](ScheduleBundle *Bundle) {
23910 if (isInSchedulingRegion(*Bundle))
23911 Bundle->setScheduled(/*Scheduled=*/false);
23912 });
23913 });
23914 // Reset schedule data for copyable elements.
23915 for (auto &P : ScheduleCopyableDataMap) {
23916 if (isInSchedulingRegion(*P.second)) {
23917 P.second->setScheduled(/*Scheduled=*/false);
23918 P.second->resetUnscheduledDeps();
23919 }
23920 }
23921 ReadyInsts.clear();
23922}
23923
// Perform the "real" scheduling for the region owned by \p BS: assign
// scheduling priorities in original source order, ensure all dependency data
// is calculated, then repeatedly pick the highest-priority ready entity and
// physically move its instruction(s) into their final position.
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  // Nothing was ever added to the scheduling region of this block.
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      // Instruction belongs to one or more vector bundles: prioritize the
      // bundles (and any copyable data bundles) instead of the instruction.
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    // NOTE(review): the declaration of CopyableData is truncated in this
    // excerpt (presumably the result of getScheduleCopyableDataUsers(I)).
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!CopyableData.empty() ||
          any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
            assert(TE->isGather() && "expected gather node");
            return TE->hasState() && TE->hasCopyableElements() &&
                   TE->isCopyableElement(I);
          })) {
        SD->clearDirectDependencies();
        // Need to calculate deps for these nodes to correctly handle copyable
        // dependencies, even if they were cancelled.
        // If copyables bundle was cancelled, the deps are cleared and need to
        // recalculate them.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // If copyable must be schedule as part of something else, skip it.
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveAfter(LastScheduledInst->getPrevNode());
        LastScheduledInst = PickedInst;
      }
      // Remember where this tree entry's last instruction ended up.
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      // Single (non-bundled) instruction.
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
24061
// BoUpSLP::getVectorElementSize -- compute the element bit-width that should
// drive vectorization-factor decisions for the expression feeding V,
// preferring the width of memory operations (stored/loaded values) over V's
// own type width. Results are memoized in InstrElementSize.
// NOTE(review): this is a doxygen-extracted listing; a few hyperlinked lines
// (24062 signature, 24080-24081, 24106, 24113-24114) are not visible here,
// so the annotations below are based only on the visible lines.
24063 // If V is a store, just return the width of the stored value (or value
24064 // truncated just before storing) without traversing the expression tree.
24065 // This is the common case.
24066 if (auto *Store = dyn_cast<StoreInst>(V))
24067 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
24068
// For insertelement, the width of interest is that of the inserted scalar
// (operand 1), so recurse on it.
24069 if (auto *IEI = dyn_cast<InsertElementInst>(V))
24070 return getVectorElementSize(IEI->getOperand(1));
24071
// Return the memoized width if V was analyzed before.
24072 auto E = InstrElementSize.find(V);
24073 if (E != InstrElementSize.end())
24074 return E->second;
24075
24076 // If V is not a store, we can traverse the expression tree to find loads
24077 // that feed it. The type of the loaded value may indicate a more suitable
24078 // width than V's type. We want to base the vector element size on the width
24079 // of memory operations where possible.
// NOTE(review): the worklist/visited-set declarations (lines 24080-24081)
// are not visible in this extraction.
24082 if (auto *I = dyn_cast<Instruction>(V)) {
24083 Worklist.emplace_back(I, I->getParent(), 0);
24084 Visited.insert(I);
24085 }
24086
24087 // Traverse the expression tree in bottom-up order looking for loads. If we
24088 // encounter an instruction we don't yet handle, we give up.
24089 auto Width = 0u;
// First value seen whose type is not i1; used below as a fallback when V
// itself is a boolean.
24090 Value *FirstNonBool = nullptr;
24091 while (!Worklist.empty()) {
24092 auto [I, Parent, Level] = Worklist.pop_back_val();
24093
24094 // We should only be looking at scalar instructions here. If the current
24095 // instruction has a vector type, skip.
24096 auto *Ty = I->getType();
24097 if (isa<VectorType>(Ty))
24098 continue;
24099 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
24100 FirstNonBool = I;
// Bound traversal depth to keep compile time in check.
24101 if (Level > RecursionMaxDepth)
24102 continue;
24103
24104 // If the current instruction is a load, update MaxWidth to reflect the
24105 // width of the loaded value.
// NOTE(review): the guarding instruction-kind check (line 24106) is not
// visible in this extraction.
24107 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
24108
24109 // Otherwise, we need to visit the operands of the instruction. We only
24110 // handle the interesting cases from buildTree here. If an operand is an
24111 // instruction we haven't yet visited and from the same basic block as the
24112 // user or the use is a PHI node, we add it to the worklist.
// NOTE(review): the instruction-kind filter (lines 24113-24114) is not
// visible in this extraction.
24115 for (Use &U : I->operands()) {
24116 if (auto *J = dyn_cast<Instruction>(U.get()))
24117 if (Visited.insert(J).second &&
24118 (isa<PHINode>(I) || J->getParent() == Parent)) {
24119 Worklist.emplace_back(J, J->getParent(), Level + 1);
24120 continue;
24121 }
24122 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
24123 FirstNonBool = U.get();
24124 }
24125 } else {
// Unhandled instruction kind: abandon the traversal and fall back to
// the width of V (or FirstNonBool) below.
24126 break;
24127 }
24128 }
24129
24130 // If we didn't encounter a memory access in the expression tree, or if we
24131 // gave up for some reason, just return the width of V. Otherwise, return the
24132 // maximum width we found.
24133 if (!Width) {
// i1 has no useful element width for VF selection; prefer the first
// non-boolean value encountered, if any.
24134 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
24135 V = FirstNonBool;
24136 Width = DL->getTypeSizeInBits(V->getType());
24137 }
24138
// Cache the computed width for every visited instruction so repeated
// queries are O(1).
24139 for (Instruction *I : Visited)
24140 InstrElementSize[I] = Width;
24141
24142 return Width;
24143}
24144
// BoUpSLP::collectValuesToDemote -- determine whether the scalars of tree
// entry E can be computed in a smaller integer bit-width (demoted). On
// success, demotable entry indices are appended to ToDemote, BitWidth is
// raised to the minimal width proven safe, and MaxDepthLevel records how
// deep the analysis went. Returns true if E (and, where required, its
// operand entries) can be demoted.
// NOTE(review): this is a doxygen-extracted listing; a few hyperlinked
// lines (24147 in the parameter list, 24237/24239, 24486, 24497) are not
// visible here.
24145bool BoUpSLP::collectValuesToDemote(
24146 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
// NOTE(review): parameter line 24147 (presumably ToDemote / Visited) is not
// visible in this extraction.
24148 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
24149 bool &IsProfitableToDemote, bool IsTruncRoot) const {
24150 // We can always demote constants.
24151 if (all_of(E.Scalars, IsaPred<Constant>))
24152 return true;
24153
24154 unsigned OrigBitWidth =
24155 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
// Already at the requested width -- nothing to analyze.
24156 if (OrigBitWidth == BitWidth) {
24157 MaxDepthLevel = 1;
24158 return true;
24159 }
24160
24161 // Check if the node was analyzed already and must keep its original bitwidth.
24162 if (NodesToKeepBWs.contains(E.Idx))
24163 return false;
24164
24165 // If the value is not a vectorized instruction in the expression and not used
24166 // by the insertelement instruction and not used in multiple vector nodes, it
24167 // cannot be demoted.
// True if any non-poison scalar may be negative (sign bit not known zero).
24168 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
24169 if (isa<PoisonValue>(R))
24170 return false;
24171 return !isKnownNonNegative(R, SimplifyQuery(*DL));
24172 });
// Checks whether V can be represented in BitWidth bits, possibly growing
// BitWidth to the minimal safe width derived from known-zero bits, sign
// bits, and demanded bits. Returns true only if the final width still
// leaves at least a 2x shrink versus OrigBitWidth.
24173 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
24174 if (isa<PoisonValue>(V))
24175 return true;
// Values shared by multiple vector nodes cannot be demoted safely here.
24176 if (getTreeEntries(V).size() > 1)
24177 return false;
24178 // For last shuffle of sext/zext with many uses need to check the extra bit
24179 // for unsigned values, otherwise may have incorrect casting for reused
24180 // scalars.
24181 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
24182 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
24183 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
// All truncated-away bits are known zero -> safe at current BitWidth.
24184 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
24185 return true;
24186 }
// Width implied by the number of redundant sign bits (plus one spare bit
// for possibly-negative nodes so sign-extension stays correct).
24187 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
24188 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
24189 if (IsSignedNode)
24190 ++BitWidth1;
24191 if (auto *I = dyn_cast<Instruction>(V)) {
// Width implied by the demanded-bits analysis; for unsigned nodes grow
// it until the bits above (BitWidth2 - 1) are provably zero.
24192 APInt Mask = DB->getDemandedBits(I);
24193 unsigned BitWidth2 =
24194 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
24195 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
24196 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
24197 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
24198 break;
24199 BitWidth2 *= 2;
24200 }
24201 BitWidth1 = std::min(BitWidth1, BitWidth2);
24202 }
24203 BitWidth = std::max(BitWidth, BitWidth1);
24204 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
24205 };
// Run when recursion cannot proceed further: verify all scalars remain
// potentially truncatable at the current BitWidth; gather nodes get extra
// handling (matching vectorized entry, extractelement bases / part count).
24206 auto FinalAnalysis = [&, TTI = TTI]() {
24207 if (!IsProfitableToDemote)
24208 return false;
24209 bool Res = all_of(
24210 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
24211 // Demote gathers.
24212 if (Res && E.isGather()) {
// If a vectorized entry with the same values exists, demote through it.
24213 if (E.hasState()) {
24214 if (const TreeEntry *SameTE =
24215 getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
24216 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
24217 ToDemote, Visited, NodesToKeepBWs,
24218 MaxDepthLevel, IsProfitableToDemote,
24219 IsTruncRoot)) {
24220 ToDemote.push_back(E.Idx);
24221 return true;
24222 }
24223 }
24224 // Check possible extractelement instructions bases and final vector
24225 // length.
24226 SmallPtrSet<Value *, 4> UniqueBases;
24227 for (Value *V : E.Scalars) {
24228 auto *EE = dyn_cast<ExtractElementInst>(V);
24229 if (!EE)
24230 continue;
24231 UniqueBases.insert(EE->getVectorOperand());
24232 }
24233 const unsigned VF = E.Scalars.size();
24234 Type *OrigScalarTy = E.Scalars.front()->getType();
// NOTE(review): lines 24237/24239 of the nested ::getNumberOfParts /
// getWidenedType comparison below are not visible in this extraction.
24235 if (UniqueBases.size() <= 2 ||
24236 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
24238 *TTI,
24240 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
24241 VF))) {
24242 ToDemote.push_back(E.Idx);
24243 return true;
24244 }
24245 }
24246 return Res;
24247 };
// Gathers, already-visited entries, and scalars only feeding
// non-vectorized insertelements go straight to the final analysis.
24248 if (E.isGather() || !Visited.insert(&E).second ||
24249 any_of(E.Scalars, [&](Value *V) {
24250 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
24251 return isa<InsertElementInst>(U) && !isVectorized(U);
24252 });
24253 }))
24254 return FinalAnalysis();
24255
// Bail out if some scalar has an external (non-vectorized, non-ignored,
// wider-typed) user and cannot itself be proven truncatable.
24256 if (any_of(E.Scalars, [&](Value *V) {
24257 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
24258 return isVectorized(U) ||
24259 (E.Idx == 0 && UserIgnoreList &&
24260 UserIgnoreList->contains(U)) ||
24261 (!isa<CmpInst>(U) && U->getType()->isSized() &&
24262 !U->getType()->isScalableTy() &&
24263 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
24264 }) && !IsPotentiallyTruncated(V, BitWidth);
24265 }))
24266 return false;
24267
// Recursively analyze the given operand entries, accumulating the maximum
// depth reached; a failed operand may still be acceptable if the final
// analysis succeeds (NeedToExit then stops further recursion).
24268 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
24269 bool &NeedToExit) {
24270 NeedToExit = false;
24271 unsigned InitLevel = MaxDepthLevel;
24272 for (const TreeEntry *Op : Operands) {
24273 unsigned Level = InitLevel;
24274 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
24275 ToDemote, Visited, NodesToKeepBWs, Level,
24276 IsProfitableToDemote, IsTruncRoot)) {
24277 if (!IsProfitableToDemote)
24278 return false;
24279 NeedToExit = true;
24280 if (!FinalAnalysis())
24281 return false;
24282 continue;
24283 }
24284 MaxDepthLevel = std::max(MaxDepthLevel, Level);
24285 }
24286 return true;
24287 };
// Probe power-of-two widths from the current BitWidth up towards
// OrigBitWidth until Checker accepts one; remembers the first width at
// which FinalAnalysis still succeeds as a fallback (NeedToExit signals the
// fallback was taken and recursion should stop).
24288 auto AttemptCheckBitwidth =
24289 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
24290 // Try all bitwidth < OrigBitWidth.
24291 NeedToExit = false;
24292 unsigned BestFailBitwidth = 0;
24293 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
24294 if (Checker(BitWidth, OrigBitWidth))
24295 return true;
24296 if (BestFailBitwidth == 0 && FinalAnalysis())
24297 BestFailBitwidth = BitWidth;
24298 }
24299 if (BitWidth >= OrigBitWidth) {
24300 if (BestFailBitwidth == 0) {
24301 BitWidth = OrigBitWidth;
24302 return false;
24303 }
24304 MaxDepthLevel = 1;
24305 BitWidth = BestFailBitwidth;
24306 NeedToExit = true;
24307 return true;
24308 }
24309 return false;
24310 };
// Per-opcode driver: with no operand entries, just mark the scalars as
// potentially truncated; otherwise run the optional legality Checker and
// recurse into the operand entries. On success records E.Idx in ToDemote.
24311 auto TryProcessInstruction =
24312 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
24313 function_ref<bool(unsigned, unsigned)> Checker = {}) {
24314 if (Operands.empty()) {
24315 if (!IsTruncRoot)
24316 MaxDepthLevel = 1;
24317 for (Value *V : E.Scalars)
24318 (void)IsPotentiallyTruncated(V, BitWidth);
24319 } else {
24320 // Several vectorized uses? Check if we can truncate it, otherwise -
24321 // exit.
24322 if (any_of(E.Scalars, [&](Value *V) {
24323 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
24324 }))
24325 return false;
24326 bool NeedToExit = false;
24327 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
24328 return false;
24329 if (NeedToExit)
24330 return true;
24331 if (!ProcessOperands(Operands, NeedToExit))
24332 return false;
24333 if (NeedToExit)
24334 return true;
24335 }
24336
24337 ++MaxDepthLevel;
24338 // Record the entry that we can demote.
24339 ToDemote.push_back(E.Idx);
24340 return IsProfitableToDemote;
24341 };
24342
// Split nodes recurse into both combined sub-entries.
24343 if (E.State == TreeEntry::SplitVectorize)
24344 return TryProcessInstruction(
24345 BitWidth,
24346 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
24347 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
24348
24349 if (E.isAltShuffle()) {
24350 // Combining these opcodes may lead to incorrect analysis, skip for now.
24351 auto IsDangerousOpcode = [](unsigned Opcode) {
24352 switch (Opcode) {
24353 case Instruction::Shl:
24354 case Instruction::AShr:
24355 case Instruction::LShr:
24356 case Instruction::UDiv:
24357 case Instruction::SDiv:
24358 case Instruction::URem:
24359 case Instruction::SRem:
24360 return true;
24361 default:
24362 break;
24363 }
24364 return false;
24365 };
24366 if (IsDangerousOpcode(E.getAltOpcode()))
24367 return FinalAnalysis();
24368 }
24369
24370 switch (E.getOpcode()) {
24371
24372 // We can always demote truncations and extensions. Since truncations can
24373 // seed additional demotion, we save the truncated value.
24374 case Instruction::Trunc:
24375 if (IsProfitableToDemoteRoot)
24376 IsProfitableToDemote = true;
24377 return TryProcessInstruction(BitWidth);
24378 case Instruction::ZExt:
24379 case Instruction::SExt:
// Do not demote an extension that feeds a bitcast to floating point --
// the FP bit pattern depends on the full integer width.
24380 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
24381 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
24382 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
24383 return false;
24384 IsProfitableToDemote = true;
24385 return TryProcessInstruction(BitWidth);
24386
24387 // We can demote certain binary operations if we can demote both of their
24388 // operands.
24389 case Instruction::Add:
24390 case Instruction::Sub:
24391 case Instruction::Mul:
24392 case Instruction::And:
24393 case Instruction::Or:
24394 case Instruction::Xor: {
24395 return TryProcessInstruction(
24396 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
24397 }
24398 case Instruction::Freeze:
24399 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
24400 case Instruction::Shl: {
24401 // If we are truncating the result of this SHL, and if it's a shift of an
24402 // inrange amount, we can always perform a SHL in a smaller type.
24403 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
24404 return all_of(E.Scalars, [&](Value *V) {
24405 if (isa<PoisonValue>(V))
24406 return true;
24407 if (E.isCopyableElement(V))
24408 return true;
24409 auto *I = cast<Instruction>(V);
24410 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
24411 return AmtKnownBits.getMaxValue().ult(BitWidth);
24412 });
24413 };
24414 return TryProcessInstruction(
24415 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
24416 }
24417 case Instruction::LShr: {
24418 // If this is a truncate of a logical shr, we can truncate it to a smaller
24419 // lshr iff we know that the bits we would otherwise be shifting in are
24420 // already zeros.
24421 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
24422 return all_of(E.Scalars, [&](Value *V) {
24423 if (isa<PoisonValue>(V))
24424 return true;
24425 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
24426 if (E.isCopyableElement(V))
24427 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
24428 auto *I = cast<Instruction>(V);
24429 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
24430 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
24431 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
24432 SimplifyQuery(*DL));
24433 });
24434 };
24435 return TryProcessInstruction(
24436 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
24437 LShrChecker);
24438 }
24439 case Instruction::AShr: {
24440 // If this is a truncate of an arithmetic shr, we can truncate it to a
24441 // smaller ashr iff we know that all the bits from the sign bit of the
24442 // original type and the sign bit of the truncate type are similar.
24443 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
24444 return all_of(E.Scalars, [&](Value *V) {
24445 if (isa<PoisonValue>(V))
24446 return true;
24447 auto *I = cast<Instruction>(V);
24448 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
24449 unsigned ShiftedBits = OrigBitWidth - BitWidth;
24450 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
24451 ShiftedBits <
24452 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
24453 });
24454 };
24455 return TryProcessInstruction(
24456 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
24457 AShrChecker);
24458 }
24459 case Instruction::UDiv:
24460 case Instruction::URem: {
24461 // UDiv and URem can be truncated if all the truncated bits are zero.
24462 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
24463 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
24464 return all_of(E.Scalars, [&](Value *V) {
24465 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
24466 if (E.hasCopyableElements() && E.isCopyableElement(V))
24467 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
24468 auto *I = cast<Instruction>(V);
24469 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
24470 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
24471 });
24472 };
24473 return TryProcessInstruction(
24474 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
24475 }
24476
24477 // We can demote selects if we can demote their true and false values.
24478 case Instruction::Select: {
24479 return TryProcessInstruction(
24480 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
24481 }
24482
24483 // We can demote phis if we can demote all their incoming operands.
24484 case Instruction::PHI: {
24485 const unsigned NumOps = E.getNumOperands();
// NOTE(review): line 24486 (declaration of Ops) is not visible in this
// extraction.
24487 transform(seq<unsigned>(0, NumOps), Ops.begin(),
24488 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
24489
24490 return TryProcessInstruction(BitWidth, Ops);
24491 }
24492
24493 case Instruction::Call: {
24494 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
24495 if (!IC)
24496 break;
// NOTE(review): line 24497 (initialization of ID, presumably via
// getVectorIntrinsicIDForCall) is not visible in this extraction.
// Only abs/smin/smax/umin/umax are handled below.
24498 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
24499 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
24500 break;
24501 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
24502 function_ref<bool(unsigned, unsigned)> CallChecker;
// Legality check for the two-operand min/max intrinsics: unsigned forms
// need the truncated-away bits of both operands known zero; signed forms
// need enough sign bits (or known-zero high bits) on both operands.
24503 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
24504 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
24505 return all_of(E.Scalars, [&](Value *V) {
24506 auto *I = cast<Instruction>(V);
24507 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
24508 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
24509 return MaskedValueIsZero(I->getOperand(0), Mask,
24510 SimplifyQuery(*DL)) &&
24511 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
24512 }
24513 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
24514 "Expected min/max intrinsics only.");
24515 unsigned SignBits = OrigBitWidth - BitWidth;
24516 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
24517 unsigned Op0SignBits =
24518 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
24519 unsigned Op1SignBits =
24520 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
24521 return SignBits <= Op0SignBits &&
24522 ((SignBits != Op0SignBits &&
24523 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
24524 MaskedValueIsZero(I->getOperand(0), Mask,
24525 SimplifyQuery(*DL))) &&
24526 SignBits <= Op1SignBits &&
24527 ((SignBits != Op1SignBits &&
24528 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
24529 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
24530 });
24531 };
// Legality check for abs: the single operand needs enough sign bits (or
// known-zero high bits) to survive the narrowing.
24532 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
24533 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
24534 return all_of(E.Scalars, [&](Value *V) {
24535 auto *I = cast<Instruction>(V);
24536 unsigned SignBits = OrigBitWidth - BitWidth;
24537 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
24538 unsigned Op0SignBits =
24539 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
24540 return SignBits <= Op0SignBits &&
24541 ((SignBits != Op0SignBits &&
24542 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
24543 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
24544 });
24545 };
24546 if (ID != Intrinsic::abs) {
24547 Operands.push_back(getOperandEntry(&E, 1));
24548 CallChecker = CompChecker;
24549 } else {
24550 CallChecker = AbsChecker;
24551 }
24552 InstructionCost BestCost =
24553 std::numeric_limits<InstructionCost::CostType>::max();
24554 unsigned BestBitWidth = BitWidth;
24555 unsigned VF = E.Scalars.size();
24556 // Choose the best bitwidth based on cost estimations.
// Always returns false so AttemptCheckBitwidth sweeps every candidate
// width; the cheapest one is captured in BestBitWidth.
24557 auto Checker = [&](unsigned BitWidth, unsigned) {
24558 unsigned MinBW = PowerOf2Ceil(BitWidth);
24559 SmallVector<Type *> ArgTys =
24560 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
24561 auto VecCallCosts = getVectorCallCosts(
24562 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
24563 TTI, TLI, ArgTys);
24564 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
24565 if (Cost < BestCost) {
24566 BestCost = Cost;
24567 BestBitWidth = BitWidth;
24568 }
24569 return false;
24570 };
24571 [[maybe_unused]] bool NeedToExit;
24572 (void)AttemptCheckBitwidth(Checker, NeedToExit);
24573 BitWidth = BestBitWidth;
24574 return TryProcessInstruction(BitWidth, Operands, CallChecker);
24575 }
24576
24577 // Otherwise, conservatively give up.
24578 default:
24579 break;
24580 }
24581 MaxDepthLevel = 1;
24582 return FinalAnalysis();
24583}
24584
24585static RecurKind getRdxKind(Value *V);
24586
24588 // We only attempt to truncate integer expressions.
24589 bool IsStoreOrInsertElt =
24590 VectorizableTree.front()->hasState() &&
24591 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
24592 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
24593 if ((IsStoreOrInsertElt || UserIgnoreList) &&
24594 ExtraBitWidthNodes.size() <= 1 &&
24595 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
24596 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
24597 return;
24598
24599 unsigned NodeIdx = 0;
24600 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
24601 NodeIdx = 1;
24602
24603 // Ensure the roots of the vectorizable tree don't form a cycle.
24604 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
24605 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
24606 "Unexpected tree is graph.");
24607
24608 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
24609 // resize to the final type.
24610 bool IsTruncRoot = false;
24611 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
24612 SmallVector<unsigned> RootDemotes;
24613 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
24614 if (NodeIdx != 0 &&
24615 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24616 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24617 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
24618 IsTruncRoot = true;
24619 RootDemotes.push_back(NodeIdx);
24620 IsProfitableToDemoteRoot = true;
24621 ++NodeIdx;
24622 }
24623
24624 // Analyzed the reduction already and not profitable - exit.
24625 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
24626 return;
24627
24628 SmallVector<unsigned> ToDemote;
24629 auto ComputeMaxBitWidth =
24630 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
24631 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
24632 ToDemote.clear();
24633 // Check if the root is trunc and the next node is gather/buildvector, then
24634 // keep trunc in scalars, which is free in most cases.
24635 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
24636 !NodesToKeepBWs.contains(E.Idx) &&
24637 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
24638 all_of(E.Scalars, [&](Value *V) {
24639 return V->hasOneUse() || isa<Constant>(V) ||
24640 (!V->hasNUsesOrMore(UsesLimit) &&
24641 none_of(V->users(), [&](User *U) {
24642 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
24643 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24644 if (TEs.empty() || is_contained(TEs, UserTE))
24645 return false;
24646 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24647 SelectInst>(U) ||
24648 isa<SIToFPInst, UIToFPInst>(U) ||
24649 (UserTE->hasState() &&
24650 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24651 SelectInst>(UserTE->getMainOp()) ||
24652 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
24653 return true;
24654 unsigned UserTESz = DL->getTypeSizeInBits(
24655 UserTE->Scalars.front()->getType());
24656 if (all_of(TEs, [&](const TreeEntry *TE) {
24657 auto It = MinBWs.find(TE);
24658 return It != MinBWs.end() &&
24659 It->second.first > UserTESz;
24660 }))
24661 return true;
24662 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
24663 }));
24664 })) {
24665 ToDemote.push_back(E.Idx);
24666 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24667 auto It = MinBWs.find(UserTE);
24668 if (It != MinBWs.end())
24669 return It->second.first;
24670 unsigned MaxBitWidth =
24671 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
24672 MaxBitWidth = bit_ceil(MaxBitWidth);
24673 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24674 MaxBitWidth = 8;
24675 return MaxBitWidth;
24676 }
24677
24678 if (!E.hasState())
24679 return 0u;
24680
24681 unsigned VF = E.getVectorFactor();
24682 Type *ScalarTy = E.Scalars.front()->getType();
24683 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
24684 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
24685 if (!TreeRootIT)
24686 return 0u;
24687
24688 if (any_of(E.Scalars,
24689 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
24690 return 0u;
24691
24692 unsigned NumParts = ::getNumberOfParts(
24693 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
24694
24695 // The maximum bit width required to represent all the values that can be
24696 // demoted without loss of precision. It would be safe to truncate the roots
24697 // of the expression to this width.
24698 unsigned MaxBitWidth = 1u;
24699
24700 // True if the roots can be zero-extended back to their original type,
24701 // rather than sign-extended. We know that if the leading bits are not
24702 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
24703 // True.
24704 // Determine if the sign bit of all the roots is known to be zero. If not,
24705 // IsKnownPositive is set to False.
24706 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
24707 if (isa<PoisonValue>(R))
24708 return true;
24709 KnownBits Known = computeKnownBits(R, *DL);
24710 return Known.isNonNegative();
24711 });
24712
24713 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
24714 E.UserTreeIndex.UserTE->hasState() &&
24715 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
24716 MaxBitWidth =
24717 std::min(DL->getTypeSizeInBits(
24718 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
24719 DL->getTypeSizeInBits(ScalarTy));
24720
24721 // We first check if all the bits of the roots are demanded. If they're not,
24722 // we can truncate the roots to this narrower type.
24723 for (Value *Root : E.Scalars) {
24724 if (isa<PoisonValue>(Root))
24725 continue;
24726 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
24727 TypeSize NumTypeBits =
24728 DL->getTypeSizeInBits(Root->getType()->getScalarType());
24729 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24730 // If we can't prove that the sign bit is zero, we must add one to the
24731 // maximum bit width to account for the unknown sign bit. This preserves
24732 // the existing sign bit so we can safely sign-extend the root back to the
24733 // original type. Otherwise, if we know the sign bit is zero, we will
24734 // zero-extend the root instead.
24735 //
24736 // FIXME: This is somewhat suboptimal, as there will be cases where adding
24737 // one to the maximum bit width will yield a larger-than-necessary
24738 // type. In general, we need to add an extra bit only if we can't
24739 // prove that the upper bit of the original type is equal to the
24740 // upper bit of the proposed smaller type. If these two bits are
24741 // the same (either zero or one) we know that sign-extending from
24742 // the smaller type will result in the same value. Here, since we
24743 // can't yet prove this, we are just making the proposed smaller
24744 // type larger to ensure correctness.
24745 if (!IsKnownPositive)
24746 ++BitWidth1;
24747
24748 auto *I = dyn_cast<Instruction>(Root);
24749 if (!I) {
24750 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
24751 continue;
24752 }
24753 APInt Mask = DB->getDemandedBits(I);
24754 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24755 MaxBitWidth =
24756 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
24757 }
24758
24759 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24760 MaxBitWidth = 8;
24761
24762 // If the original type is large, but reduced type does not improve the reg
24763 // use - ignore it.
24764 if (NumParts > 1 &&
24765 NumParts ==
24767 *TTI, getWidenedType(IntegerType::get(F->getContext(),
24768 bit_ceil(MaxBitWidth)),
24769 VF)))
24770 return 0u;
24771
24772 unsigned Opcode = E.getOpcode();
24773 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
24774 Opcode == Instruction::SExt ||
24775 Opcode == Instruction::ZExt || NumParts > 1;
24776 // Conservatively determine if we can actually truncate the roots of the
24777 // expression. Collect the values that can be demoted in ToDemote and
24778 // additional roots that require investigating in Roots.
24780 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
24781 bool NeedToDemote = IsProfitableToDemote;
24782
24783 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
24784 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
24785 NeedToDemote, IsTruncRoot) ||
24786 (MaxDepthLevel <= Limit &&
24787 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
24788 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
24789 DL->getTypeSizeInBits(TreeRootIT) /
24790 DL->getTypeSizeInBits(
24791 E.getMainOp()->getOperand(0)->getType()) >
24792 2)))))
24793 return 0u;
24794 // Round MaxBitWidth up to the next power-of-two.
24795 MaxBitWidth = bit_ceil(MaxBitWidth);
24796
24797 return MaxBitWidth;
24798 };
24799
24800 // If we can truncate the root, we must collect additional values that might
24801 // be demoted as a result. That is, those seeded by truncations we will
24802 // modify.
24803 // Add reduction ops sizes, if any.
24804 if (UserIgnoreList &&
24805 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
24806 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
24807 // x i1> to in)).
24808 if (all_of(*UserIgnoreList,
24809 [](Value *V) {
24810 return isa<PoisonValue>(V) ||
24811 cast<Instruction>(V)->getOpcode() == Instruction::Add;
24812 }) &&
24813 VectorizableTree.front()->State == TreeEntry::Vectorize &&
24814 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
24815 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
24816 Builder.getInt1Ty()) {
24817 ReductionBitWidth = 1;
24818 } else {
24819 for (Value *V : *UserIgnoreList) {
24820 if (isa<PoisonValue>(V))
24821 continue;
24822 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
24823 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
24824 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24825 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
24826 ++BitWidth1;
24827 unsigned BitWidth2 = BitWidth1;
24829 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
24830 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24831 }
24832 ReductionBitWidth =
24833 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
24834 }
24835 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
24836 ReductionBitWidth = 8;
24837
24838 ReductionBitWidth = bit_ceil(ReductionBitWidth);
24839 }
24840 }
24841 bool IsTopRoot = NodeIdx == 0;
24842 while (NodeIdx < VectorizableTree.size() &&
24843 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24844 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24845 RootDemotes.push_back(NodeIdx);
24846 ++NodeIdx;
24847 IsTruncRoot = true;
24848 }
24849 bool IsSignedCmp = false;
24850 if (UserIgnoreList &&
24851 all_of(*UserIgnoreList,
24853 m_SMax(m_Value(), m_Value())))))
24854 IsSignedCmp = true;
24855 while (NodeIdx < VectorizableTree.size()) {
24856 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
24857 unsigned Limit = 2;
24858 if (IsTopRoot &&
24859 ReductionBitWidth ==
24860 DL->getTypeSizeInBits(
24861 VectorizableTree.front()->Scalars.front()->getType()))
24862 Limit = 3;
24863 unsigned MaxBitWidth = ComputeMaxBitWidth(
24864 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
24865 IsTruncRoot, IsSignedCmp);
24866 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
24867 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
24868 ReductionBitWidth = bit_ceil(MaxBitWidth);
24869 else if (MaxBitWidth == 0)
24870 ReductionBitWidth = 0;
24871 }
24872
24873 for (unsigned Idx : RootDemotes) {
24874 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
24875 uint32_t OrigBitWidth =
24876 DL->getTypeSizeInBits(V->getType()->getScalarType());
24877 if (OrigBitWidth > MaxBitWidth) {
24878 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
24879 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
24880 }
24881 return false;
24882 }))
24883 ToDemote.push_back(Idx);
24884 }
24885 RootDemotes.clear();
24886 IsTopRoot = false;
24887 IsProfitableToDemoteRoot = true;
24888
24889 if (ExtraBitWidthNodes.empty()) {
24890 NodeIdx = VectorizableTree.size();
24891 } else {
24892 unsigned NewIdx = 0;
24893 do {
24894 NewIdx = *ExtraBitWidthNodes.begin();
24895 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
24896 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
24897 NodeIdx = NewIdx;
24898 IsTruncRoot =
24899 NodeIdx < VectorizableTree.size() &&
24900 VectorizableTree[NodeIdx]->UserTreeIndex &&
24901 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
24902 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24903 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24904 Instruction::Trunc &&
24905 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
24906 IsSignedCmp =
24907 NodeIdx < VectorizableTree.size() &&
24908 VectorizableTree[NodeIdx]->UserTreeIndex &&
24909 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24910 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24911 Instruction::ICmp &&
24912 any_of(
24913 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
24914 [&](Value *V) {
24915 auto *IC = dyn_cast<ICmpInst>(V);
24916 return IC && (IC->isSigned() ||
24917 !isKnownNonNegative(IC->getOperand(0),
24918 SimplifyQuery(*DL)) ||
24919 !isKnownNonNegative(IC->getOperand(1),
24920 SimplifyQuery(*DL)));
24921 });
24922 }
24923
24924 // If the maximum bit width we compute is less than the width of the roots'
24925 // type, we can proceed with the narrowing. Otherwise, do nothing.
24926 if (MaxBitWidth == 0 ||
24927 MaxBitWidth >=
24928 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
24929 ->getBitWidth()) {
24930 if (UserIgnoreList)
24931 AnalyzedMinBWVals.insert_range(TreeRoot);
24932 NodesToKeepBWs.insert_range(ToDemote);
24933 continue;
24934 }
24935
24936 // Finally, map the values we can demote to the maximum bit with we
24937 // computed.
24938 for (unsigned Idx : ToDemote) {
24939 TreeEntry *TE = VectorizableTree[Idx].get();
24940 if (MinBWs.contains(TE))
24941 continue;
24942 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
24943 if (isa<PoisonValue>(R))
24944 return false;
24945 return !isKnownNonNegative(R, SimplifyQuery(*DL));
24946 });
24947 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
24948 }
24949 }
24950}
24951
24953 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
24954 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
24956 auto *AA = &AM.getResult<AAManager>(F);
24957 auto *LI = &AM.getResult<LoopAnalysis>(F);
24958 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
24959 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
24960 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
24962
24963 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
24964 if (!Changed)
24965 return PreservedAnalyses::all();
24966
24969 return PA;
24970}
24971
24973 TargetTransformInfo *TTI_,
24974 TargetLibraryInfo *TLI_, AAResults *AA_,
24975 LoopInfo *LI_, DominatorTree *DT_,
24976 AssumptionCache *AC_, DemandedBits *DB_,
24979 return false;
24980 SE = SE_;
24981 TTI = TTI_;
24982 TLI = TLI_;
24983 AA = AA_;
24984 LI = LI_;
24985 DT = DT_;
24986 AC = AC_;
24987 DB = DB_;
24988 DL = &F.getDataLayout();
24989
24990 Stores.clear();
24991 GEPs.clear();
24992 bool Changed = false;
24993
24994 // If the target claims to have no vector registers don't attempt
24995 // vectorization.
24996 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
24997 LLVM_DEBUG(
24998 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
24999 return false;
25000 }
25001
25002 // Don't vectorize when the attribute NoImplicitFloat is used.
25003 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
25004 return false;
25005
25006 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
25007
25008 // Use the bottom up slp vectorizer to construct chains that start with
25009 // store instructions.
25010 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
25011
25012 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
25013 // delete instructions.
25014
25015 // Update DFS numbers now so that we can use them for ordering.
25016 DT->updateDFSNumbers();
25017
25018 // Scan the blocks in the function in post order.
25019 for (auto *BB : post_order(&F.getEntryBlock())) {
25021 continue;
25022
25023 // Start new block - clear the list of reduction roots.
25024 R.clearReductionData();
25025 collectSeedInstructions(BB);
25026
25027 // Vectorize trees that end at stores.
25028 if (!Stores.empty()) {
25029 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
25030 << " underlying objects.\n");
25031 Changed |= vectorizeStoreChains(R);
25032 }
25033
25034 // Vectorize trees that end at reductions.
25035 Changed |= vectorizeChainsInBlock(BB, R);
25036
25037 // Vectorize the index computations of getelementptr instructions. This
25038 // is primarily intended to catch gather-like idioms ending at
25039 // non-consecutive loads.
25040 if (!GEPs.empty()) {
25041 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
25042 << " underlying objects.\n");
25043 Changed |= vectorizeGEPIndices(BB, R);
25044 }
25045 }
25046
25047 if (Changed) {
25048 R.optimizeGatherSequence();
25049 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
25050 }
25051 return Changed;
25052}
25053
25054std::optional<bool>
25055SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
25056 unsigned Idx, unsigned MinVF,
25057 unsigned &Size) {
25058 Size = 0;
25059 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
25060 << "\n");
25061 const unsigned Sz = R.getVectorElementSize(Chain[0]);
25062 unsigned VF = Chain.size();
25063
25064 if (!has_single_bit(Sz) ||
25066 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
25067 VF) ||
25068 VF < 2 || VF < MinVF) {
25069 // Check if vectorizing with a non-power-of-2 VF should be considered. At
25070 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
25071 // all vector lanes are used.
25072 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
25073 return false;
25074 }
25075
25076 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
25077 << "\n");
25078
25079 SetVector<Value *> ValOps;
25080 for (Value *V : Chain)
25081 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
25082 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
25083 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
25084 InstructionsState S =
25085 Analysis.buildInstructionsState(ValOps.getArrayRef(), R);
25086 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
25087 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
25088 bool IsAllowedSize =
25089 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
25090 ValOps.size()) ||
25091 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
25092 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
25093 (!S.getMainOp()->isSafeToRemove() ||
25094 any_of(ValOps.getArrayRef(),
25095 [&](Value *V) {
25096 return !isa<ExtractElementInst>(V) &&
25097 (V->getNumUses() > Chain.size() ||
25098 any_of(V->users(), [&](User *U) {
25099 return !Stores.contains(U);
25100 }));
25101 }))) ||
25102 (ValOps.size() > Chain.size() / 2 && !S)) {
25103 Size = (!IsAllowedSize && S) ? 1 : 2;
25104 return false;
25105 }
25106 }
25107 R.buildTree(Chain);
25108 // Check if tree tiny and store itself or its value is not vectorized.
25109 if (R.isTreeTinyAndNotFullyVectorizable()) {
25110 if (R.isGathered(Chain.front()) ||
25111 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
25112 return std::nullopt;
25113 Size = R.getCanonicalGraphSize();
25114 return false;
25115 }
25116 if (R.isProfitableToReorder()) {
25117 R.reorderTopToBottom();
25118 R.reorderBottomToTop();
25119 }
25120 R.transformNodes();
25121 R.computeMinimumValueSizes();
25122
25123 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
25124 R.buildExternalUses();
25125
25126 Size = R.getCanonicalGraphSize();
25127 if (S && S.getOpcode() == Instruction::Load)
25128 Size = 2; // cut off masked gather small trees
25129 InstructionCost Cost = R.getTreeCost(TreeCost);
25130
25131 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
25132 if (Cost < -SLPCostThreshold) {
25133 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
25134
25135 using namespace ore;
25136
25137 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
25138 cast<StoreInst>(Chain[0]))
25139 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
25140 << " and with tree size "
25141 << NV("TreeSize", R.getTreeSize()));
25142
25143 R.vectorizeTree();
25144 return true;
25145 }
25146
25147 return false;
25148}
25149
25150/// Checks if the quadratic mean deviation is less than 90% of the mean size.
25151static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
25152 unsigned Num = 0;
25153 uint64_t Sum = std::accumulate(
25154 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
25155 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
25156 unsigned Size = Val.first;
25157 if (Size == 1)
25158 return V;
25159 ++Num;
25160 return V + Size;
25161 });
25162 if (Num == 0)
25163 return true;
25164 uint64_t Mean = Sum / Num;
25165 if (Mean == 0)
25166 return true;
25167 uint64_t Dev = std::accumulate(
25168 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
25169 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
25170 unsigned P = Val.first;
25171 if (P == 1)
25172 return V;
25173 return V + (P - Mean) * (P - Mean);
25174 }) /
25175 Num;
25176 return Dev * 96 / (Mean * Mean) == 0;
25177}
25178
namespace {

/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address operand to the
/// address of this group's BaseInstr.
class RelatedStoreInsts {
public:
  /// \p AllStores is the full list of stores in the block being analyzed;
  /// \p BaseInstrIdx indexes into it and becomes this group's reference store
  /// (the one at pointer distance 0).
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  /// Restarts the group around \p NewBaseInstr: drops all collected stores
  /// and re-seeds the map with the new base at distance 0.
  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist.
  /// Does nothing if there is already a store with that \p PtrDist.
  /// \returns The previously associated Instruction index, or std::nullopt
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  /// Ordered map from pointer distance (relative to the base store) to the
  /// index of the store in AllStores. std::map keeps the stores sorted by
  /// distance, which the chain-forming logic relies on.
  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  /// \p DistFromCurBase is the distance of the new base from the current one,
  /// so each surviving distance is shifted by that amount.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try and vectorize them
    // again. Their distance will be "rebased" to use NewBaseInstIdx as
    // reference.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all stores that have been vectorized from this group.
  void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
    // Scan from the largest distance backwards to locate the vectorized store
    // with the greatest pointer distance.
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and erase
    // all stores before it so we don't try to vectorize them again.
    // (reverse_iterator::base() yields the forward iterator one past the
    // element the reverse iterator refers to; if nothing was vectorized the
    // reverse find returns rend(), whose base() is begin(), erasing nothing.)
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};

} // end anonymous namespace
25263
25264bool SLPVectorizerPass::vectorizeStores(
25265 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
25266 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
25267 &Visited) {
25268 // We may run into multiple chains that merge into a single chain. We mark the
25269 // stores that we vectorized so that we don't visit the same store twice.
25270 BoUpSLP::ValueSet VectorizedStores;
25271 bool Changed = false;
25272
25273 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
25274 int64_t PrevDist = -1;
25275 BoUpSLP::ValueList Operands;
25276 // Collect the chain into a list.
25277 for (auto [Idx, Data] : enumerate(StoreSeq)) {
25278 auto &[Dist, InstIdx] = Data;
25279 if (Operands.empty() || Dist - PrevDist == 1) {
25280 Operands.push_back(Stores[InstIdx]);
25281 PrevDist = Dist;
25282 if (Idx != StoreSeq.size() - 1)
25283 continue;
25284 }
25285 llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
25286 Operands.clear();
25287 Operands.push_back(Stores[InstIdx]);
25288 PrevDist = Dist;
25289 });
25290
25291 if (Operands.size() <= 1 ||
25292 !Visited
25293 .insert({Operands.front(),
25294 cast<StoreInst>(Operands.front())->getValueOperand(),
25295 Operands.back(),
25296 cast<StoreInst>(Operands.back())->getValueOperand(),
25297 Operands.size()})
25298 .second)
25299 continue;
25300
25301 unsigned MaxVecRegSize = R.getMaxVecRegSize();
25302 unsigned EltSize = R.getVectorElementSize(Operands[0]);
25303 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
25304
25305 unsigned MaxVF =
25306 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
25307 auto *Store = cast<StoreInst>(Operands[0]);
25308 Type *StoreTy = Store->getValueOperand()->getType();
25309 Type *ValueTy = StoreTy;
25310 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
25311 ValueTy = Trunc->getSrcTy();
25312 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
25313 // getStoreMinimumVF only support scalar type as arguments. As a result,
25314 // we need to use the element type of StoreTy and ValueTy to retrieve the
25315 // VF and then transform it back.
25316 // Remember: VF is defined as the number we want to vectorize, not the
25317 // number of elements in the final vector.
25318 Type *StoreScalarTy = StoreTy->getScalarType();
25319 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
25320 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
25321 ValueTy->getScalarType()));
25322 MinVF /= getNumElements(StoreTy);
25323 MinVF = std::max<unsigned>(2, MinVF);
25324
25325 if (MaxVF < MinVF) {
25326 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
25327 << ") < "
25328 << "MinVF (" << MinVF << ")\n");
25329 continue;
25330 }
25331
25332 unsigned NonPowerOf2VF = 0;
25334 // First try vectorizing with a non-power-of-2 VF. At the moment, only
25335 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
25336 // lanes are used.
25337 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
25338 if (has_single_bit(CandVF + 1)) {
25339 NonPowerOf2VF = CandVF;
25340 assert(NonPowerOf2VF != MaxVF &&
25341 "Non-power-of-2 VF should not be equal to MaxVF");
25342 }
25343 }
25344
25345 // MaxRegVF represents the number of instructions (scalar, or vector in
25346 // case of revec) that can be vectorized to naturally fit in a vector
25347 // register.
25348 unsigned MaxRegVF = MaxVF;
25349
25350 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
25351 if (MaxVF < MinVF) {
25352 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
25353 << ") < "
25354 << "MinVF (" << MinVF << ")\n");
25355 continue;
25356 }
25357
25358 SmallVector<unsigned> CandidateVFs;
25359 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
25360 VF = divideCeil(VF, 2))
25361 CandidateVFs.push_back(VF);
25362
25363 unsigned End = Operands.size();
25364 unsigned Repeat = 0;
25365 constexpr unsigned MaxAttempts = 4;
25366 // first: the best TreeSize from all prior loops over CandidateVFs, gets
25367 // updated after looping through CandidateVFs
25368 // second: the best TreeSize from all prior loops including the current
25369 // one
25371 Operands.size(), {1, 1});
25372 // The `slice` and `drop_front` interfaces are convenient
25373 const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
25374 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
25375 auto IsNotVectorized = [](const std::pair<unsigned, unsigned> &P) {
25376 return P.first > 0;
25377 };
25378 auto IsVectorized = [](const std::pair<unsigned, unsigned> &P) {
25379 return P.first == 0;
25380 };
25381 auto VFIsProfitable = [](unsigned Size,
25382 const std::pair<unsigned, unsigned> &P) {
25383 return Size >= P.first;
25384 };
25385 auto FirstSizeSame = [](unsigned Size,
25386 const std::pair<unsigned, unsigned> &P) {
25387 return Size == P.first;
25388 };
25389 while (true) {
25390 ++Repeat;
25391 bool RepeatChanged = false;
25392 bool AnyProfitableGraph = false;
25393 for (unsigned VF : CandidateVFs) {
25394 AnyProfitableGraph = false;
25395 unsigned FirstUnvecStore = std::distance(
25396 RangeSizes.begin(), find_if(RangeSizes, IsNotVectorized));
25397
25398 // Form slices of size VF starting from FirstUnvecStore and try to
25399 // vectorize them.
25400 while (FirstUnvecStore < End) {
25401 unsigned FirstVecStore = std::distance(
25402 RangeSizes.begin(),
25403 find_if(RangeSizes.drop_front(FirstUnvecStore), IsVectorized));
25404 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
25405 for (unsigned SliceStartIdx = FirstUnvecStore;
25406 SliceStartIdx + VF <= MaxSliceEnd;) {
25407 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF))) {
25408 ++SliceStartIdx;
25409 continue;
25410 }
25411 ArrayRef<Value *> Slice =
25412 ArrayRef(Operands).slice(SliceStartIdx, VF);
25413 assert(all_of(Slice,
25414 [&](Value *V) {
25415 return cast<StoreInst>(V)
25416 ->getValueOperand()
25417 ->getType() ==
25418 cast<StoreInst>(Slice.front())
25419 ->getValueOperand()
25420 ->getType();
25421 }) &&
25422 "Expected all operands of same type.");
25423 if (!NonSchedulable.empty()) {
25424 auto [NonSchedSizeMax, NonSchedSizeMin] =
25425 NonSchedulable.lookup(Slice.front());
25426 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
25427 // VF is too ambitious. Try to vectorize another slice before
25428 // trying a smaller VF.
25429 SliceStartIdx += NonSchedSizeMax;
25430 continue;
25431 }
25432 }
25433 unsigned TreeSize;
25434 std::optional<bool> Res =
25435 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
25436 if (!Res) {
25437 // Update the range of non schedulable VFs for slices starting
25438 // at SliceStartIdx.
25439 NonSchedulable
25440 .try_emplace(Slice.front(), std::make_pair(VF, VF))
25441 .first->getSecond()
25442 .second = VF;
25443 } else if (*Res) {
25444 // Mark the vectorized stores so that we don't vectorize them
25445 // again.
25446 VectorizedStores.insert_range(Slice);
25447 AnyProfitableGraph = RepeatChanged = Changed = true;
25448 // If we vectorized initial block, no need to try to vectorize
25449 // it again.
25450 for (std::pair<unsigned, unsigned> &P :
25451 RangeSizes.slice(SliceStartIdx, VF))
25452 P.first = P.second = 0;
25453 if (SliceStartIdx < FirstUnvecStore + MinVF) {
25454 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
25455 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
25456 P.first = P.second = 0;
25457 FirstUnvecStore = SliceStartIdx + VF;
25458 }
25459 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
25460 for (std::pair<unsigned, unsigned> &P :
25461 RangeSizes.slice(SliceStartIdx + VF,
25462 MaxSliceEnd - (SliceStartIdx + VF)))
25463 P.first = P.second = 0;
25464 if (MaxSliceEnd == End)
25465 End = SliceStartIdx;
25466 MaxSliceEnd = SliceStartIdx;
25467 }
25468 SliceStartIdx += VF;
25469 continue;
25470 }
25471 if (VF > 2 && Res &&
25472 !all_of(RangeSizes.slice(SliceStartIdx, VF),
25473 std::bind(VFIsProfitable, TreeSize, _1))) {
25474 SliceStartIdx += VF;
25475 continue;
25476 }
25477 // Check for the very big VFs that we're not rebuilding same
25478 // trees, just with larger number of elements.
25479 if (VF > MaxRegVF && TreeSize > 1 &&
25480 all_of(RangeSizes.slice(SliceStartIdx, VF),
25481 std::bind(FirstSizeSame, TreeSize, _1))) {
25482 SliceStartIdx += VF;
25483 while (SliceStartIdx != MaxSliceEnd &&
25484 RangeSizes[SliceStartIdx].first == TreeSize)
25485 ++SliceStartIdx;
25486 continue;
25487 }
25488 if (TreeSize > 1)
25489 for (std::pair<unsigned, unsigned> &P :
25490 RangeSizes.slice(SliceStartIdx, VF))
25491 P.second = std::max(P.second, TreeSize);
25492 ++SliceStartIdx;
25493 AnyProfitableGraph = true;
25494 }
25495 if (FirstUnvecStore >= End)
25496 break;
25497 if (MaxSliceEnd - FirstUnvecStore < VF &&
25498 MaxSliceEnd - FirstUnvecStore >= MinVF)
25499 AnyProfitableGraph = true;
25500 FirstUnvecStore = std::distance(
25501 RangeSizes.begin(),
25502 find_if(RangeSizes.drop_front(MaxSliceEnd), IsNotVectorized));
25503 }
25504 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
25505 break;
25506 // For the MaxRegVF case, save RangeSizes to limit compile time
25507 if (VF == MaxRegVF)
25508 for (std::pair<unsigned, unsigned> &P : RangeSizes)
25509 if (P.first != 0)
25510 P.first = std::max(P.second, P.first);
25511 }
25512 // All values vectorized - exit.
25513 if (all_of(RangeSizes, IsVectorized))
25514 break;
25515 // Check if tried all attempts or no need for the last attempts at all.
25516 if (Repeat >= MaxAttempts ||
25517 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
25518 break;
25519 constexpr unsigned StoresLimit = 64;
25520 const unsigned MaxTotalNum = std::min<unsigned>(
25521 Operands.size(),
25522 static_cast<unsigned>(
25523 End -
25524 std::distance(RangeSizes.begin(),
25525 find_if(RangeSizes, IsNotVectorized)) +
25526 1));
25527 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
25528 if (VF > MaxTotalNum || VF >= StoresLimit)
25529 break;
25530 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
25531 if (P.first != 0)
25532 P.first = std::max(P.second, P.first);
25533 }
25534 // Attempt again to vectorize even larger chains if all previous
25535 // attempts were unsuccessful because of the cost issues.
25536 CandidateVFs.clear();
25537 unsigned Limit =
25538 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
25539 if (bit_floor(Limit) == VF && Limit != VF)
25540 CandidateVFs.push_back(Limit);
25541 CandidateVFs.push_back(VF);
25542 }
25543 }
25544 };
25545
25546 /// Groups of stores to vectorize
25547 SmallVector<RelatedStoreInsts> SortedStores;
25548
25549 // Inserts the specified store SI with the given index Idx to the set of the
25550 // stores. If the store with the same distance is found already - stop
25551 // insertion, try to vectorize already found stores. If some stores from this
25552 // sequence were not vectorized - try to vectorize them with the new store
25553 // later. But this logic is applied only to the stores, that come before the
25554 // previous store with the same distance.
25555 // Example:
25556 // 1. store x, %p
25557 // 2. store y, %p+1
25558 // 3. store z, %p+2
25559 // 4. store a, %p
25560 // 5. store b, %p+3
25561 // - Scan this from the last to first store. The very first bunch of stores is
25562 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
25563 // vector).
25564 // - The next store in the list - #1 - has the same distance from store #5 as
25565 // the store #4.
25566 // - Try to vectorize sequence of stores 4,2,3,5.
25567 // - If all these stores are vectorized - just drop them.
25568 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
25569 // - Start new stores sequence.
25570 // The new bunch of stores is {1, {1, 0}}.
25571 // - Add the stores from previous sequence, that were not vectorized.
25572 // Here we consider the stores in the reversed order, rather they are used in
25573 // the IR (Stores are reversed already, see vectorizeStoreChains() function).
25574 // Store #3 can be added -> comes after store #4 with the same distance as
25575 // store #1.
25576 // Store #5 cannot be added - comes before store #4.
25577 // This logic allows to improve the compile time, we assume that the stores
25578 // after previous store with the same distance most likely have memory
25579 // dependencies and no need to waste compile time to try to vectorize them.
25580 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
25581 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
25582 std::optional<int64_t> PtrDist;
25583 auto *RelatedStores = find_if(
25584 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
25585 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
25586 return PtrDist.has_value();
25587 });
25588
25589 // We did not find a comparable store, start a new group.
25590 if (RelatedStores == SortedStores.end()) {
25591 SortedStores.emplace_back(Idx, Stores);
25592 return;
25593 }
25594
25595 // If there is already a store in the group with the same PtrDiff, try to
25596 // vectorize the existing instructions before adding the current store.
25597 // Otherwise, insert this store and keep collecting.
25598 if (std::optional<unsigned> PrevInst =
25599 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
25600 TryToVectorize(RelatedStores->getStores());
25601 RelatedStores->clearVectorizedStores(VectorizedStores);
25602 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
25603 /*NewBaseInstIdx=*/Idx,
25604 /*DistFromCurBase=*/*PtrDist);
25605 }
25606 };
25607 Type *PrevValTy = nullptr;
25608 for (auto [I, SI] : enumerate(Stores)) {
25609 if (R.isDeleted(SI))
25610 continue;
25611 if (!PrevValTy)
25612 PrevValTy = SI->getValueOperand()->getType();
25613 // Check that we do not try to vectorize stores of different types.
25614 if (PrevValTy != SI->getValueOperand()->getType()) {
25615 for (RelatedStoreInsts &StoreSeq : SortedStores)
25616 TryToVectorize(StoreSeq.getStores());
25617 SortedStores.clear();
25618 PrevValTy = SI->getValueOperand()->getType();
25619 }
25620 FillStoresSet(I, SI);
25621 }
25622
25623 // Final vectorization attempt.
25624 for (RelatedStoreInsts &StoreSeq : SortedStores)
25625 TryToVectorize(StoreSeq.getStores());
25626
25627 return Changed;
25628}
25629
25630void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
25631 // Initialize the collections. We will make a single pass over the block.
25632 Stores.clear();
25633 GEPs.clear();
25634
25635 // Visit the store and getelementptr instructions in BB and organize them in
25636 // Stores and GEPs according to the underlying objects of their pointer
25637 // operands.
25638 for (Instruction &I : *BB) {
25639 // Ignore store instructions that are volatile or have a pointer operand
25640 // that doesn't point to a scalar type.
25641 if (auto *SI = dyn_cast<StoreInst>(&I)) {
25642 if (!SI->isSimple())
25643 continue;
25644 if (!isValidElementType(SI->getValueOperand()->getType()))
25645 continue;
25646 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
25647 }
25648
25649 // Ignore getelementptr instructions that have more than one index, a
25650 // constant index, or a pointer operand that doesn't point to a scalar
25651 // type.
25652 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
25653 if (GEP->getNumIndices() != 1)
25654 continue;
25655 Value *Idx = GEP->idx_begin()->get();
25656 if (isa<Constant>(Idx))
25657 continue;
25658 if (!isValidElementType(Idx->getType()))
25659 continue;
25660 if (GEP->getType()->isVectorTy())
25661 continue;
25662 GEPs[GEP->getPointerOperand()].push_back(GEP);
25663 }
25664 }
25665}
25666
// Attempts to SLP-vectorize the scalar list \p VL, trying vectorization
// factors from MaxVF down to MinVF. Returns true if any bundle was actually
// vectorized. \p MaxVFOnly restricts the attempt to the maximum VF only.
// NOTE(review): this chunk is a doxygen scrape; physical lines 25686 and 25781
// are missing below — code is kept verbatim, verify against upstream.
25667bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
25668                                           bool MaxVFOnly) {
  // A "list" of fewer than two scalars cannot form a vector.
25669  if (VL.size() < 2)
25670    return false;
25671
25672  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
25673                    << VL.size() << ".\n");
25674
25675  // Check that all of the parts are instructions of the same type,
25676  // we permit an alternate opcode via InstructionsState.
25677  InstructionsState S = getSameOpcode(VL, *TLI);
25678  if (!S)
25679    return false;
25680
25681  Instruction *I0 = S.getMainOp();
25682  // Make sure invalid types (including vector type) are rejected before
25683  // determining vectorization factor for scalar instructions.
25684  for (Value *V : VL) {
25685    Type *Ty = V->getType();
    // NOTE(review): original line 25686 (presumably the type-validity `if`
    // guarding this remark emission) is missing from this scrape.
25687      // NOTE: the following will give user internal llvm type name, which may
25688      // not be useful.
25689      R.getORE()->emit([&]() {
25690        std::string TypeStr;
25691        llvm::raw_string_ostream OS(TypeStr);
25692        Ty->print(OS);
25693        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
25694               << "Cannot SLP vectorize list: type "
25695               << TypeStr + " is unsupported by vectorizer";
25696      });
25697      return false;
25698    }
25699  }
25700
  // Derive the VF search range [MinVF, MaxVF] from the element size and the
  // target's limits.
25701  Type *ScalarTy = getValueType(VL[0]);
25702  unsigned Sz = R.getVectorElementSize(I0);
25703  unsigned MinVF = R.getMinVF(Sz);
25704  unsigned MaxVF = std::max<unsigned>(
25705      getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
25706  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
25707  if (MaxVF < 2) {
25708    R.getORE()->emit([&]() {
25709      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
25710             << "Cannot SLP vectorize list: vectorization factor "
25711             << "less than 2 is not supported";
25712    });
25713    return false;
25714  }
25715
25716  bool Changed = false;
25717  bool CandidateFound = false;
25718  InstructionCost MinCost = SLPCostThreshold.getValue();
25719
  // Outer loop: decreasing VF; inner loop: sliding start position in VL.
25720  unsigned NextInst = 0, MaxInst = VL.size();
25721  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
25722       VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
25723    // No actual vectorization should happen, if number of parts is the same as
25724    // provided vectorization factor (i.e. the scalar type is used for vector
25725    // code during codegen).
25726    auto *VecTy = getWidenedType(ScalarTy, VF);
25727    if (TTI->getNumberOfParts(VecTy) == VF)
25728      continue;
25729    for (unsigned I = NextInst; I < MaxInst; ++I) {
25730      unsigned ActualVF = std::min(MaxInst - I, VF);
25731
25732      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
25733        continue;
25734
25735      if (MaxVFOnly && ActualVF < MaxVF)
25736        break;
25737      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
25738        break;
25739
      // Collect the next ActualVF not-yet-deleted scalars starting at I.
25740      SmallVector<Value *> Ops(ActualVF, nullptr);
25741      unsigned Idx = 0;
25742      for (Value *V : VL.drop_front(I)) {
25743        // Check that a previous iteration of this loop did not delete the
25744        // Value.
25745        if (auto *Inst = dyn_cast<Instruction>(V);
25746            !Inst || !R.isDeleted(Inst)) {
25747          Ops[Idx] = V;
25748          ++Idx;
25749          if (Idx == ActualVF)
25750            break;
25751        }
25752      }
25753      // Not enough vectorizable instructions - exit.
25754      if (Idx != ActualVF)
25755        break;
25756
25757      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
25758                        << "\n");
25759
      // Build the vectorizable tree and cost it.
25760      R.buildTree(Ops);
25761      if (R.isTreeTinyAndNotFullyVectorizable())
25762        continue;
25763      if (R.isProfitableToReorder()) {
25764        R.reorderTopToBottom();
25765        R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
25766      }
25767      R.transformNodes();
25768      R.computeMinimumValueSizes();
25769      InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
25770      R.buildExternalUses();
25771
25772      InstructionCost Cost = R.getTreeCost(TreeCost);
25773      CandidateFound = true;
25774      MinCost = std::min(MinCost, Cost);
25775
25776      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
25777                        << " for VF=" << ActualVF << "\n");
      // Negative cost (below -SLPCostThreshold) means vectorizing is a win.
25778      if (Cost < -SLPCostThreshold) {
25779        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        // NOTE(review): original line 25781 (the remark's debug-loc/instruction
        // argument closing the OptimizationRemark ctor) is missing here.
25780        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
25782                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
25783                         << " and with tree size "
25784                         << ore::NV("TreeSize", R.getTreeSize()));
25785
25786        R.vectorizeTree();
25787        // Move to the next bundle.
25788        I += VF - 1;
25789        NextInst = I + 1;
25790        Changed = true;
25791      }
25792    }
25793  }
25794
  // Emit a remark explaining why nothing was vectorized, if applicable.
25795  if (!Changed && CandidateFound) {
25796    R.getORE()->emit([&]() {
25797      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
25798             << "List vectorization was possible but not beneficial with cost "
25799             << ore::NV("Cost", MinCost) << " >= "
25800             << ore::NV("Treshold", -SLPCostThreshold);
25801    });
25802  } else if (!Changed) {
25803    R.getORE()->emit([&]() {
25804      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
25805             << "Cannot SLP vectorize list: vectorization was impossible"
25806             << " with available vectorization factors";
25807    });
25808  }
25809  return Changed;
25810}
25811
25812namespace {
25813
25814/// Model horizontal reductions.
25815///
25816/// A horizontal reduction is a tree of reduction instructions that has values
25817/// that can be put into a vector as its leaves. For example:
25818///
25819/// mul mul mul mul
25820/// \ / \ /
25821/// + +
25822/// \ /
25823/// +
25824/// This tree has "mul" as its leaf values and "+" as its reduction
25825/// instructions. A reduction can feed into a store or a binary operation
25826/// feeding a phi.
25827/// ...
25828/// \ /
25829/// +
25830/// |
25831/// phi +=
25832///
25833/// Or:
25834/// ...
25835/// \ /
25836/// +
25837/// |
25838/// *p =
25839///
// Models a horizontal reduction: a tree of identical reduction operations
// whose leaves can be packed into a vector.
// NOTE(review): doxygen scrape — declaration lines 25845 and 25858 are
// physically missing below (the `ReducedVals` container declaration and the
// element type of `VectorValuesAndScales`, respectively) — verify upstream.
25840class HorizontalReduction {
25841  using ReductionOpsType = SmallVector<Value *, 16>;
25842  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  // Per-kind lists of reduction operations (cmps and selects are kept in
  // separate lists for cmp+select min/max — see initReductionOps below).
25843  ReductionOpsListType ReductionOps;
25844  /// List of possibly reduced values.
25846  /// Maps reduced value to the corresponding reduction operation.
25847  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  // Root of the matched reduction tree; tracked weakly so deletion is visible.
25848  WeakTrackingVH ReductionRoot;
25849  /// The type of reduction operation.
25850  RecurKind RdxKind;
25851  /// Checks if the optimization of original scalar identity operations on
25852  /// matched horizontal reductions is enabled and allowed.
25853  bool IsSupportedHorRdxIdentityOp = false;
25854  /// The minimum number of the reduced values.
25855  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
25856  /// Contains vector values for reduction including their scale factor and
25857  /// signedness. The last bool is true, if the value was reduced in-tree.
25859      VectorValuesAndScales;
25860
  // True if \p I is a cmp+select min/max pattern (select fed by a compare).
  // NOTE(review): line 25863 — the second conjunct of this && — is missing
  // from this scrape (presumably a min/max recurrence-kind check); verify
  // against upstream before relying on the predicate's exact condition.
25861  static bool isCmpSelMinMax(Instruction *I) {
25862    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
25864  }
25865
25866 // And/or are potentially poison-safe logical patterns like:
25867 // select x, y, false
25868 // select x, true, y
25869 static bool isBoolLogicOp(Instruction *I) {
25870 return isa<SelectInst>(I) &&
25871 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
25872 }
25873
25874  /// Checks if instruction is associative and can be vectorized.
  // Unordered: may be reduced in any order; Ordered: sequence must be kept;
  // None: not a vectorizable reduction at all.
25875  enum class ReductionOrdering { Unordered, Ordered, None };
25876  ReductionOrdering RK = ReductionOrdering::None;
  // Classifies whether a reduction of kind \p Kind rooted at \p I can be
  // vectorized, and with which ordering constraint.
  // NOTE(review): line 25883 (first half of the condition ending in
  // `isBoolLogicOp(I)` — presumably an integer min/max/intrinsic kind check)
  // is missing from this scrape; verify upstream.
25877  static ReductionOrdering isVectorizable(RecurKind Kind, Instruction *I,
25878                                          bool TwoElementReduction = false) {
25879    if (Kind == RecurKind::None)
25880      return ReductionOrdering::None;
25881
25882    // Integer ops that map to select instructions or intrinsics are fine.
25884        isBoolLogicOp(I))
25885      return ReductionOrdering::Unordered;
25886
25887    // No need to check for associativity, if 2 reduced values.
25888    if (TwoElementReduction)
25889      return ReductionOrdering::Unordered;
25890
25891    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
25892      // FP min/max are associative except for NaN and -0.0. We do not
25893      // have to rule out -0.0 here because the intrinsic semantics do not
25894      // specify a fixed result for it.
25895      return I->getFastMathFlags().noNaNs() ? ReductionOrdering::Unordered
25896                                            : ReductionOrdering::Ordered;
25897    }
25898
25899    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
25900      return ReductionOrdering::Unordered;
25901
25902    if (I->isAssociative())
25903      return ReductionOrdering::Unordered;
25904
    // Commutative-but-not-associative ops can still be reduced in order.
25905    return ::isCommutative(I) ? ReductionOrdering::Ordered
25906                              : ReductionOrdering::None;
25907  }
25908
25909 static Value *getRdxOperand(Instruction *I, unsigned Index) {
25910 // Poison-safe 'or' takes the form: select X, true, Y
25911 // To make that work with the normal operand processing, we skip the
25912 // true value operand.
25913 // TODO: Change the code and data structures to handle this without a hack.
25914 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
25915 return I->getOperand(2);
25916 return I->getOperand(Index);
25917 }
25918
25919  /// Creates reduction operation with the current opcode.
  // Emits one scalar reduction step LHS <op> RHS. When \p UseSelect is set and
  // the type is an i1 compare result, and/or are emitted in their poison-safe
  // select forms.
  // NOTE(review): scrape is missing line 25958 (the predicate initializer for
  // the integer min/max select form) and line 25970 (the intrinsic ID
  // initializer for the FP min/max case); verify upstream.
25920  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
25921                         Value *RHS, const Twine &Name, bool UseSelect) {
25922    Type *OpTy = LHS->getType();
25923    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
25924    switch (Kind) {
25925    case RecurKind::Or: {
      // Poison-safe or: select LHS, true, RHS.
25926      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
25927        return Builder.CreateSelectWithUnknownProfile(
25928            LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
25929            RHS, DEBUG_TYPE, Name);
25930      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
25931      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
25932                                 Name);
25933    }
25934    case RecurKind::And: {
      // Poison-safe and: select LHS, RHS, false.
25935      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
25936        return Builder.CreateSelectWithUnknownProfile(
25937            LHS, RHS,
25938            ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
25939            DEBUG_TYPE, Name);
25940      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
25941      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
25942                                 Name);
25943    }
25944    case RecurKind::Add:
25945    case RecurKind::Mul:
25946    case RecurKind::Xor:
25947    case RecurKind::FAdd:
25948    case RecurKind::FMul: {
25949      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
25950      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
25951                                 Name);
25952    }
25953    case RecurKind::SMax:
25954    case RecurKind::SMin:
25955    case RecurKind::UMax:
25956    case RecurKind::UMin:
      // Integer min/max: either cmp+select (UseSelect) or, falling through,
      // the corresponding binary intrinsic.
25957      if (UseSelect) {
25959        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
25960        return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
25961                                                      Name);
25962      }
25963      [[fallthrough]];
25964    case RecurKind::FMax:
25965    case RecurKind::FMin:
25966    case RecurKind::FMaximum:
25967    case RecurKind::FMinimum:
25968    case RecurKind::FMaximumNum:
25969    case RecurKind::FMinimumNum: {
25971      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
25972    }
25973    default:
25974      llvm_unreachable("Unknown reduction operation.");
25975    }
25976  }
25977
25978  /// Creates reduction operation with the current opcode with the IR flags
25979  /// from \p ReductionOps, dropping nuw/nsw flags.
  // NOTE(review): line 25991 (the condition guarding the cmp+select flag
  // propagation branch below) is missing from this scrape; verify upstream.
25980  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
25981                         Value *RHS, const Twine &Name,
25982                         const ReductionOpsListType &ReductionOps) {
    // Two op lists means cmp+select pairs; a single list of selects means
    // poison-safe logical and/or — both use the select lowering.
25983    bool UseSelect = ReductionOps.size() == 2 ||
25984                     // Logical or/and.
25985                     (ReductionOps.size() == 1 &&
25986                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
25987    assert((!UseSelect || ReductionOps.size() != 2 ||
25988            isa<SelectInst>(ReductionOps[1][0])) &&
25989           "Expected cmp + select pairs for reduction");
25990    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
25992      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        // Propagate flags onto the compare and the select separately.
25993        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
25994                         /*IncludeWrapFlags=*/false);
25995        propagateIRFlags(Op, ReductionOps[1], nullptr,
25996                         /*IncludeWrapFlags=*/false);
25997        return Op;
25998      }
25999    }
26000    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
26001    return Op;
26002  }
26003
26004public:
  // Determines which RecurKind \p V represents, matching both plain binops
  // and (for integer min/max) either cmp+select or intrinsic forms.
  // NOTE(review): this scrape is missing the second halves of several ||
  // conditions (lines 26014, 26017 — presumably the logical-and/or forms),
  // the FMax/FMin match lines (26026, 26028), and the bodies of the
  // select-pattern guards (26070-26071, 26074-26075, 26078, 26081-26082);
  // verify upstream before editing.
26005  static RecurKind getRdxKind(Value *V) {
26006    auto *I = dyn_cast<Instruction>(V);
26007    if (!I)
26008      return RecurKind::None;
26009    if (match(I, m_Add(m_Value(), m_Value())))
26010      return RecurKind::Add;
26011    if (match(I, m_Mul(m_Value(), m_Value())))
26012      return RecurKind::Mul;
26013    if (match(I, m_And(m_Value(), m_Value())) ||
26015      return RecurKind::And;
26016    if (match(I, m_Or(m_Value(), m_Value())) ||
26018      return RecurKind::Or;
26019    if (match(I, m_Xor(m_Value(), m_Value())))
26020      return RecurKind::Xor;
26021    if (match(I, m_FAdd(m_Value(), m_Value())))
26022      return RecurKind::FAdd;
26023    if (match(I, m_FMul(m_Value(), m_Value())))
26024      return RecurKind::FMul;
26025
26027      return RecurKind::FMax;
26029      return RecurKind::FMin;
26030
26031    if (match(I, m_FMaximum(m_Value(), m_Value())))
26032      return RecurKind::FMaximum;
26033    if (match(I, m_FMinimum(m_Value(), m_Value())))
26034      return RecurKind::FMinimum;
26035    // This matches either cmp+select or intrinsics. SLP is expected to handle
26036    // either form.
26037    // TODO: If we are canonicalizing to intrinsics, we can remove several
26038    // special-case paths that deal with selects.
26039    if (match(I, m_SMax(m_Value(), m_Value())))
26040      return RecurKind::SMax;
26041    if (match(I, m_SMin(m_Value(), m_Value())))
26042      return RecurKind::SMin;
26043    if (match(I, m_UMax(m_Value(), m_Value())))
26044      return RecurKind::UMax;
26045    if (match(I, m_UMin(m_Value(), m_Value())))
26046      return RecurKind::UMin;
26047
26048    if (auto *Select = dyn_cast<SelectInst>(I)) {
26049      // Try harder: look for min/max pattern based on instructions producing
26050      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
26051      // During the intermediate stages of SLP, it's very common to have
26052      // pattern like this (since optimizeGatherSequence is run only once
26053      // at the end):
26054      // %1 = extractelement <2 x i32> %a, i32 0
26055      // %2 = extractelement <2 x i32> %a, i32 1
26056      // %cond = icmp sgt i32 %1, %2
26057      // %3 = extractelement <2 x i32> %a, i32 0
26058      // %4 = extractelement <2 x i32> %a, i32 1
26059      // %select = select i1 %cond, i32 %3, i32 %4
26060      CmpPredicate Pred;
26061      Instruction *L1;
26062      Instruction *L2;
26063
26064      Value *LHS = Select->getTrueValue();
26065      Value *RHS = Select->getFalseValue();
26066      Value *Cond = Select->getCondition();
26067
26068      // TODO: Support inverse predicates.
26069      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
26072          return RecurKind::None;
26073      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
26076          return RecurKind::None;
26077      } else {
26079          return RecurKind::None;
26080        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
26083          return RecurKind::None;
26084      }
26085
      // Map the matched compare predicate to the corresponding min/max kind.
26086      switch (Pred) {
26087      default:
26088        return RecurKind::None;
26089      case CmpInst::ICMP_SGT:
26090      case CmpInst::ICMP_SGE:
26091        return RecurKind::SMax;
26092      case CmpInst::ICMP_SLT:
26093      case CmpInst::ICMP_SLE:
26094        return RecurKind::SMin;
26095      case CmpInst::ICMP_UGT:
26096      case CmpInst::ICMP_UGE:
26097        return RecurKind::UMax;
26098      case CmpInst::ICMP_ULT:
26099      case CmpInst::ICMP_ULE:
26100        return RecurKind::UMin;
26101      }
26102    }
26103    return RecurKind::None;
26104  }
26105
26106 /// Get the index of the first operand.
26107 static unsigned getFirstOperandIndex(Instruction *I) {
26108 return isCmpSelMinMax(I) ? 1 : 0;
26109 }
26110
26111private:
26112 /// Total number of operands in the reduction operation.
26113 static unsigned getNumberOfOperands(Instruction *I) {
26114 return isCmpSelMinMax(I) ? 3 : 2;
26115 }
26116
26117 /// Checks if the instruction is in basic block \p BB.
26118 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
26119 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
26120 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
26121 auto *Sel = cast<SelectInst>(I);
26122 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
26123 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
26124 }
26125 return I->getParent() == BB;
26126 }
26127
26128 /// Expected number of uses for reduction operations/reduced values.
26129 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
26130 if (IsCmpSelMinMax) {
26131 // SelectInst must be used twice while the condition op must have single
26132 // use only.
26133 if (auto *Sel = dyn_cast<SelectInst>(I))
26134 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
26135 return I->hasNUses(2);
26136 }
26137
26138 // Arithmetic reduction operation must be used once only.
26139 return I->hasOneUse();
26140 }
26141
26142 /// Initializes the list of reduction operations.
26143 void initReductionOps(Instruction *I) {
26144 if (isCmpSelMinMax(I))
26145 ReductionOps.assign(2, ReductionOpsType());
26146 else
26147 ReductionOps.assign(1, ReductionOpsType());
26148 }
26149
26150 /// Add all reduction operations for the reduction instruction \p I.
26151 void addReductionOps(Instruction *I) {
26152 if (isCmpSelMinMax(I)) {
26153 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
26154 ReductionOps[1].emplace_back(I);
26155 } else {
26156 ReductionOps[0].emplace_back(I);
26157 }
26158 }
26159
26160 static bool isGoodForReduction(ArrayRef<Value *> Data) {
26161 int Sz = Data.size();
26162 auto *I = dyn_cast<Instruction>(Data.front());
26163 return Sz > 1 || isConstant(Data.front()) ||
26164 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
26165 }
26166
26167 /// Optimizes original placement of the reduced values for the reduction tree.
26168 /// For example, if there is a zext i1 + selects, we can merge select
26169 /// into zext and improve emission of the reductions.
26170 void optimizeReducedVals(BoUpSLP &R, DominatorTree &DT, const DataLayout &DL,
26171 const TargetTransformInfo &TTI,
26172 const TargetLibraryInfo &TLI) {
26173 SmallDenseMap<unsigned, unsigned> UsedReductionOpIds;
26174 for (const auto [Idx, Vals] : enumerate(ReducedVals)) {
26175 if (auto *I = dyn_cast<Instruction>(Vals.front()))
26176 UsedReductionOpIds.try_emplace(I->getOpcode(), Idx);
26177 }
26178 // Check if zext i1 can be merged with select.
26179 auto ZExtIt = UsedReductionOpIds.find(Instruction::ZExt);
26180 auto SelectIt = UsedReductionOpIds.find(Instruction::Select);
26181 if (ZExtIt != UsedReductionOpIds.end() &&
26182 SelectIt != UsedReductionOpIds.end()) {
26183 unsigned ZExtIdx = ZExtIt->second;
26184 unsigned SelectIdx = SelectIt->second;
26185 auto *ZExt = cast<ZExtInst>(ReducedVals[ZExtIdx].front());
26186 // ZExt is compatible with Select? Merge select to zext, if so.
26187 if (ZExt->getSrcTy()->isIntegerTy(1) &&
26188 ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) {
26189 ReducedVals[ZExtIdx].append(ReducedVals[SelectIdx]);
26190 ReducedVals.erase(std::next(ReducedVals.begin(), SelectIdx));
26191 }
26192 }
26193 // Merge 1 element reduced value groups into larger group of shl, if only 2
26194 // groups available. May trigger extra vectorization with the copyables.
26195 if (ReducedVals.size() == 2 &&
26196 (ReducedVals.front().size() == 1 || ReducedVals.back().size() == 1)) {
26197 SmallVector<Value *> Ops(ReducedVals.front().size() +
26198 ReducedVals.back().size());
26199 copy(ReducedVals.front(), Ops.begin());
26200 copy(ReducedVals.back(),
26201 std::next(Ops.begin(), ReducedVals.front().size()));
26202 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
26203 InstructionsState OpS = Analysis.buildInstructionsState(
26204 Ops, R, /*TryCopyableElementsVectorization=*/true,
26205 /*WithProfitabilityCheck=*/true);
26206 if (OpS && OpS.areInstructionsWithCopyableElements() &&
26207 OpS.getOpcode() == Instruction::Shl) {
26208 // The smallest reduced values group should be the first.
26209 if (ReducedVals.back().size() == 1 && ReducedVals.front().size() != 1)
26210 std::swap(ReducedVals.front(), ReducedVals.back());
26211 // Check if the largest reduced values group are shl and sort them by
26212 // the constant shift amount to improve chances of vectorization with
26213 // the copyables.
26214 auto Comparator = [](Value *V1, Value *V2) {
26215 ConstantInt *C1, *C2;
26216 if (!match(V1, m_Shl(m_Value(), m_ConstantInt(C1))))
26217 return false;
26218 if (!match(V2, m_Shl(m_Value(), m_ConstantInt(C2))))
26219 return true;
26220 return C1->getZExtValue() < C2->getZExtValue();
26221 };
26222 stable_sort(ReducedVals.back(), Comparator);
26223 ReducedVals.front().append(ReducedVals.back());
26224 ReducedVals.pop_back();
26225 }
26226 }
26227 }
26228
26229public:
26230  HorizontalReduction() = default;
  // Two-operand-reduction constructor: seeds the state from a single root
  // instruction and its already-collected reduced operands.
  // NOTE(review): line 26231 — the constructor signature (presumably
  // `HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)`) — is
  // missing from this scrape; verify upstream.
26232      : ReductionRoot(I), ReductionLimit(2) {
26233    RdxKind = HorizontalReduction::getRdxKind(I);
26234    ReductionOps.emplace_back().push_back(I);
26235    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    // Each operand maps back to the single reduction op I.
26236    for (Value *V : Ops)
26237      ReducedValsToOps[V].push_back(I);
26238  }
26239
26239
26240 bool matchReductionForOperands() {
26241 // Analyze "regular" integer/FP types for reductions - no target-specific
26242 // types or pointers.
26243 assert(ReductionRoot && "Reduction root is not set!");
26244 RK = isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
26245 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
26246 return Ops.size() == 2;
26247 }));
26248 return RK != ReductionOrdering::None;
26249 }
26250
26251  /// Try to find a reduction tree.
  // Walks the use-def tree rooted at \p Root, collecting same-kind reduction
  // operations and their leaf "reduced values", then groups/sorts the leaves
  // for vectorization. Returns true if a reduction was matched.
  // NOTE(review): doxygen scrape — lines 26280 (the Worklist declaration),
  // 26340 (the Ptr initializer in GenerateLoadsSubkey), 26351 (the condition
  // of the second per-load check), and 26432 (first half of the
  // getUnderlyingObject comparison) are physically missing; verify upstream.
26252  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
26253                                 ScalarEvolution &SE, DominatorTree &DT,
26254                                 const DataLayout &DL,
26255                                 const TargetTransformInfo &TTI,
26256                                 const TargetLibraryInfo &TLI) {
26257    RdxKind = HorizontalReduction::getRdxKind(Root);
26258    RK = isVectorizable(RdxKind, Root);
26259    if (RK == ReductionOrdering::None)
26260      return false;
26261
26262    // Analyze "regular" integer/FP types for reductions - no target-specific
26263    // types or pointers.
26264    Type *Ty = Root->getType();
26265    if (!isValidElementType(Ty) || Ty->isPointerTy())
26266      return false;
26267
26268    // Though the ultimate reduction may have multiple uses, its condition must
26269    // have only single use.
26270    if (auto *Sel = dyn_cast<SelectInst>(Root))
26271      if (!Sel->getCondition()->hasOneUse())
26272        RK = ReductionOrdering::Ordered;
26273
26274    ReductionRoot = Root;
26275
26276    // Iterate through all the operands of the possible reduction tree and
26277    // gather all the reduced values, sorting them by their value id.
26278    BasicBlock *BB = Root->getParent();
26279    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
26281        1, std::make_pair(Root, 0));
26282    SmallVector<std::pair<Instruction *, unsigned>> PossibleOrderedReductionOps;
26283    // Checks if the operands of the \p TreeN instruction are also reduction
26284    // operations or should be treated as reduced values or an extra argument,
26285    // which is not part of the reduction.
26286    auto CheckOperands = [&](Instruction *TreeN,
26287                             SmallVectorImpl<Value *> &PossibleReducedVals,
26288                             SmallVectorImpl<Instruction *> &ReductionOps,
26289                             unsigned Level) {
26290      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
26291                                    getNumberOfOperands(TreeN)))) {
26292        Value *EdgeVal = getRdxOperand(TreeN, I);
26293        ReducedValsToOps[EdgeVal].push_back(TreeN);
26294        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
26295        // If the edge is not an instruction, or it is different from the main
26296        // reduction opcode or has too many uses - possible reduced value.
26297        // Also, do not try to reduce const values, if the operation is not
26298        // foldable.
26299        bool IsReducedVal = !EdgeInst || Level > RecursionMaxDepth ||
26300                            getRdxKind(EdgeInst) != RdxKind ||
26301                            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst);
26302        ReductionOrdering CurrentRK = IsReducedVal
26303                                          ? ReductionOrdering::None
26304                                          : isVectorizable(RdxKind, EdgeInst);
        // Extra-use ops are demoted to reduced values, but remembered as
        // candidates for a later ordered-reduction retry.
26305        if (!IsReducedVal && CurrentRK == ReductionOrdering::Unordered &&
26306            RK == ReductionOrdering::Unordered &&
26307            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst)) {
26308          IsReducedVal = true;
26309          CurrentRK = ReductionOrdering::None;
26310          if (PossibleReducedVals.size() < ReductionLimit)
26311            PossibleOrderedReductionOps.emplace_back(EdgeInst, Level);
26312        }
26313        if (CurrentRK == ReductionOrdering::None ||
26314            (R.isAnalyzedReductionRoot(EdgeInst) &&
26315             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
26316          PossibleReducedVals.push_back(EdgeVal);
26317          continue;
26318        }
        // Any ordered edge makes the whole reduction ordered.
26319        if (CurrentRK == ReductionOrdering::Ordered)
26320          RK = ReductionOrdering::Ordered;
26321        ReductionOps.push_back(EdgeInst);
26322      }
26323    };
26324    // Try to regroup reduced values so that it gets more profitable to try to
26325    // reduce them. Values are grouped by their value ids, instructions - by
26326    // instruction op id and/or alternate op id, plus do extra analysis for
26327    // loads (grouping them by the distance between pointers) and cmp
26328    // instructions (grouping them by the predicate).
26329    SmallMapVector<
26330        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
26331        8>
26332        PossibleReducedVals;
26333    initReductionOps(Root);
26334    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
26335    SmallSet<size_t, 2> LoadKeyUsed;
26336
    // Sub-key generator for loads: groups loads by parent block and base
    // pointer, preferring an already-seen pointer with a known distance.
26337    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
26338      Key = hash_combine(hash_value(LI->getParent()->getNumber()), Key);
26339      Value *Ptr =
26341      if (!LoadKeyUsed.insert(Key).second) {
26342        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
26343        if (LIt != LoadsMap.end()) {
26344          for (LoadInst *RLI : LIt->second) {
26345            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
26346                                LI->getType(), LI->getPointerOperand(), DL, SE,
26347                                /*StrictCheck=*/true))
26348              return hash_value(RLI->getPointerOperand());
26349          }
26350          for (LoadInst *RLI : LIt->second) {
26352                                     LI->getPointerOperand(), TLI)) {
26353              hash_code SubKey = hash_value(RLI->getPointerOperand());
26354              return SubKey;
26355            }
26356          }
26357          if (LIt->second.size() > 2) {
26358            hash_code SubKey =
26359                hash_value(LIt->second.back()->getPointerOperand());
26360            return SubKey;
26361          }
26362        }
26363      }
26364      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
26365          .first->second.push_back(LI);
26366      return hash_value(LI->getPointerOperand());
26367    };
26368
    // DFS over the reduction tree, harvesting leaves into
    // ReducedValsCandidates and interior ops via addReductionOps.
26369    SmallVector<Value *> ReducedValsCandidates;
26370    while (!Worklist.empty()) {
26371      auto [TreeN, Level] = Worklist.pop_back_val();
26372      SmallVector<Value *> PossibleRedVals;
26373      SmallVector<Instruction *> PossibleReductionOps;
26374      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
26375      addReductionOps(TreeN);
26376      ReducedValsCandidates.append(PossibleRedVals.begin(),
26377                                   PossibleRedVals.end());
26378      for (Instruction *I : reverse(PossibleReductionOps))
26379        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
26380      // If not enough elements for unordered vectorization, check if there are
26381      // potential candidates for the ordered vectorization and try to add them
26382      // to the worklist.
26383      if (Worklist.empty() && ReducedValsCandidates.size() < ReductionLimit &&
26384          !PossibleOrderedReductionOps.empty() &&
26385          RK == ReductionOrdering::Unordered) {
26386        RK = ReductionOrdering::Ordered;
26387        SmallPtrSet<const Instruction *, 4> Ops;
26388        for (const auto &P : PossibleOrderedReductionOps)
26389          Ops.insert(P.first);
26390        erase_if(ReducedValsCandidates, [&](Value *V) {
26391          auto *I = dyn_cast<Instruction>(V);
26392          return I && Ops.contains(I);
26393        });
26394        Worklist.append(PossibleOrderedReductionOps.begin(),
26395                        PossibleOrderedReductionOps.end());
26396        PossibleOrderedReductionOps.clear();
26397      }
26398    }
26399    // Add reduction values. The values are sorted for better vectorization
26400    // results.
26401    for (Value *V : ReducedValsCandidates) {
26402      size_t Key, Idx;
26403      std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
26404                                             /*AllowAlternate=*/false);
26405      ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
26406    }
26407    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
26408    // Sort values by the total number of values kinds to start the reduction
26409    // from the longest possible reduced values sequences.
26410    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
26411      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
26412      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
26413      for (auto &Slice : PossibleRedVals) {
26414        PossibleRedValsVect.emplace_back();
26415        auto RedValsVect = Slice.second.takeVector();
26416        stable_sort(RedValsVect, llvm::less_second());
26417        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
26418          PossibleRedValsVect.back().append(Data.second, Data.first);
26419      }
26420      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
26421        return P1.size() > P2.size();
26422      });
26423      bool First = true;
26424      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
26425        if (First) {
26426          First = false;
26427          ReducedVals.emplace_back();
26428        } else if (!isGoodForReduction(Data)) {
          // Weak groups are merged into the previous one unless the loads
          // clearly come from different underlying objects.
26429          auto *LI = dyn_cast<LoadInst>(Data.front());
26430          auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
26431          if (!LI || !LastLI ||
26433                  getUnderlyingObject(LastLI->getPointerOperand()))
26434            ReducedVals.emplace_back();
26435        }
26436        ReducedVals.back().append(Data.rbegin(), Data.rend());
26437      }
26438    }
26439    // Post optimize reduced values to get better reduction sequences and sort
26440    // them by size.
26441    optimizeReducedVals(R, DT, DL, TTI, TLI);
26442    // Sort the reduced values by number of same/alternate opcode and/or pointer
26443    // operand.
26444    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
26445      return P1.size() > P2.size();
26446    });
26447    return true;
26448  }
26449
26450 /// Attempt to vectorize the tree found by matchAssociativeReduction.
26451 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
26452 const TargetLibraryInfo &TLI, AssumptionCache *AC,
26453 DominatorTree &DT) {
26454 constexpr unsigned RegMaxNumber = 4;
26455 constexpr unsigned RedValsMaxNumber = 128;
26456 // If there are a sufficient number of reduction values, reduce
26457 // to a nearby power-of-2. We can safely generate oversized
26458 // vectors and rely on the backend to split them to legal sizes.
26459 if (unsigned NumReducedVals = std::accumulate(
26460 ReducedVals.begin(), ReducedVals.end(), 0,
26461 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
26462 if (!isGoodForReduction(Vals))
26463 return Num;
26464 return Num + Vals.size();
26465 });
26466 NumReducedVals < ReductionLimit &&
26467 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
26468 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
26469 })) {
26470 for (ReductionOpsType &RdxOps : ReductionOps)
26471 for (Value *RdxOp : RdxOps)
26472 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
26473 return nullptr;
26474 }
26475
26476 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
26477 TargetFolder(DL));
26478 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
26479
26480 // Track the reduced values in case if they are replaced by extractelement
26481 // because of the vectorization.
26482 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
26483 ReducedVals.front().size());
26484
26485 // The compare instruction of a min/max is the insertion point for new
26486 // instructions and may be replaced with a new compare instruction.
26487 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
26488 assert(isa<SelectInst>(RdxRootInst) &&
26489 "Expected min/max reduction to have select root instruction");
26490 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
26491 assert(isa<Instruction>(ScalarCond) &&
26492 "Expected min/max reduction to have compare condition");
26493 return cast<Instruction>(ScalarCond);
26494 };
26495
26496 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
26497 return isBoolLogicOp(cast<Instruction>(V));
26498 });
26499 // Return new VectorizedTree, based on previous value.
26500 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
26501 if (VectorizedTree) {
26502 // Update the final value in the reduction.
26504 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
26505 if (AnyBoolLogicOp) {
26506 auto It = ReducedValsToOps.find(VectorizedTree);
26507 auto It1 = ReducedValsToOps.find(Res);
26508 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
26509 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
26510 (It != ReducedValsToOps.end() &&
26511 any_of(It->getSecond(), [&](Instruction *I) {
26512 return isBoolLogicOp(I) &&
26513 getRdxOperand(I, 0) == VectorizedTree;
26514 }))) {
26515 ;
26516 } else if (isGuaranteedNotToBePoison(Res, AC) ||
26517 (It1 != ReducedValsToOps.end() &&
26518 any_of(It1->getSecond(), [&](Instruction *I) {
26519 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
26520 }))) {
26521 std::swap(VectorizedTree, Res);
26522 } else {
26523 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
26524 }
26525 }
26526
26527 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
26528 ReductionOps);
26529 }
26530 // Initialize the final value in the reduction.
26531 return Res;
26532 };
26533 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
26534 ReductionOps.front().size());
26535 for (ReductionOpsType &RdxOps : ReductionOps)
26536 for (Value *RdxOp : RdxOps) {
26537 if (!RdxOp)
26538 continue;
26539 IgnoreList.insert(RdxOp);
26540 }
26541 // Intersect the fast-math-flags from all reduction operations.
26542 FastMathFlags RdxFMF;
26543 RdxFMF.set();
26544 for (Value *U : IgnoreList)
26545 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
26546 RdxFMF &= FPMO->getFastMathFlags();
26547 // For ordered reductions here we need to generate extractelement
26548 // instructions, so clear IgnoreList.
26549 if (RK == ReductionOrdering::Ordered)
26550 IgnoreList.clear();
26551 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
26552
26553 // Need to track reduced vals, they may be changed during vectorization of
26554 // subvectors.
26555 for (ArrayRef<Value *> Candidates : ReducedVals)
26556 for (Value *V : Candidates)
26557 TrackedVals.try_emplace(V, V);
26558
26559 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
26560 Value *V) -> unsigned & {
26561 auto *It = MV.find(V);
26562 assert(It != MV.end() && "Unable to find given key.");
26563 return It->second;
26564 };
26565
26566 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
26567 // List of the values that were reduced in other trees as part of gather
26568 // nodes and thus requiring extract if fully vectorized in other trees.
26569 SmallPtrSet<Value *, 4> RequiredExtract;
26570 WeakTrackingVH VectorizedTree = nullptr;
26571 bool CheckForReusedReductionOps = false;
26572 // Try to vectorize elements based on their type.
26574 SmallVector<SmallVector<Value *>> LocalReducedVals;
26575 // Try merge consecutive reduced values into a single vectorizable group and
26576 // check, if they can be vectorized as copyables.
26577 const bool TwoGroupsOnly = ReducedVals.size() == 2;
26578 const bool TwoGroupsOfSameSmallSize =
26579 TwoGroupsOnly &&
26580 ReducedVals.front().size() == ReducedVals.back().size() &&
26581 ReducedVals.front().size() < ReductionLimit;
26582 for (ArrayRef<Value *> RV : ReducedVals) {
26583 // Loads are not very compatible with undefs.
26584 if (isa<UndefValue>(RV.front()) &&
26585 (States.empty() || !States.back() ||
26586 States.back().getOpcode() == Instruction::Load)) {
26587 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
26588 States.push_back(InstructionsState::invalid());
26589 continue;
26590 }
26591 if (!LocalReducedVals.empty() &&
26592 isa<UndefValue>(LocalReducedVals.back().front()) &&
26593 isa<LoadInst>(RV.front())) {
26594 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
26595 States.push_back(getSameOpcode(RV, TLI));
26596 continue;
26597 }
26598 // Do some copyables analysis only if more than 2 groups exist or they
26599 // are large enough.
26600 if (!TwoGroupsOfSameSmallSize) {
26602 if (!LocalReducedVals.empty())
26603 Ops = LocalReducedVals.back();
26604 Ops.append(RV.begin(), RV.end());
26605 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
26606 InstructionsState OpS = Analysis.buildInstructionsState(
26607 Ops, V, /*WithProfitabilityCheck=*/true,
26608 /*SkipSameCodeCheck=*/true);
26609 if (OpS && OpS.areInstructionsWithCopyableElements()) {
26610 if (LocalReducedVals.empty()) {
26611 LocalReducedVals.push_back(Ops);
26612 States.push_back(OpS);
26613 continue;
26614 }
26615 LocalReducedVals.back().swap(Ops);
26616 States.back() = OpS;
26617 continue;
26618 }
26619 // For safety, allow split vectorization only if 2 groups are available
26620 // overall.
26621 if (TwoGroupsOnly) {
26622 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(Ops);
26623 OpS = InstructionsState(MainOp, AltOp);
26624 // Last chance to try to vectorize alternate node.
26625 SmallVector<Value *> Op1, Op2;
26626 BoUpSLP::OrdersType ReorderIndices;
26627 if (MainOp && AltOp &&
26628 V.canBuildSplitNode(Ops, OpS, Op1, Op2, ReorderIndices)) {
26629 if (LocalReducedVals.empty()) {
26630 LocalReducedVals.push_back(Ops);
26631 States.push_back(OpS);
26632 continue;
26633 }
26634 LocalReducedVals.back().swap(Ops);
26635 States.back() = OpS;
26636 continue;
26637 }
26638 }
26639 }
26640 LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
26641 States.push_back(getSameOpcode(RV, TLI));
26642 }
26643 ReducedVals.swap(LocalReducedVals);
26644 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
26645 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
26646 InstructionsState S = States[I];
26647 SmallVector<Value *> Candidates;
26648 Candidates.reserve(2 * OrigReducedVals.size());
26649 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
26650 for (Value *ReducedVal : OrigReducedVals) {
26651 Value *RdxVal = TrackedVals.at(ReducedVal);
26652 // Check if the reduction value was not overriden by the extractelement
26653 // instruction because of the vectorization and exclude it, if it is not
26654 // compatible with other values.
26655 // Also check if the instruction was folded to constant/other value.
26656 auto *Inst = dyn_cast<Instruction>(RdxVal);
26657 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
26658 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
26659 !S.isCopyableElement(Inst)))) ||
26660 (S && !Inst && !isa<PoisonValue>(RdxVal) &&
26661 !S.isCopyableElement(RdxVal)))
26662 continue;
26663 Candidates.push_back(RdxVal);
26664 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
26665 }
26666 bool ShuffledExtracts = false;
26667 // Try to handle shuffled extractelements.
26668 if (S && S.getOpcode() == Instruction::ExtractElement &&
26669 !S.isAltShuffle() && I + 1 < E) {
26670 SmallVector<Value *> CommonCandidates(Candidates);
26671 for (Value *RV : ReducedVals[I + 1]) {
26672 Value *RdxVal = TrackedVals.at(RV);
26673 // Check if the reduction value was not overriden by the
26674 // extractelement instruction because of the vectorization and
26675 // exclude it, if it is not compatible with other values.
26676 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
26677 if (!Inst)
26678 continue;
26679 CommonCandidates.push_back(RdxVal);
26680 TrackedToOrig.try_emplace(RdxVal, RV);
26681 }
26682 SmallVector<int> Mask;
26683 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
26684 ++I;
26685 Candidates.swap(CommonCandidates);
26686 ShuffledExtracts = true;
26687 }
26688 }
26689
26690 // Emit code for constant values.
26691 if (Candidates.size() > 1 && allConstant(Candidates)) {
26692 if (RK == ReductionOrdering::Ordered)
26693 continue;
26694 Value *Res = Candidates.front();
26695 Value *OrigV = TrackedToOrig.at(Candidates.front());
26696 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
26697 for (Value *VC : ArrayRef(Candidates).drop_front()) {
26698 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
26699 Value *OrigV = TrackedToOrig.at(VC);
26700 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
26701 if (auto *ResI = dyn_cast<Instruction>(Res))
26702 V.analyzedReductionRoot(ResI);
26703 }
26704 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
26705 continue;
26706 }
26707
26708 unsigned NumReducedVals = Candidates.size();
26709 if (NumReducedVals < ReductionLimit &&
26710 (NumReducedVals < 2 || !isSplat(Candidates)))
26711 continue;
26712
26713 // Check if we support repeated scalar values processing (optimization of
26714 // original scalar identity operations on matched horizontal reductions).
26715 IsSupportedHorRdxIdentityOp =
26716 RK == ReductionOrdering::Unordered && RdxKind != RecurKind::Mul &&
26717 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
26718 // Gather same values.
26719 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
26720 if (IsSupportedHorRdxIdentityOp)
26721 for (Value *V : Candidates) {
26722 Value *OrigV = TrackedToOrig.at(V);
26723 ++SameValuesCounter.try_emplace(OrigV).first->second;
26724 }
26725 // Used to check if the reduced values used same number of times. In this
26726 // case the compiler may produce better code. E.g. if reduced values are
26727 // aabbccdd (8 x values), then the first node of the tree will have a node
26728 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
26729 // Plus, the final reduction will be performed on <8 x aabbccdd>.
26730 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
26731 // x abcd) * 2.
26732 // Currently it only handles add/fadd/xor. and/or/min/max do not require
26733 // this analysis, other operations may require an extra estimation of
26734 // the profitability.
26735 bool SameScaleFactor = false;
26736 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
26737 SameValuesCounter.size() != Candidates.size();
26738 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
26739 if (OptReusedScalars) {
26740 SameScaleFactor =
26741 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
26742 RdxKind == RecurKind::Xor) &&
26743 all_of(drop_begin(SameValuesCounter),
26744 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
26745 return P.second == SameValuesCounter.front().second;
26746 });
26747 Candidates.resize(SameValuesCounter.size());
26748 transform(SameValuesCounter, Candidates.begin(),
26749 [&](const auto &P) { return TrackedVals.at(P.first); });
26750 NumReducedVals = Candidates.size();
26751 // Have a reduction of the same element.
26752 if (NumReducedVals == 1) {
26753 Value *OrigV = TrackedToOrig.at(Candidates.front());
26754 unsigned Cnt = At(SameValuesCounter, OrigV);
26755 Value *RedVal =
26756 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
26757 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
26758 VectorizedVals.try_emplace(OrigV, Cnt);
26759 ExternallyUsedValues.insert(OrigV);
26760 continue;
26761 }
26762 }
26763
26764 unsigned MaxVecRegSize = V.getMaxVecRegSize();
26765 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
26766 const unsigned MaxElts = std::clamp<unsigned>(
26767 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
26768 RegMaxNumber * RedValsMaxNumber);
26769
26770 unsigned ReduxWidth = NumReducedVals;
26771 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
26772 unsigned NumParts, NumRegs;
26773 Type *ScalarTy = Candidates.front()->getType();
26774 ReduxWidth =
26775 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
26776 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
26777 NumParts = ::getNumberOfParts(TTI, Tp);
26778 NumRegs =
26780 while (NumParts > NumRegs) {
26781 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
26782 ReduxWidth = bit_floor(ReduxWidth - 1);
26783 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
26784 NumParts = ::getNumberOfParts(TTI, Tp);
26785 NumRegs =
26787 }
26788 if (NumParts > NumRegs / 2)
26789 ReduxWidth = bit_floor(ReduxWidth);
26790 return ReduxWidth;
26791 };
26792 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
26793 ReduxWidth = GetVectorFactor(ReduxWidth);
26794 ReduxWidth = std::min(ReduxWidth, MaxElts);
26795
26796 unsigned Start = 0;
26797 unsigned Pos = Start;
26798 // Restarts vectorization attempt with lower vector factor.
26799 unsigned PrevReduxWidth = ReduxWidth;
26800 bool CheckForReusedReductionOpsLocal = false;
26801 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
26802 bool IsAnyRedOpGathered =
26803 !IgnoreVL &&
26804 (RK == ReductionOrdering::Ordered || V.isAnyGathered(IgnoreList));
26805 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
26806 // Check if any of the reduction ops are gathered. If so, worth
26807 // trying again with less number of reduction ops.
26808 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
26809 }
26810 ++Pos;
26811 if (Pos < NumReducedVals - ReduxWidth + 1)
26812 return IsAnyRedOpGathered;
26813 Pos = Start;
26814 --ReduxWidth;
26815 if (ReduxWidth > 1)
26816 ReduxWidth = GetVectorFactor(ReduxWidth);
26817 return IsAnyRedOpGathered;
26818 };
26819 bool AnyVectorized = false;
26820 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
26821 while (Pos < NumReducedVals - ReduxWidth + 1 &&
26822 ReduxWidth >= ReductionLimit) {
26823 // Dependency in tree of the reduction ops - drop this attempt, try
26824 // later.
26825 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
26826 Start == 0) {
26827 CheckForReusedReductionOps = true;
26828 break;
26829 }
26830 PrevReduxWidth = ReduxWidth;
26831 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
26832 // Been analyzed already - skip.
26833 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
26834 (!has_single_bit(ReduxWidth) &&
26835 (IgnoredCandidates.contains(
26836 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
26837 IgnoredCandidates.contains(
26838 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
26839 bit_floor(ReduxWidth))))) ||
26840 V.areAnalyzedReductionVals(VL)) {
26841 (void)AdjustReducedVals(/*IgnoreVL=*/true);
26842 continue;
26843 }
26844 // Early exit if any of the reduction values were deleted during
26845 // previous vectorization attempts.
26846 if (any_of(VL, [&V](Value *RedVal) {
26847 auto *RedValI = dyn_cast<Instruction>(RedVal);
26848 return RedValI && V.isDeleted(RedValI);
26849 }))
26850 break;
26851 if (RK == ReductionOrdering::Ordered)
26852 V.buildTree(VL);
26853 else
26854 V.buildTree(VL, IgnoreList);
26855 if (V.isTreeTinyAndNotFullyVectorizable(RK ==
26856 ReductionOrdering::Unordered)) {
26857 if (!AdjustReducedVals())
26858 V.analyzedReductionVals(VL);
26859 continue;
26860 }
26861 V.reorderTopToBottom();
26862 // No need to reorder the root node at all for reassociative reduction.
26863 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
26864 VL.front()->getType()->isIntOrIntVectorTy() ||
26865 ReductionLimit > 2 ||
26866 RK == ReductionOrdering::Ordered);
26867 // Keep extracted other reduction values, if they are used in the
26868 // vectorization trees.
26869 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
26870 ExternallyUsedValues);
26871 // The reduction root is used as the insertion point for new
26872 // instructions, so set it as externally used to prevent it from being
26873 // deleted.
26874 LocalExternallyUsedValues.insert(ReductionRoot);
26875 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
26876 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
26877 continue;
26878 for (Value *V : ReducedVals[Cnt])
26879 if (isa<Instruction>(V))
26880 LocalExternallyUsedValues.insert(TrackedVals[V]);
26881 }
26882 if (!IsSupportedHorRdxIdentityOp) {
26883 // Number of uses of the candidates in the vector of values.
26884 assert(SameValuesCounter.empty() &&
26885 "Reused values counter map is not empty");
26886 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26887 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26888 continue;
26889 Value *V = Candidates[Cnt];
26890 Value *OrigV = TrackedToOrig.at(V);
26891 ++SameValuesCounter.try_emplace(OrigV).first->second;
26892 }
26893 }
26894 V.transformNodes();
26895 V.computeMinimumValueSizes();
26896 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
26897
26898 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
26899 // Gather externally used values.
26900 SmallPtrSet<Value *, 4> Visited;
26901 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26902 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26903 continue;
26904 Value *RdxVal = Candidates[Cnt];
26905 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
26906 RdxVal = It->second;
26907 if (!Visited.insert(RdxVal).second)
26908 continue;
26909 // Check if the scalar was vectorized as part of the vectorization
26910 // tree but not the top node.
26911 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
26912 LocalExternallyUsedValues.insert(RdxVal);
26913 continue;
26914 }
26915 Value *OrigV = TrackedToOrig.at(RdxVal);
26916 unsigned NumOps =
26917 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
26918 if (NumOps != ReducedValsToOps.at(OrigV).size())
26919 LocalExternallyUsedValues.insert(RdxVal);
26920 }
26921 // Do not need the list of reused scalars in regular mode anymore.
26922 if (!IsSupportedHorRdxIdentityOp)
26923 SameValuesCounter.clear();
26924 for (Value *RdxVal : VL)
26925 if (RequiredExtract.contains(RdxVal))
26926 LocalExternallyUsedValues.insert(RdxVal);
26927 V.buildExternalUses(LocalExternallyUsedValues);
26928
26929 // Estimate cost.
26930 InstructionCost ReductionCost;
26931 if (RK == ReductionOrdering::Ordered || V.isReducedBitcastRoot() ||
26932 V.isReducedCmpBitcastRoot())
26933 ReductionCost = 0;
26934 else
26935 ReductionCost =
26936 getReductionCost(TTI, VL, SameValuesCounter, IsCmpSelMinMax,
26937 RdxFMF, V, DT, DL, TLI);
26938 InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost);
26939 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
26940 << " for reduction\n");
26941 if (!Cost.isValid())
26942 break;
26943 if (Cost >= -SLPCostThreshold) {
26944 V.getORE()->emit([&]() {
26945 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
26946 ReducedValsToOps.at(VL[0]).front())
26947 << "Vectorizing horizontal reduction is possible "
26948 << "but not beneficial with cost " << ore::NV("Cost", Cost)
26949 << " and threshold "
26950 << ore::NV("Threshold", -SLPCostThreshold);
26951 });
26952 if (!AdjustReducedVals()) {
26953 V.analyzedReductionVals(VL);
26954 unsigned Offset = Pos == Start ? Pos : Pos - 1;
26955 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
26956 // Add subvectors of VL to the list of the analyzed values.
26957 for (unsigned VF = getFloorFullVectorNumberOfElements(
26958 *TTI, VL.front()->getType(), ReduxWidth - 1);
26959 VF >= ReductionLimit;
26961 *TTI, VL.front()->getType(), VF - 1)) {
26962 if (has_single_bit(VF) &&
26963 V.getCanonicalGraphSize() != V.getTreeSize())
26964 continue;
26965 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
26966 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
26967 }
26968 }
26969 }
26970 continue;
26971 }
26972
26973 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
26974 << Cost << ". (HorRdx)\n");
26975 V.getORE()->emit([&]() {
26976 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
26977 ReducedValsToOps.at(VL[0]).front())
26978 << "Vectorized horizontal reduction with cost "
26979 << ore::NV("Cost", Cost) << " and with tree size "
26980 << ore::NV("TreeSize", V.getTreeSize());
26981 });
26982
26983 Builder.setFastMathFlags(RdxFMF);
26984
26985 // Emit a reduction. If the root is a select (min/max idiom), the insert
26986 // point is the compare condition of that select.
26987 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
26988 Instruction *InsertPt = RdxRootInst;
26989 if (IsCmpSelMinMax)
26990 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
26991
26992 // Vectorize a tree.
26993 Value *VectorizedRoot = V.vectorizeTree(
26994 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
26995 // Update TrackedToOrig mapping, since the tracked values might be
26996 // updated.
26997 for (Value *RdxVal : Candidates) {
26998 Value *OrigVal = TrackedToOrig.at(RdxVal);
26999 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
27000 if (TransformedRdxVal != RdxVal)
27001 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
27002 }
27003
27004 if (RK == ReductionOrdering::Ordered) {
27005 // No need to generate reduction here, emit extractelements instead in
27006 // the tree vectorizer.
27007 assert(VectorizedRoot && "Expected vectorized tree");
27008 // Count vectorized reduced values to exclude them from final
27009 // reduction.
27010 for (Value *RdxVal : VL)
27011 ++VectorizedVals.try_emplace(RdxVal).first->getSecond();
27012 Pos += ReduxWidth;
27013 Start = Pos;
27014 ReduxWidth = NumReducedVals - Pos;
27015 if (ReduxWidth > 1)
27016 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
27017 AnyVectorized = true;
27018 VectorizedTree = ReductionRoot;
27019 continue;
27020 }
27021 Builder.SetInsertPoint(InsertPt);
27022
27023 // To prevent poison from leaking across what used to be sequential,
27024 // safe, scalar boolean logic operations, the reduction operand must be
27025 // frozen.
27026 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
27027 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
27028
27029 // Emit code to correctly handle reused reduced values, if required.
27030 if (OptReusedScalars && !SameScaleFactor) {
27031 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
27032 SameValuesCounter, TrackedToOrig);
27033 }
27034
27035 Type *ScalarTy = VL.front()->getType();
27036 Type *VecTy = VectorizedRoot->getType();
27037 Type *RedScalarTy = VecTy->getScalarType();
27038 VectorValuesAndScales.emplace_back(
27039 VectorizedRoot,
27040 OptReusedScalars && SameScaleFactor
27041 ? SameValuesCounter.front().second
27042 : 1,
27043 RedScalarTy != ScalarTy->getScalarType()
27044 ? V.isSignedMinBitwidthRootNode()
27045 : true,
27046 V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot());
27047
27048 // Count vectorized reduced values to exclude them from final reduction.
27049 for (Value *RdxVal : VL) {
27050 Value *OrigV = TrackedToOrig.at(RdxVal);
27051 if (IsSupportedHorRdxIdentityOp) {
27052 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
27053 continue;
27054 }
27055 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
27056 if (!V.isVectorized(RdxVal))
27057 RequiredExtract.insert(RdxVal);
27058 }
27059 Pos += ReduxWidth;
27060 Start = Pos;
27061 ReduxWidth = NumReducedVals - Pos;
27062 if (ReduxWidth > 1)
27063 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
27064 AnyVectorized = true;
27065 }
27066 if (OptReusedScalars && !AnyVectorized) {
27067 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
27068 Value *RdxVal = TrackedVals.at(P.first);
27069 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
27070 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
27071 VectorizedVals.try_emplace(P.first, P.second);
27072 }
27073 continue;
27074 }
27075 }
27076 // Early exit for the ordered reductions.
27077 // No need to do anything else here, so we can just exit.
27078 if (RK == ReductionOrdering::Ordered)
27079 return VectorizedTree;
27080
27081 if (!VectorValuesAndScales.empty())
27082 VectorizedTree = GetNewVectorizedTree(
27083 VectorizedTree,
27084 emitReduction(Builder, *TTI, ReductionRoot->getType()));
27085
27086 if (!VectorizedTree) {
27087 if (!CheckForReusedReductionOps) {
27088 for (ReductionOpsType &RdxOps : ReductionOps)
27089 for (Value *RdxOp : RdxOps)
27090 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
27091 }
27092 return nullptr;
27093 }
27094
27095 // Reorder operands of bool logical op in the natural order to avoid
27096 // possible problem with poison propagation. If not possible to reorder
27097 // (both operands are originally RHS), emit an extra freeze instruction
27098 // for the LHS operand.
27099 // I.e., if we have original code like this:
27100 // RedOp1 = select i1 ?, i1 LHS, i1 false
27101 // RedOp2 = select i1 RHS, i1 ?, i1 false
27102
27103 // Then, we swap LHS/RHS to create a new op that matches the poison
27104 // semantics of the original code.
27105
27106 // If we have original code like this and both values could be poison:
27107 // RedOp1 = select i1 ?, i1 LHS, i1 false
27108 // RedOp2 = select i1 ?, i1 RHS, i1 false
27109
27110 // Then, we must freeze LHS in the new op.
27111 auto FixBoolLogicalOps =
27112 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
27113 Instruction *RedOp2, bool InitStep) {
27114 if (!AnyBoolLogicOp)
27115 return;
27116 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
27117 getRdxOperand(RedOp1, 0) == LHS ||
27119 return;
27120 bool NeedFreeze = LHS != VectorizedTree;
27121 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
27122 getRdxOperand(RedOp2, 0) == RHS ||
27124 // If RedOp2 was used as a second operand - do not swap.
27125 if ((InitStep || RHS != VectorizedTree) &&
27126 getRdxOperand(RedOp2, 0) == RHS &&
27127 ((isBoolLogicOp(RedOp1) &&
27128 getRdxOperand(RedOp1, 1) == RedOp2) ||
27129 any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
27130 return any_of(Ops, [&](Value *Op) {
27131 auto *OpI = dyn_cast<Instruction>(Op);
27132 return OpI && isBoolLogicOp(OpI) &&
27133 getRdxOperand(OpI, 1) == RedOp2;
27134 });
27135 }))) {
27136 NeedFreeze = false;
27137 } else {
27138 std::swap(LHS, RHS);
27139 return;
27140 }
27141 }
27142 if (NeedFreeze)
27143 LHS = Builder.CreateFreeze(LHS);
27144 };
27145 // Finish the reduction.
27146 // Need to add extra arguments and not vectorized possible reduction values.
27147 // Try to avoid dependencies between the scalar remainders after reductions.
27148 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
27149 bool InitStep) {
27150 unsigned Sz = InstVals.size();
27151 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
27152 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
27153 Instruction *RedOp = InstVals[I + 1].first;
27154 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
27155 Value *RdxVal1 = InstVals[I].second;
27156 Value *StableRdxVal1 = RdxVal1;
27157 auto It1 = TrackedVals.find(RdxVal1);
27158 if (It1 != TrackedVals.end())
27159 StableRdxVal1 = It1->second;
27160 Value *RdxVal2 = InstVals[I + 1].second;
27161 Value *StableRdxVal2 = RdxVal2;
27162 auto It2 = TrackedVals.find(RdxVal2);
27163 if (It2 != TrackedVals.end())
27164 StableRdxVal2 = It2->second;
27165 // To prevent poison from leaking across what used to be sequential,
27166 // safe, scalar boolean logic operations, the reduction operand must be
27167 // frozen.
27168 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
27169 RedOp, InitStep);
27170 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
27171 StableRdxVal2, "op.rdx", ReductionOps);
27172 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
27173 }
27174 if (Sz % 2 == 1)
27175 ExtraReds[Sz / 2] = InstVals.back();
27176 return ExtraReds;
27177 };
27179 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
27180 VectorizedTree);
27181 SmallPtrSet<Value *, 8> Visited;
27182 for (ArrayRef<Value *> Candidates : ReducedVals) {
27183 for (Value *RdxVal : Candidates) {
27184 if (!Visited.insert(RdxVal).second)
27185 continue;
27186 unsigned NumOps = VectorizedVals.lookup(RdxVal);
27187 for (Instruction *RedOp :
27188 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
27189 ExtraReductions.emplace_back(RedOp, RdxVal);
27190 }
27191 }
27192 // Iterate through all not-vectorized reduction values/extra arguments.
27193 bool InitStep = true;
27194 while (ExtraReductions.size() > 1) {
27196 FinalGen(ExtraReductions, InitStep);
27197 ExtraReductions.swap(NewReds);
27198 InitStep = false;
27199 }
27200 VectorizedTree = ExtraReductions.front().second;
27201
27202 ReductionRoot->replaceAllUsesWith(VectorizedTree);
27203
27204 // The original scalar reduction is expected to have no remaining
27205 // uses outside the reduction tree itself. Assert that we got this
27206 // correct, replace internal uses with undef, and mark for eventual
27207 // deletion.
27208#ifndef NDEBUG
27209 SmallPtrSet<Value *, 4> IgnoreSet;
27210 for (ArrayRef<Value *> RdxOps : ReductionOps)
27211 IgnoreSet.insert_range(RdxOps);
27212#endif
27213 for (ArrayRef<Value *> RdxOps : ReductionOps) {
27214 for (Value *Ignore : RdxOps) {
27215 if (!Ignore)
27216 continue;
27217#ifndef NDEBUG
27218 for (auto *U : Ignore->users()) {
27219 assert(IgnoreSet.count(U) &&
27220 "All users must be either in the reduction ops list.");
27221 }
27222#endif
27223 if (!Ignore->use_empty()) {
27224 Value *P = PoisonValue::get(Ignore->getType());
27225 Ignore->replaceAllUsesWith(P);
27226 }
27227 }
27228 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
27229 }
27230 return VectorizedTree;
27231 }
27232
27233private:
27234 /// Creates the reduction from the given \p Vec vector value with the given
27235 /// scale \p Scale and signedness \p IsSigned.
27236 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
27237 Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy,
27238 bool ReducedInTree) {
27239 Value *Rdx;
27240 if (ReducedInTree) {
27241 Rdx = Vec;
27242 } else if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
27243 unsigned DestTyNumElements = getNumElements(VecTy);
27244 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
27245 Rdx = PoisonValue::get(
27246 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
27247 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
27248 // Do reduction for each lane.
27249 // e.g., do reduce add for
27250 // VL[0] = <4 x Ty> <a, b, c, d>
27251 // VL[1] = <4 x Ty> <e, f, g, h>
27252 // Lane[0] = <2 x Ty> <a, e>
27253 // Lane[1] = <2 x Ty> <b, f>
27254 // Lane[2] = <2 x Ty> <c, g>
27255 // Lane[3] = <2 x Ty> <d, h>
27256 // result[0] = reduce add Lane[0]
27257 // result[1] = reduce add Lane[1]
27258 // result[2] = reduce add Lane[2]
27259 // result[3] = reduce add Lane[3]
27260 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
27261 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
27262 Rdx = Builder.CreateInsertElement(
27263 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
27264 }
27265 } else {
27266 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
27267 }
27268 if (Rdx->getType() != DestTy)
27269 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
27270 // Improved analysis for add/fadd/xor reductions with same scale
27271 // factor for all operands of reductions. We can emit scalar ops for
27272 // them instead.
27273 if (Scale > 1)
27274 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
27275 return Rdx;
27276 }
27277
27278 /// Calculate the cost of a reduction.
27279 InstructionCost getReductionCost(
27280 TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals,
27281 const SmallMapVector<Value *, unsigned, 16> SameValuesCounter,
27282 bool IsCmpSelMinMax, FastMathFlags FMF, const BoUpSLP &R,
27283 DominatorTree &DT, const DataLayout &DL, const TargetLibraryInfo &TLI) {
27285 Type *ScalarTy = ReducedVals.front()->getType();
27286 unsigned ReduxWidth = ReducedVals.size();
27287 FixedVectorType *VectorTy = R.getReductionType();
27288 InstructionCost VectorCost = 0, ScalarCost;
27289 // If all of the reduced values are constant, the vector cost is 0, since
27290 // the reduction value can be calculated at the compile time.
27291 bool AllConsts = allConstant(ReducedVals);
27292 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
27294 // Scalar cost is repeated for N-1 elements.
27295 int Cnt = ReducedVals.size();
27296 for (Value *RdxVal : ReducedVals) {
27297 if (!isa<Instruction>(RdxVal))
27298 continue;
27299 if (Cnt == 1) {
27300 unsigned SameValueCount = SameValuesCounter.lookup(RdxVal);
27301 Cost += (SameValueCount ? SameValueCount - 1 : 0) * GenCostFn();
27302 break;
27303 }
27304 --Cnt;
27305 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
27306 unsigned SameValueCount = SameValuesCounter.lookup(RdxVal);
27307 Cost += (SameValueCount ? SameValueCount : 1) * GenCostFn();
27308 continue;
27309 }
27310 InstructionCost ScalarCost = 0;
27311 for (User *U : RdxVal->users()) {
27312 auto *RdxOp = cast<Instruction>(U);
27313 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
27314 if (RdxKind == RecurKind::FAdd) {
27316 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
27317 if (FMACost.isValid()) {
27318 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
27319 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
27320 // Also, exclude scalar fmul cost.
27321 InstructionCost FMulCost =
27323 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
27324 FMACost -= FMulCost;
27325 }
27326 ScalarCost += FMACost;
27327 continue;
27328 }
27329 }
27330 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
27331 continue;
27332 }
27333 ScalarCost = InstructionCost::getInvalid();
27334 break;
27335 }
27336 if (ScalarCost.isValid())
27337 Cost += ScalarCost;
27338 else
27339 Cost += GenCostFn();
27340 }
27341 return Cost;
27342 };
27343 // Require reduction cost if:
27344 // 1. This type is not a full register type and no other vectors with the
27345 // same type in the storage (first vector with small type).
27346 // 2. The storage does not have any vector with full vector use (first
27347 // vector with full register use).
27348 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
27349 switch (RdxKind) {
27350 case RecurKind::Add:
27351 case RecurKind::Mul:
27352 case RecurKind::Or:
27353 case RecurKind::And:
27354 case RecurKind::Xor:
27355 case RecurKind::FAdd:
27356 case RecurKind::FMul: {
27357 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
27358 if (!AllConsts) {
27359 if (DoesRequireReductionOp) {
27360 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
27361 assert(SLPReVec && "FixedVectorType is not expected.");
27362 unsigned ScalarTyNumElements = VecTy->getNumElements();
27363 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
27364 VectorCost += TTI->getShuffleCost(
27367 ReducedVals.size()),
27368 VectorTy,
27369 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
27370 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
27371 FMF, CostKind);
27372 }
27373 VectorCost += TTI->getScalarizationOverhead(
27374 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
27375 /*Extract*/ false, TTI::TCK_RecipThroughput);
27376 } else {
27377 Type *RedTy = VectorTy->getElementType();
27378 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
27379 std::make_pair(RedTy, true));
27380 if (RType == RedTy) {
27381 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
27382 FMF, CostKind);
27383 } else {
27384 VectorCost = TTI->getExtendedReductionCost(
27385 RdxOpcode, !IsSigned, RedTy,
27386 getWidenedType(RType, ReduxWidth), FMF, CostKind);
27387 }
27388 }
27389 } else {
27390 Type *RedTy = VectorTy->getElementType();
27391 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
27392 std::make_pair(RedTy, true));
27393 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
27394 InstructionCost FMACost = InstructionCost::getInvalid();
27395 if (RdxKind == RecurKind::FAdd) {
27396 // Check if the reduction operands can be converted to FMA.
27398 FastMathFlags FMF;
27399 FMF.set();
27400 for (Value *RdxVal : ReducedVals) {
27401 if (!RdxVal->hasOneUse()) {
27402 Ops.clear();
27403 break;
27404 }
27405 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
27406 FMF &= FPCI->getFastMathFlags();
27407 Ops.push_back(RdxVal->user_back());
27408 }
27409 if (!Ops.empty()) {
27410 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
27411 *TTI, TLI);
27412 if (FMACost.isValid()) {
27413 // Calculate actual FMAD cost.
27414 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
27415 {RVecTy, RVecTy, RVecTy}, FMF);
27416 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
27417
27418 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
27419 // Also, exclude vector fmul cost.
27421 Instruction::FMul, RVecTy, CostKind);
27423 << "Minus vector FMul cost: " << FMulCost << "\n");
27424 FMACost -= FMulCost;
27425 }
27426 }
27427 }
27428 if (FMACost.isValid())
27429 VectorCost += FMACost;
27430 else
27431 VectorCost +=
27432 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
27433 if (RType != RedTy) {
27434 unsigned Opcode = Instruction::Trunc;
27435 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
27436 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
27437 VectorCost += TTI->getCastInstrCost(
27438 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
27439 }
27440 }
27441 }
27442 ScalarCost = EvaluateScalarCost([&]() {
27443 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
27444 });
27445 break;
27446 }
27447 case RecurKind::FMax:
27448 case RecurKind::FMin:
27449 case RecurKind::FMaximum:
27450 case RecurKind::FMinimum:
27451 case RecurKind::SMax:
27452 case RecurKind::SMin:
27453 case RecurKind::UMax:
27454 case RecurKind::UMin: {
27456 if (!AllConsts) {
27457 if (DoesRequireReductionOp) {
27458 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
27459 } else {
27460 // Check if the previous reduction already exists and account it as
27461 // series of operations + single reduction.
27462 Type *RedTy = VectorTy->getElementType();
27463 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
27464 std::make_pair(RedTy, true));
27465 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
27466 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
27467 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
27468 if (RType != RedTy) {
27469 unsigned Opcode = Instruction::Trunc;
27470 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
27471 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
27472 VectorCost += TTI->getCastInstrCost(
27473 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
27474 }
27475 }
27476 }
27477 ScalarCost = EvaluateScalarCost([&]() {
27478 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
27479 return TTI->getIntrinsicInstrCost(ICA, CostKind);
27480 });
27481 break;
27482 }
27483 default:
27484 llvm_unreachable("Expected arithmetic or min/max reduction operation");
27485 }
27486
27487 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
27488 << " for reduction of " << shortBundleName(ReducedVals)
27489 << " (It is a splitting reduction)\n");
27490 return VectorCost - ScalarCost;
27491 }
27492
  /// Splits the values, stored in VectorValuesAndScales, into registers/free
  /// sub-registers, combines them with the given reduction operation as a
  /// vector operation and then performs single (small enough) reduction.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates reduction and combines with the previous reduction.
    // Each new partial reduction is chained onto ReducedSubTree with the
    // reduction opcode (named "op.rdx").
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned,
                              bool ReducedInTree) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy,
                                  ReducedInTree);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      // Only one vector value: no cross-register combining is required.
      const auto &[Vec, Scale, IsSigned, ReducedInTree] =
          VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned, ReducedInTree);
      return ReducedSubTree;
    }
    // Scales Vec using given Cnt scale factor and then performs vector combine
    // with previous value of VecOp.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned,
                           bool ReducedInTree) {
      if (ReducedInTree) {
        // Already reduced inside the tree: fold straight into the scalar
        // result instead of the vector accumulator.
        CreateSingleOp(Vec, Cnt, IsSigned, ReducedInTree);
        return;
      }
      Type *ScalarTy = Vec->getType()->getScalarType();
      // Scale Vec using given Cnt scale factor.
      if (Cnt > 1) {
        ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
        switch (RdxKind) {
        case RecurKind::Add: {
          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
            // i1 add reduction: replicate the lanes Cnt times via a shuffle so
            // each element contributes Cnt times to the later count.
            unsigned VF = getNumElements(Vec->getType());
            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
                              << ". (HorRdx)\n");
            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
            for (unsigned I : seq<unsigned>(Cnt))
              std::iota(std::next(Mask.begin(), VF * I),
                        std::next(Mask.begin(), VF * (I + 1)), 0);
            ++NumVectorInstructions;
            Vec = Builder.CreateShuffleVector(Vec, Mask);
            break;
          }
          // res = mul vv, n
          if (ScalarTy != DestTy->getScalarType())
            Vec = Builder.CreateIntCast(
                Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
                IsSigned);
              EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(Vec, Scale);
          break;
        }
        case RecurKind::Xor: {
          // res = n % 2 ? 0 : vv
              << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
          if (Cnt % 2 == 0)
            Vec = Constant::getNullValue(Vec->getType());
          break;
        }
        case RecurKind::FAdd: {
          // res = fmul v, n
          Value *Scale =
              ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(Vec, Scale);
          break;
        }
        case RecurKind::And:
        case RecurKind::Or:
        case RecurKind::SMax:
        case RecurKind::SMin:
        case RecurKind::UMax:
        case RecurKind::UMin:
        case RecurKind::FMax:
        case RecurKind::FMin:
        case RecurKind::FMaximum:
        case RecurKind::FMinimum:
          // res = vv
          break;
        case RecurKind::Sub:
        case RecurKind::AddChainWithSubs:
        case RecurKind::Mul:
        case RecurKind::FMul:
        case RecurKind::FMulAdd:
        case RecurKind::AnyOf:
        case RecurKind::FindIV:
        case RecurKind::FindLast:
        case RecurKind::FMaxNum:
        case RecurKind::FMinNum:
        case RecurKind::FMaximumNum:
        case RecurKind::FMinimumNum:
        case RecurKind::None:
          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
        }
      }
      // Combine Vec with the previous VecOp.
      if (!VecRes) {
        // First vector becomes the accumulator.
        VecRes = Vec;
        VecResSignedness = IsSigned;
      } else {
        ++NumVectorInstructions;
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
            VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
          // Handle ctpop.
          // Two i1 vectors are concatenated rather than reduced here.
          unsigned VecResVF = getNumElements(VecRes->getType());
          unsigned VecVF = getNumElements(Vec->getType());
          SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
          std::iota(Mask.begin(), Mask.end(), 0);
          // Ensure that VecRes is always larger than Vec
          if (VecResVF < VecVF) {
            std::swap(VecRes, Vec);
            std::swap(VecResVF, VecVF);
          }
          if (VecResVF != VecVF) {
            // Pad the narrower vector with poison lanes to match widths.
            SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
            std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
            Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
          }
          VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
          return;
        }
        // Normalize both operands to DestTy's scalar type before combining.
        if (VecRes->getType()->getScalarType() != DestTy->getScalarType()) {
          assert(getNumElements(VecRes->getType()) % getNumElements(DestTy) ==
                     0 &&
                 "Expected the number of elements in VecRes to be a multiple "
                 "of the number of elements in DestTy");
          VecRes = Builder.CreateIntCast(
              VecRes,
              getWidenedType(DestTy->getScalarType(),
                             getNumElements(VecRes->getType())),
              VecResSignedness);
        }
        if (ScalarTy != DestTy->getScalarType())
          Vec = Builder.CreateIntCast(
              Vec,
              getWidenedType(DestTy->getScalarType(),
                             getNumElements(Vec->getType())),
              IsSigned);
        unsigned VecResVF = getNumElements(VecRes->getType());
        unsigned VecVF = getNumElements(Vec->getType());
        // Ensure that VecRes is always larger than Vec
        if (VecResVF < VecVF) {
          std::swap(VecRes, Vec);
          std::swap(VecResVF, VecVF);
        }
        // extract + op + insert
        // When widths differ, combine only the low VecVF lanes of VecRes with
        // Vec and write the result back into VecRes.
        Value *Op = VecRes;
        if (VecResVF != VecVF)
          Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
        Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
        if (VecResVF != VecVF)
          Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
        VecRes = Op;
      }
    };
    for (auto [Vec, Scale, IsSigned, ReducedInTree] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned, ReducedInTree);
    // Final single reduction over the combined vector accumulator.
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false,
                   /*ReducedInTree=*/false);

    return ReducedSubTree;
  }
27670
27671 /// Emit a horizontal reduction of the vectorized value.
27672 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
27673 const TargetTransformInfo *TTI, Type *DestTy) {
27674 assert(VectorizedValue && "Need to have a vectorized tree node");
27675 assert(RdxKind != RecurKind::FMulAdd &&
27676 "A call to the llvm.fmuladd intrinsic is not handled yet");
27677
27678 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
27679 if (FTy->getScalarType() == Builder.getInt1Ty() &&
27680 RdxKind == RecurKind::Add &&
27681 DestTy->getScalarType() != FTy->getScalarType()) {
27682 // Convert vector_reduce_add(ZExt(<n x i1>)) to
27683 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
27684 Value *V = Builder.CreateBitCast(
27685 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
27686 ++NumVectorInstructions;
27687 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
27688 }
27689 ++NumVectorInstructions;
27690 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
27691 }
27692
27693 /// Emits optimized code for unique scalar value reused \p Cnt times.
27694 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
27695 unsigned Cnt) {
27696 assert(IsSupportedHorRdxIdentityOp &&
27697 "The optimization of matched scalar identity horizontal reductions "
27698 "must be supported.");
27699 if (Cnt == 1)
27700 return VectorizedValue;
27701 switch (RdxKind) {
27702 case RecurKind::Add: {
27703 // res = mul vv, n
27704 Value *Scale =
27705 ConstantInt::get(VectorizedValue->getType(), Cnt,
27706 /*IsSigned=*/false, /*ImplicitTrunc=*/true);
27707 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
27708 << VectorizedValue << ". (HorRdx)\n");
27709 return Builder.CreateMul(VectorizedValue, Scale);
27710 }
27711 case RecurKind::Xor: {
27712 // res = n % 2 ? 0 : vv
27713 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
27714 << ". (HorRdx)\n");
27715 if (Cnt % 2 == 0)
27716 return Constant::getNullValue(VectorizedValue->getType());
27717 return VectorizedValue;
27718 }
27719 case RecurKind::FAdd: {
27720 // res = fmul v, n
27721 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
27722 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
27723 << VectorizedValue << ". (HorRdx)\n");
27724 return Builder.CreateFMul(VectorizedValue, Scale);
27725 }
27726 case RecurKind::And:
27727 case RecurKind::Or:
27728 case RecurKind::SMax:
27729 case RecurKind::SMin:
27730 case RecurKind::UMax:
27731 case RecurKind::UMin:
27732 case RecurKind::FMax:
27733 case RecurKind::FMin:
27734 case RecurKind::FMaximum:
27735 case RecurKind::FMinimum:
27736 // res = vv
27737 return VectorizedValue;
27738 case RecurKind::Sub:
27739 case RecurKind::AddChainWithSubs:
27740 case RecurKind::Mul:
27741 case RecurKind::FMul:
27742 case RecurKind::FMulAdd:
27743 case RecurKind::AnyOf:
27744 case RecurKind::FindIV:
27745 case RecurKind::FindLast:
27746 case RecurKind::FMaxNum:
27747 case RecurKind::FMinNum:
27748 case RecurKind::FMaximumNum:
27749 case RecurKind::FMinimumNum:
27750 case RecurKind::None:
27751 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
27752 }
27753 return nullptr;
27754 }
27755
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    // Widen/cast the vector to the root scalar type if they differ.
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // Per-lane repeat counts become a constant multiplier vector.
      // root = mul prev_root, <1, 1, n, 1>
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if elements 4th and 6th elements have even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          // Mask index VF selects the zero lane of the second operand.
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
                                              : Mask) dbgs()
                                         << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // Same as the Add case, but with a floating-point multiplier vector.
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
27859};
27860} // end anonymous namespace
27861
/// Gets recurrence kind from the specified value.
/// Thin file-local wrapper over HorizontalReduction::getRdxKind.
  return HorizontalReduction::getRdxKind(V);
}
27866static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
27867 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
27868 return cast<FixedVectorType>(IE->getType())->getNumElements();
27869
27870 unsigned AggregateSize = 1;
27871 auto *IV = cast<InsertValueInst>(InsertInst);
27872 Type *CurrentType = IV->getType();
27873 do {
27874 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
27875 for (auto *Elt : ST->elements())
27876 if (Elt != ST->getElementType(0)) // check homogeneity
27877 return std::nullopt;
27878 AggregateSize *= ST->getNumElements();
27879 CurrentType = ST->getElementType(0);
27880 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
27881 AggregateSize *= AT->getNumElements();
27882 CurrentType = AT->getElementType();
27883 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
27884 AggregateSize *= VT->getNumElements();
27885 return AggregateSize;
27886 } else if (CurrentType->isSingleValueType()) {
27887 return AggregateSize;
27888 } else {
27889 return std::nullopt;
27890 }
27891 } while (true);
27892}
27893
/// Walks a chain of insertelement/insertvalue instructions (following operand
/// 0 upward) and records, for each element index, the inserted scalar operand
/// in \p BuildVectorOpds and the inserting instruction in \p InsertElts.
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    // Stop if the element index is not a known constant or the instruction
    // has already been deleted by the vectorizer.
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      // Nested aggregate: recurse into the inserted sub-aggregate.
          BuildVectorOpds, InsertElts, *OperandIndex, R);

    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    // Continue up the chain only while it stays a single-use insert chain.
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           LastInsertInst->hasOneUse());
}
27918
/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> poison, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {

  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");

  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  // Total number of scalar slots; nullopt for unsupported aggregate shapes.
  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
  // Drop slots that the recursive walk never filled.
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  // At least two collected operands are needed for vectorization.
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
27960
/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  // A candidate is only usable if the phi's block dominates its block.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
28011
/// Matches a reduction binary operation: a plain BinOp or one of the
/// floating-point min/max forms, extracting its two operands into
/// \p V0 / \p V1. Returns true on a successful match.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
    return true;
    // NOTE(review): the four bare returns below appear to have lost their
    // guarding match conditions in this excerpt — verify against upstream.
    return true;
    return true;
    return true;
    return true;
  return false;
}
28033
/// We could have an initial reduction that is not an add.
/// r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \Returns the new root if found, which may be nullptr if not an instruction.
                                                    Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  // Take the root's first two reduction operands; the one that is not the
  // phi itself is the candidate for the secondary root.
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}
28053
/// \p Returns the first operand of \p I that does not match \p Phi. If
/// operand is not an instruction it returns nullptr.
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  // Pick whichever matched operand is not the phi itself.
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}
28063
/// \Returns true if \p I is a candidate instruction for reduction vectorization.
/// A candidate is either a select or an instruction matched by matchRdxBop
/// (binop or fp min/max form).
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
28071
/// Tries to match and vectorize horizontal reductions starting at \p Root
/// (or at a secondary root derived from phi \p P's operands). Instructions
/// that could not be reduced are appended to \p PostponedInsts for later
/// vectorization attempts. Returns true if any reduction was vectorized.
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
  // If a horizintal reduction was not matched or vectorized we collect
  // instructions for possible later attempts for vectorization.
  // Note: despite the name, this is a std::queue (front/pop), so the
  // worklist is actually processed in FIFO order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  // Attempts the full match-and-reduce pipeline on a single instruction;
  // returns the vectorized value on success, nullptr otherwise.
  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DT, *DL, *TTI, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // Use the root's non-phi operand as the seed instead.
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV); I && I != Inst) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
28171
/// Try to vectorize the scalar binop/compare \p I: either match and vectorize
/// a short horizontal reduction over its operand pair, or vectorize the pair
/// (or a close variant of it, skipping one single-use operand) as a
/// two-element bundle. Returns true if any vectorization succeeded.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  // Only scalar binary operators and compares are interesting roots here.
  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;
  // Skip potential FMA candidates.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B: if B has a single use, its own operands may pair better
  // with A.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  // Returns true if \p Inst was matched as a horizontal reduction over \p Ops
  // and the reduced form is strictly cheaper than the scalar code.
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of operations.
    VectorType *VecTy = getWidenedType(Ty, Ops.size());
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      // Other recurrence kinds are not handled by this fast path.
      return false;
    }
    // Only reduce if the vector form is strictly cheaper.
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates).first;
  if (!BestCandidate)
    return false;
  // Reduction matching is only attempted on the original (Op0, Op1) pair,
  // which is always candidate 0.
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
28275
28276bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
28277 BasicBlock *BB, BoUpSLP &R) {
28278 SmallVector<WeakTrackingVH> PostponedInsts;
28279 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
28280 Res |= tryToVectorize(PostponedInsts, R);
28281 return Res;
28282}
28283
28284bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
28285 BoUpSLP &R) {
28286 bool Res = false;
28287 for (Value *V : Insts)
28288 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
28289 Res |= tryToVectorize(Inst, R);
28290 return Res;
28291}
28292
28293bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
28294 BasicBlock *BB, BoUpSLP &R,
28295 bool MaxVFOnly) {
28296 if (!R.canMapToVector(IVI->getType()))
28297 return false;
28298
28299 SmallVector<Value *, 16> BuildVectorOpds;
28300 SmallVector<Value *, 16> BuildVectorInsts;
28301 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
28302 return false;
28303
28304 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
28305 R.getORE()->emit([&]() {
28306 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
28307 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
28308 "trying reduction first.";
28309 });
28310 return false;
28311 }
28312 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
28313 // Aggregate value is unlikely to be processed in vector register.
28314 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
28315}
28316
/// Try to vectorize the scalar operands of the buildvector sequence ending at
/// \p IEI, unless the sequence already forms a fixed-vector shuffle of
/// extracts (in which case scalarizing it again would be a pessimization).
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  // With MaxVFOnly, leave two-element buildvectors for reduction matching.
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
28339
/// Sorts \p Incoming with \p Comparator to bring compatible instructions
/// together, greedily forms runs accepted by \p AreCompatible and hands each
/// run to \p TryToVectorizeHelper; leftovers of the same type are collected
/// and retried, optionally with small vector factors. Returns true if the IR
/// changed.
template <typename T>
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    // Extend the run while entries are deleted/non-instructions (skipped) or
    // compatible with the bundle collected so far.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      // Keep only survivors as candidates for the same-type retry.
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      // A short, unvectorized run may still combine with other runs of the
      // same type; remember its live members.
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
28451
28452/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible corresponding
28454/// operands. If IsCompatibility is false, function implements strict weak
28455/// ordering relation between two cmp instructions, returning true if the first
28456/// instruction is "less" than the second, i.e. its predicate is less than the
28457/// predicate of the second or the operands IDs are less than the operands IDs
28458/// of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  // Order first by the compared operands' type ID...
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  // ...then by their scalar width.
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  // Canonicalize predicates modulo operand swapping so that a cmp and its
  // swapped form compare equal at this level.
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    // If a cmp uses the swapped predicate, walk its operands in reverse so
    // corresponding operands line up.
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
28533
/// Vectorizes the compare instructions of \p BB in three stages: reductions
/// rooted at the compares' operands, each compare as a binop root, and
/// finally bundles of compatible compares (sorted by predicate and operands).
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        // Vectorizing the operand may have deleted the compare itself.
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || VL.back() == V1)
      return true;
    return compareCmp<true>(V1, VL.back(), *TLI, *DT);
  };

  // Collect the still-live compares with vectorizable element types.
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
28593
/// Runs up to three vectorization attempts over each postponed
/// insertelement/insertvalue in \p Instructions (visited in reverse order):
/// buildvector matching restricted to the maximal VF, reduction matching, and
/// unrestricted buildvector matching; postponed reduction seeds are retried
/// at the end. Clears \p Instructions before returning.
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
28632
/// Top-level per-block driver: repeatedly vectorizes bundles of compatible
/// PHIs, then scans the block for reduction roots, buildvector sequences and
/// compares, restarting the scan from the top whenever vectorization deletes
/// instructions and invalidates the iterator.
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Allows better to identify the chains that can be vectorized in the
  // better way.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  // Strict-weak ordering for PHIs: sorts by type ID, then scalar width, then
  // the gathered non-phi operand chains, so compatible PHIs become adjacent.
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers, we don't care about other type.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
            const auto *E1 = dyn_cast<ExtractElementInst>(I1);
            const auto *E2 = dyn_cast<ExtractElementInst>(I2);
            if (!E1 || !E2)
              continue;

            // Sort on ExtractElementInsts primarily by vector operands. Prefer
            // program order of the vector operands.
            const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
            const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
            if (V1 != V2) {
              if (V1 && !V2)
                return true;
              if (!V1 && V2)
                return false;
                  DT->getNode(V1->getParent());
                  DT->getNode(V2->getParent());
              if (!NodeI1)
                return NodeI2 != nullptr;
              if (!NodeI2)
                return false;
              assert((NodeI1 == NodeI2) ==
                         (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                     "Different nodes should have different DFS numbers");
              if (NodeI1 != NodeI2)
                return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
              return V1->comesBefore(V2);
            }
            // If we have the same vector operand, try to sort by constant
            // index.
            std::optional<unsigned> Id1 = getExtractIndex(E1);
            std::optional<unsigned> Id2 = getExtractIndex(E2);
            // Bring constants to the top
            if (Id1 && !Id2)
              return true;
            if (!Id1 && Id2)
              return false;
            // First elements come first.
            if (Id1 && Id2)
              return *Id1 < *Id2;

            continue;
          }
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  // Returns true if V1 can join the bundle VL: same type as the last member
  // and pairwise-compatible (same parent/opcode, still live) operand chains.
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      // Walk through transitive PHIs to their first non-phi incoming values.
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    // Drop the cached operand chains if any mapped PHI got deleted; they may
    // now reference stale values.
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert_range(Incoming);
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction without users, like terminator, or
  // function call with ignored return value, store. Ignore unused instructions
  // (basing on instruction type, except for CallInst and InvokeInst).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The num of elements is unknown at
    // compile-time for scalable type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        // TODO: This is just a temporarily solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
29002
29003bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
29004 auto Changed = false;
29005 for (auto &Entry : GEPs) {
29006 // If the getelementptr list has fewer than two elements, there's nothing
29007 // to do.
29008 if (Entry.second.size() < 2)
29009 continue;
29010
29011 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
29012 << Entry.second.size() << ".\n");
29013
29014 // Process the GEP list in chunks suitable for the target's supported
29015 // vector size. If a vector register can't hold 1 element, we are done. We
29016 // are trying to vectorize the index computations, so the maximum number of
29017 // elements is based on the size of the index expression, rather than the
29018 // size of the GEP itself (the target's pointer size).
29019 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
29020 return !R.isDeleted(GEP);
29021 });
29022 if (It == Entry.second.end())
29023 continue;
29024 unsigned MaxVecRegSize = R.getMaxVecRegSize();
29025 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
29026 if (MaxVecRegSize < EltSize)
29027 continue;
29028
29029 unsigned MaxElts = MaxVecRegSize / EltSize;
29030 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
29031 auto Len = std::min<unsigned>(BE - BI, MaxElts);
29032 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
29033
29034 // Initialize a set a candidate getelementptrs. Note that we use a
29035 // SetVector here to preserve program order. If the index computations
29036 // are vectorizable and begin with loads, we want to minimize the chance
29037 // of having to reorder them later.
29038 SetVector<Value *> Candidates(llvm::from_range, GEPList);
29039
29040 // Some of the candidates may have already been vectorized after we
29041 // initially collected them or their index is optimized to constant value.
29042 // If so, they are marked as deleted, so remove them from the set of
29043 // candidates.
29044 Candidates.remove_if([&R](Value *I) {
29045 return R.isDeleted(cast<Instruction>(I)) ||
29046 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
29047 });
29048
29049 // Remove from the set of candidates all pairs of getelementptrs with
29050 // constant differences. Such getelementptrs are likely not good
29051 // candidates for vectorization in a bottom-up phase since one can be
29052 // computed from the other. We also ensure all candidate getelementptr
29053 // indices are unique.
29054 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
29055 auto *GEPI = GEPList[I];
29056 if (!Candidates.count(GEPI))
29057 continue;
29058 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
29059 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
29060 auto *GEPJ = GEPList[J];
29061 if (!Candidates.count(GEPJ))
29062 continue;
29063 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
29064 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
29065 Candidates.remove(GEPI);
29066 Candidates.remove(GEPJ);
29067 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
29068 Candidates.remove(GEPJ);
29069 }
29070 }
29071 }
29072
29073 // We break out of the above computation as soon as we know there are
29074 // fewer than two candidates remaining.
29075 if (Candidates.size() < 2)
29076 continue;
29077
29078 // Add the single, non-constant index of each candidate to the bundle. We
29079 // ensured the indices met these constraints when we originally collected
29080 // the getelementptrs.
29081 SmallVector<Value *, 16> Bundle(Candidates.size());
29082 auto BundleIndex = 0u;
29083 for (auto *V : Candidates) {
29084 auto *GEP = cast<GetElementPtrInst>(V);
29085 auto *GEPIdx = GEP->idx_begin()->get();
29086 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
29087 Bundle[BundleIndex++] = GEPIdx;
29088 }
29089
29090 // Try and vectorize the indices. We are currently only interested in
29091 // gather-like cases of the form:
29092 //
29093 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
29094 //
29095 // where the loads of "a", the loads of "b", and the subtractions can be
29096 // performed in parallel. It's likely that detecting this pattern in a
29097 // bottom-up phase will be simpler and less costly than building a
29098 // full-blown top-down phase beginning at the consecutive loads.
29099 Changed |= tryToVectorizeList(Bundle, R);
29100 }
29101 }
29102 return Changed;
29103}
29104
29105bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
29106 bool Changed = false;
29107 // Sort by type, base pointers and values operand. Value operands must be
29108 // compatible (have the same opcode, same parent), otherwise it is
29109 // definitely not profitable to try to vectorize them.
29110 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
29111 if (V->getValueOperand()->getType()->getTypeID() <
29112 V2->getValueOperand()->getType()->getTypeID())
29113 return true;
29114 if (V->getValueOperand()->getType()->getTypeID() >
29115 V2->getValueOperand()->getType()->getTypeID())
29116 return false;
29117 if (V->getPointerOperandType()->getTypeID() <
29118 V2->getPointerOperandType()->getTypeID())
29119 return true;
29120 if (V->getPointerOperandType()->getTypeID() >
29121 V2->getPointerOperandType()->getTypeID())
29122 return false;
29123 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
29124 V2->getValueOperand()->getType()->getScalarSizeInBits())
29125 return true;
29126 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
29127 V2->getValueOperand()->getType()->getScalarSizeInBits())
29128 return false;
29129 // UndefValues are compatible with all other values.
29130 auto *I1 = dyn_cast<Instruction>(V->getValueOperand());
29131 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
29132 if (I1 && I2) {
29133 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
29134 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
29135 assert(NodeI1 && "Should only process reachable instructions");
29136 assert(NodeI2 && "Should only process reachable instructions");
29137 assert((NodeI1 == NodeI2) ==
29138 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
29139 "Different nodes should have different DFS numbers");
29140 if (NodeI1 != NodeI2)
29141 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
29142 return I1->getOpcode() < I2->getOpcode();
29143 }
29144 if (I1 && !I2)
29145 return true;
29146 if (!I1 && I2)
29147 return false;
29148 return V->getValueOperand()->getValueID() <
29149 V2->getValueOperand()->getValueID();
29150 };
29151
29152 bool SameParent = true;
29153 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
29154 if (VL.empty()) {
29155 SameParent = true;
29156 return true;
29157 }
29158 StoreInst *V2 = VL.back();
29159 if (V1 == V2)
29160 return true;
29161 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
29162 return false;
29163 if (V1->getPointerOperandType() != V2->getPointerOperandType())
29164 return false;
29165 // Undefs are compatible with any other value.
29166 if (isa<UndefValue>(V1->getValueOperand()) ||
29168 return true;
29169 if (isa<Constant>(V1->getValueOperand()) &&
29171 return true;
29172 // Check if the operands of the stores can be vectorized. They can be
29173 // vectorized, if they have compatible operands or have operands, which can
29174 // be vectorized as copyables.
29175 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
29176 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
29177 if (I1 || I2) {
29178 // Accept only tail-following non-compatible values for now.
29179 // TODO: investigate if it is possible to vectorize incompatible values,
29180 // if the copyables are first in the list.
29181 if (I1 && !I2)
29182 return false;
29183 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
29184 SmallVector<Value *> NewVL(VL.size() + 1);
29185 for (auto [SI, V] : zip(VL, NewVL))
29186 V = SI->getValueOperand();
29187 NewVL.back() = V1->getValueOperand();
29188 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
29189 InstructionsState S = Analysis.buildInstructionsState(
29190 NewVL, R, /*WithProfitabilityCheck=*/true,
29191 /*SkipSameCodeCheck=*/!SameParent);
29192 if (S)
29193 return true;
29194 if (!SameParent)
29195 return false;
29196 }
29197 return V1->getValueOperand()->getValueID() ==
29198 V2->getValueOperand()->getValueID();
29199 };
29200
29201 // Attempt to sort and vectorize each of the store-groups.
29202 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
29203 for (auto &Pair : Stores) {
29204 if (Pair.second.size() < 2)
29205 continue;
29206
29207 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
29208 << Pair.second.size() << ".\n");
29209
29210 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
29211 continue;
29212
29213 // Reverse stores to do bottom-to-top analysis. This is important if the
29214 // values are stores to the same addresses several times, in this case need
29215 // to follow the stores order (reversed to meet the memory dependecies).
29216 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
29217 Pair.second.rend());
29219 ReversedStores, StoreSorter, AreCompatibleStores,
29220 [&](ArrayRef<StoreInst *> Candidates, bool) {
29221 return vectorizeStores(Candidates, R, Attempted);
29222 },
29223 /*MaxVFOnly=*/false, R);
29224 }
29225 return Changed;
29226}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis false
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
ManagedStatic< HTTPClientCleanup > Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE)
Get the assumed loop trip count for the loop L.
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static cl::opt< unsigned > LoopAwareTripCount("slp-cost-loop-trip-count", cl::init(2), cl::Hidden, cl::desc("Loop trip count, considered by the cost model during " "modeling (0=loops are ignored and considered flat code)"))
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static const Loop * findInnermostNonInvariantLoop(const Loop *L, ArrayRef< Value * > VL)
Find the innermost loop starting from L, for which at least a single value in VL is not invariant.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1421
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1411
unsigned logBase2() const
Definition APInt.h:1776
void setAllBits()
Set every bit to 1.
Definition APInt.h:1334
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1382
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:178
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:219
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:195
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:157
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:462
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:449
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:467
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:470
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:692
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
bool hasSameSign() const
Query samesign information, for optimizations.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static bool shouldExecute(CounterInfo &Counter)
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
void clear()
Definition FMF.h:63
bool allowReassoc() const
Flag queries.
Definition FMF.h:67
bool allowContract() const
Definition FMF.h:72
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:859
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2584
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:564
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:592
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2650
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2199
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2606
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1734
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2272
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2441
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1677
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1463
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:154
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
T & front() const
front - Get the first element.
Definition ArrayRef.h:349
iterator end() const
Definition ArrayRef.h:343
iterator begin() const
Definition ArrayRef.h:342
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
void clear()
clear - Erase all elements from the queue.
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:91
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:132
void insert_range(Range &&R)
Definition SetVector.h:176
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:94
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:176
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:184
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:313
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:287
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:138
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:259
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
iterator_range< value_op_iterator > operand_values()
Definition User.h:291
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated with the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
User * user_back()
Definition Value.h:413
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:544
bool hasUseList() const
Check if this Value has a use-list.
Definition Value.h:345
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:347
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:381
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
bool isReducedCmpBitcastRoot() const
Returns true if the tree results in the reduced cmp bitcast root.
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< WeakTrackingVH, unsigned, bool, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
std::pair< std::optional< int >, int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
bool canBuildSplitNode(ArrayRef< Value * > VL, const InstructionsState &LocalState, SmallVectorImpl< Value * > &Op1, SmallVectorImpl< Value * > &Op2, OrdersType &ReorderIndices) const
Checks if it is legal and profitable to build SplitVectorize node for the given VL.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isReducedBitcastRoot() const
Returns true if the tree results in one of the reduced bitcasts variants.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char IsConst[]
Key for Kernel::Arg::Metadata::mIsConst.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
NoWrapTrunc_match< OpTy, TruncInst::NoUnsignedWrap > m_NUWTrunc(const OpTy &Op)
Matches trunc nuw.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< CodeNode * > Code
Definition RDFGraph.h:388
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:2039
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1777
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:446
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:368
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1399
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:422
constexpr bool is_sorted_constexpr(R &&Range, Cmp C=Cmp{})
Check if elements in a range R are sorted with respect to a comparator C.
Definition STLExtras.h:1984
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Add
Sum of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A memory-efficient immutable range with a single value repeated N times.
Definition Repeated.h:75
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1439
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1448
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)